From bc06079664fb20b4f65d8dca72b6e71462fbd419 Mon Sep 17 00:00:00 2001 From: "Fernando P.Panadero" Date: Thu, 20 Jun 2024 16:32:40 +0200 Subject: [PATCH] OBC Branch Merge --- src/Dirac/Dirac.jl | 595 +-------------------------------- src/Dirac/Diracfields.jl | 211 ++++++++++++ src/Dirac/Diracflow.jl | 281 +++++++++------- src/Dirac/Diracoper.jl | 664 +++++++++++++++++++++++++++++++++++++ src/YM/YMact.jl | 694 +++++++++++++++++++++++++++++++++++---- src/YM/YMfields.jl | 47 ++- src/YM/YMflow.jl | 42 ++- 7 files changed, 1726 insertions(+), 808 deletions(-) create mode 100644 src/Dirac/Diracfields.jl create mode 100644 src/Dirac/Diracoper.jl diff --git a/src/Dirac/Dirac.jl b/src/Dirac/Dirac.jl index c378419..c11dc74 100644 --- a/src/Dirac/Dirac.jl +++ b/src/Dirac/Dirac.jl @@ -105,500 +105,6 @@ struct DiracWorkspace{T} end -export DiracWorkspace, DiracParam - - -""" - function Csw!(dws, U, gp, lp::SpaceParm) - -Computes the clover and stores it in dws.csw. - -""" -function Csw!(dws, U, gp, lp::SpaceParm{4,6,B,D}) where {B,D} - - @timeit "Csw computation" begin - - for i in 1:Int(lp.npls) - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_csw!(dws.csw, U, gp.Ubnd, i, lp) - end - end - end - - return nothing -end - -function krnl_csw!(csw::AbstractArray{T}, U, Ubnd, ipl, lp::SpaceParm{4,M,B,D}) where {T,M,B,D} - - @inbounds begin - b = Int64(CUDA.threadIdx().x) - r = Int64(CUDA.blockIdx().x) - I = point_coord((b,r), lp) - it = I[4] - - id1, id2 = lp.plidx[ipl] - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == 4) - - bu1, ru1 = up((b, r), id1, lp) - bu2, ru2 = up((b, r), id2, lp) - bd1, rd1 = dw((b, r), id1, lp) - bd2, rd2 = dw((b, r), id2, lp) - bdd, rdd = dw((bd1, rd1), id2, lp) - bud, rud = dw((bu1, ru1), id2, lp) - bdu, rdu = up((bd1, rd1), id2, lp) - - if SFBC && (it == lp.iL[end]) - gt1 = Ubnd[id2] - gt2 = Ubnd[id2] - else - gt1 = U[bu1,id2,ru1] - gt2 = U[bud,id2,rud] - end - - M1 = U[b,id1,r]*gt1/(U[b,id2,r]*U[bu2,id1,ru2]) - M2 = (U[bd2,id2,rd2]\(U[bd2,id1,rd2]*gt2))/U[b,id1,r] - M3 = (U[bdd,id2,rdd]*U[bd1,id1,rd1])\(U[bdd,id1,rdd]*U[bd2,id2,rd2]) - M4 = (U[b,id2,r]/(U[bd1,id2,rd1]*U[bdu,id1,rdu]))*U[bd1,id1,rd1] - - - if !(SFBC && (it == 1)) - csw[b,ipl,r] = 0.125*(antsym(M1)+antsym(M2)+antsym(M3)+antsym(M4)) - end - - end - - return nothing -end - - -""" - function Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) - -Computes the Dirac operator (with the Wilson term) `\`\``D_w``\`\` with gauge field U and parameters `dpar` of the field `si` and stores it in `so`. -If `dpar.csw` is different from zero, the clover term should be stored in `dws.csw` via the Csw! function and is automatically included in the operator. - -""" -function Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) where {B,D} - - if abs(dpar.csw) > 1.0E-10 - @timeit "Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) - end - end - else - @timeit "Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, lp) - end - end - end - - return nothing -end - -function krnl_Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, lp::SpaceParm{4,6,B,D}) where {B,D} - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - @inbounds begin - - so[b,r] = (4+m0)*si[b,r]+ im*tm*dmul(Gamma{5},si[b,r]) + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) - +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) - - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - - end - - return nothing -end - -function krnl_Dw!(so, U, si, m0, tm, th, lp::SpaceParm{4,6,B,D}) where {B,D} - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - @inbounds begin - - so[b,r] = (4+m0)*si[b,r] + im*tm*dmul(Gamma{5},si[b,r]) - - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - - end - - return nothing -end - -function Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - - if abs(dpar.csw) > 1.0E-10 - @timeit "Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) - end - end - else - @timeit "Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) - end - end - end - - return nothing -end - -function krnl_Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, ct, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - - # The field si is assumed to be zero at t = 0 - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - if (point_time((b,r),lp) != 1) - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - @inbounds begin - - so[b,r] = (4+m0)*si[b,r] + im*tm*dmul(Gamma{5},si[b,r]) + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) - +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) - - - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - - if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == lp.iL[4]) - so[b,r] += (ct-1.0)*si[b,r] - end - end - end - - return nothing -end - -function krnl_Dw!(so, U, si, m0, tm, th, ct, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - - # The field si is assumed to be zero at t = 0 - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - if (point_time((b,r),lp) != 1) - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - @inbounds begin - - so[b,r] = (4+m0)*si[b,r] + im*tm*dmul(Gamma{5},si[b,r]) - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - - if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == lp.iL[4]) - so[b,r] += (ct-1.0)*si[b,r] - end - end - end - - return nothing -end - -""" - function g5Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) - -Computes \`\` \\gamma_5 \`\` times the Dirac operator (with the Wilson term) with gauge field U and parameters `dpar` of the field `si` and stores it in `so`. -If `dpar.csw` is different from zero, the clover term should be stored in `dws.csw` via the Csw! function and is automatically included in the operator. -""" -function g5Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) where {B,D} - - if abs(dpar.csw) > 1.0E-10 - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) - end - end - else - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, lp) - end - end - end - - return nothing -end - -function krnl_g5Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, lp::SpaceParm{4,6,B,D}) where {B,D} - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - @inbounds begin - - so[b,r] = (4+m0)*si[b,r] + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) - +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) - - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - - so[b,r] = dmul(Gamma{5}, so[b,r])+ im*tm*si[b,r] - end - - return nothing -end - -function krnl_g5Dw!(so, U, si, m0, tm, th, lp::SpaceParm{4,6,B,D}) where {B,D} - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - @inbounds begin - - so[b,r] = (4+m0)*si[b,r] - - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - - so[b,r] = dmul(Gamma{5}, so[b,r]) + im*tm*si[b,r] - end - - return nothing -end - -function g5Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - - if abs(dpar.csw) > 1.0E-10 - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) - end - end - else - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) - end - end - end - - return nothing -end - -function krnl_g5Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, ct, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - - # The field si is assumed to be zero at t = 0 - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - if (point_time((b,r),lp) != 1) - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - @inbounds begin - - so[b,r] = (4+m0)*si[b,r] + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) - +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) - - - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - - if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == lp.iL[4]) - so[b,r] += (ct-1.0)*si[b,r] - end - end - end - - so[b,r] = dmul(Gamma{5}, so[b,r])+ im*tm*si[b,r] - - return nothing -end - -function krnl_g5Dw!(so, U, si, m0, tm, th, ct, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - - # The field si is assumed to be zero at t = 0 - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - if (point_time((b,r),lp) != 1) - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - @inbounds begin - - so[b,r] = (4+m0)*si[b,r] - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - - if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == lp.iL[4]) - so[b,r] += (ct-1.0)*si[b,r] - end - end - end - - so[b,r] = dmul(Gamma{5}, so[b,r]) + im*tm*si[b,r] - - return nothing -end - -""" - function DwdagDw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) - -Applies the operator \`\` \\gamma_5 D_w \`\` twice to `si` and stores the result in `so`. This is equivalent to appling the operator \`\` D_w^\\dagger D_w \`\` -The Dirac operator is the same as in the functions `Dw!` and `g5Dw!` -""" -function DwdagDw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - - if abs(dpar.csw) > 1.0E-10 - @timeit "DwdagDw" begin - - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(dws.st, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) - end - end - SF_bndfix!(dws.st,lp) - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, dws.st, dws.csw, dpar.m0, -dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) - end - end - SF_bndfix!(so,lp) - end - else - @timeit "DwdagDw" begin - - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(dws.st, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) - end - end - SF_bndfix!(dws.st,lp) - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, dws.st, dpar.m0, -dpar.tm, dpar.th, dpar.ct, lp) - end - end - SF_bndfix!(so,lp) - end - end - - return nothing -end - -function DwdagDw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) where {B,D} - - if abs(dpar.csw) > 1.0E-10 - @timeit "DwdagDw" begin - - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(dws.st, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) - end - end - - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, dws.st, dws.csw, dpar.m0, -dpar.tm, dpar.th, dpar.csw, lp) - end - end - end - else - @timeit "DwdagDw" begin - - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(dws.st, U, si, dpar.m0, dpar.tm, dpar.th, lp) - end - end - - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, dws.st, dpar.m0, -dpar.tm, dpar.th, lp) - end - end - end - end - - return nothing -end """ function mtwmdpar(dpar::DiracParam) @@ -610,108 +116,19 @@ function mtwmdpar(dpar::DiracParam{P,R}) where {P,R} end -""" - SF_bndfix!(sp, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) +export DiracWorkspace, DiracParam, mtwmdpar -Sets all the values of `sp` in the first time slice to zero. -""" -function SF_bndfix!(sp, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - @timeit "SF boundary fix" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_sfbndfix!(sp, lp) - end - end - return nothing -end - -function krnl_sfbndfix!(sp,lp::SpaceParm) - b=Int64(CUDA.threadIdx().x) - r=Int64(CUDA.blockIdx().x) - - if (point_time((b,r),lp) == 1) - sp[b,r] = 0.0*sp[b,r] - end - return nothing -end - - -""" - function pfrandomize!(f::AbstractArray{Spinor{4, SU3fund / SU2fund {T}}}, lp::SpaceParm, t::Int64 = 0) - -Randomizes the SU2fund / SU3fund fermion field. If the argument t is present, it only randomizes that time-slice. -""" -function pfrandomize!(f::AbstractArray{Spinor{4, SU3fund{T}}}, lp::SpaceParm, t::Int64 = 0) where {T} - - @timeit "Randomize pseudofermion field" begin - p = ntuple(i->CUDA.randn(T, lp.bsz, 3, lp.rsz,2),4) # complex generation not suported for Julia 1.5.4 - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_assign_pf_su3!(f,p,lp,t) - end - end - - return nothing -end - -function krnl_assign_pf_su3!(f::AbstractArray, p , lp::SpaceParm, t::Int64) - - @inbounds begin - b = Int64(CUDA.threadIdx().x) - r = Int64(CUDA.blockIdx().x) - - if t == 0 - f[b,r] = Spinor(map(x->SU3fund(x[b,1,r,1] + im* x[b,1,r,2], - x[b,2,r,1] + im* x[b,2,r,2], - x[b,3,r,1] + im* x[b,3,r,2]),p)) - elseif point_time((b,r),lp) == t - f[b,r] = Spinor(map(x->SU3fund(x[b,1,r,1] + im* x[b,1,r,2], - x[b,2,r,1] + im* x[b,2,r,2], - x[b,3,r,1] + im* x[b,3,r,2]),p)) - end - - end - - return nothing -end - -function pfrandomize!(f::AbstractArray{Spinor{4, SU2fund{T}}},lp::SpaceParm, t::Int64=0) where {T} - - @timeit "Randomize pseudofermion field" begin - p = ntuple(i->CUDA.randn(T, lp.bsz, 2, lp.rsz,2),4) # complex generation not suported for Julia 1.5.4 - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_assign_pf_su2!(f,p,lp,t) - end - end - - return nothing -end - -function krnl_assign_pf_su2!(f::AbstractArray, p , lp::SpaceParm, t::Int64) - - @inbounds begin - b = Int64(CUDA.threadIdx().x) - r = Int64(CUDA.blockIdx().x) - - if t == 0 - f[b,r] = Spinor(map(x->SU2fund(x[b,1,r,1] + im* x[b,1,r,2], - x[b,2,r,1] + im* x[b,2,r,2]),p)) - elseif point_time((b,r),lp) == t - f[b,r] = Spinor(map(x->SU2fund(x[b,1,r,1] + im* x[b,1,r,2], - x[b,2,r,1] + im* x[b,2,r,2]),p)) - end - - end - - return nothing -end - -export Dw!, g5Dw!, DwdagDw!, SF_bndfix!, Csw!, pfrandomize!, mtwmdpar +include("Diracfields.jl") +export SF_bndfix!, Csw!, pfrandomize! +include("Diracoper.jl") +export Dw!, g5Dw!, DwdagDw! include("DiracIO.jl") export read_prop, save_prop, read_dpar include("Diracflow.jl") -export Dslash_sq!, flw, backflow +export Nablanabla!, Dslash_sq!, flw, backflow end diff --git a/src/Dirac/Diracfields.jl b/src/Dirac/Diracfields.jl new file mode 100644 index 0000000..5559ca3 --- /dev/null +++ b/src/Dirac/Diracfields.jl @@ -0,0 +1,211 @@ + + + +""" + function Csw!(dws, U, gp, lp::SpaceParm) + +Computes the clover and stores it in dws.csw. + +""" +function Csw!(dws, U, gp, lp::SpaceParm{4,6,B,D}) where {B,D} + + @timeit "Csw computation" begin + + for i in 1:Int(lp.npls) + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_csw!(dws.csw, U, gp.Ubnd, i, lp) + end + end + end + + return nothing +end + +function krnl_csw!(csw::AbstractArray{T}, U, Ubnd, ipl, lp::SpaceParm{4,M,B,D}) where {T,M,B,D} + + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[4] + + id1, id2 = lp.plidx[ipl] + SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == 4) + OBC = (B == BC_OPEN) && ((it == 1) || (it == lp.iL[end])) + + bu1, ru1 = up((b, r), id1, lp) + bu2, ru2 = up((b, r), id2, lp) + bd1, rd1 = dw((b, r), id1, lp) + bd2, rd2 = dw((b, r), id2, lp) + bdd, rdd = dw((bd1, rd1), id2, lp) + bud, rud = dw((bu1, ru1), id2, lp) + bdu, rdu = up((bd1, rd1), id2, lp) + + if SFBC && (it == lp.iL[end]) + gt1 = Ubnd[id2] + gt2 = Ubnd[id2] + else + gt1 = U[bu1,id2,ru1] + gt2 = U[bud,id2,rud] + end + + M1 = U[b,id1,r]*gt1/(U[b,id2,r]*U[bu2,id1,ru2]) + M2 = (U[bd2,id2,rd2]\(U[bd2,id1,rd2]*gt2))/U[b,id1,r] + M3 = (U[bdd,id2,rdd]*U[bd1,id1,rd1])\(U[bdd,id1,rdd]*U[bd2,id2,rd2]) + M4 = (U[b,id2,r]/(U[bd1,id2,rd1]*U[bdu,id1,rdu]))*U[bd1,id1,rd1] + + + if !(SFBC && (it == 1)) && !OBC + csw[b,ipl,r] = 0.125*(antsym(M1)+antsym(M2)+antsym(M3)+antsym(M4)) + end + + end + + return nothing +end + + + +""" + SF_bndfix!(sp, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) + +Sets all the values of `sp` in the first time slice to zero. +""" +function SF_bndfix!(sp, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + @timeit "SF boundary fix" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_sfbndfix!(sp, lp) + end + end + return nothing +end + +function krnl_sfbndfix!(sp,lp::SpaceParm) + b=Int64(CUDA.threadIdx().x) + r=Int64(CUDA.blockIdx().x) + + if (point_time((b,r),lp) == 1) + sp[b,r] = 0.0*sp[b,r] + end + return nothing +end + +""" + SF_bndfix!(sp, lp::SpaceParm{4,6,BC_OPEN,D}) + +Sets all the values of `sp` in the first and last time slice to zero. +""" +function SF_bndfix!(sp, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + @timeit "SF boundary fix" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_opbndfix!(sp, lp) + end + end + return nothing +end + +function krnl_opbndfix!(sp,lp::SpaceParm) + b=Int64(CUDA.threadIdx().x) + r=Int64(CUDA.blockIdx().x) + + if ((point_time((b,r),lp) == 1) || (point_time((b,r),lp) == lp.iL[end])) + sp[b,r] = 0.0*sp[b,r] + end + return nothing +end + + +""" + function pfrandomize!(f::AbstractArray{Spinor{4, SU3fund / SU2fund {T}}}, lp::SpaceParm, t::Int64 = 0) + +Randomizes the SU2fund / SU3fund fermion field. If the argument t is present, it only randomizes that time-slice. +""" +function pfrandomize!(f::AbstractArray{Spinor{4, SU3fund{T}}}, lp::SpaceParm{4,6,BC_PERIODIC,D}, t::Int64 = 0) where {T,D} + + @timeit "Randomize pseudofermion field" begin + p = ntuple(i->CUDA.randn(T, lp.bsz, 3, lp.rsz,2),4)./sqrt(2) # complex generation not suported for Julia 1.5.4 + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_assign_pf_su3!(f,p,lp,t) + end + end + + return nothing +end + +function pfrandomize!(f::AbstractArray{Spinor{4, SU3fund{T}}}, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D},SpaceParm{4,6,BC_OPEN,D}}, t::Int64 = 0) where {T,D} + + @timeit "Randomize pseudofermion field" begin + p = ntuple(i->CUDA.randn(T, lp.bsz, 3, lp.rsz,2),4)./sqrt(2) # complex generation not suported for Julia 1.5.4 + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_assign_pf_su3!(f,p,lp,t) + end + end + SF_bndfix!(f,lp) + + return nothing +end + +function krnl_assign_pf_su3!(f::AbstractArray, p , lp::SpaceParm, t::Int64) + + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + + if t == 0 + f[b,r] = Spinor(map(x->SU3fund(x[b,1,r,1] + im* x[b,1,r,2], + x[b,2,r,1] + im* x[b,2,r,2], + x[b,3,r,1] + im* x[b,3,r,2]),p)) + elseif point_time((b,r),lp) == t + f[b,r] = Spinor(map(x->SU3fund(x[b,1,r,1] + im* x[b,1,r,2], + x[b,2,r,1] + im* x[b,2,r,2], + x[b,3,r,1] + im* x[b,3,r,2]),p)) + end + + end + + return nothing +end + +function pfrandomize!(f::AbstractArray{Spinor{4, SU2fund{T}}}, lp::SpaceParm{4,6,BC_PERIODIC,D}, t::Int64 = 0) where {T,D} + + @timeit "Randomize pseudofermion field" begin + p = ntuple(i->CUDA.randn(T, lp.bsz, 3, lp.rsz,2),4)./sqrt(2) # complex generation not suported for Julia 1.5.4 + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_assign_pf_su2!(f,p,lp,t) + end + end + + return nothing +end + +function pfrandomize!(f::AbstractArray{Spinor{4, SU2fund{T}}}, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D},SpaceParm{4,6,BC_OPEN,D}}, t::Int64 = 0) where {T,D} + + @timeit "Randomize pseudofermion field" begin + p = ntuple(i->CUDA.randn(T, lp.bsz, 3, lp.rsz,2),4)./sqrt(2) # complex generation not suported for Julia 1.5.4 + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_assign_pf_su2!(f,p,lp,t) + end + end + SF_bndfix!(f,lp) + + return nothing +end + +function krnl_assign_pf_su2!(f::AbstractArray, p , lp::SpaceParm, t::Int64) + + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + + if t == 0 + f[b,r] = Spinor(map(x->SU2fund(x[b,1,r,1] + im* x[b,1,r,2], + x[b,2,r,1] + im* x[b,2,r,2]),p)) + elseif point_time((b,r),lp) == t + f[b,r] = Spinor(map(x->SU2fund(x[b,1,r,1] + im* x[b,1,r,2], + x[b,2,r,1] + im* x[b,2,r,2]),p)) + end + + end + + return nothing +end diff --git a/src/Dirac/Diracflow.jl b/src/Dirac/Diracflow.jl index c3706c3..8065840 100644 --- a/src/Dirac/Diracflow.jl +++ b/src/Dirac/Diracflow.jl @@ -30,7 +30,7 @@ function flw(U, psi, int::FlowIntr{NI,T}, ns::Int64, eps, gp::GaugeParm, dpar::D ymws.mom .= int.e0[k].*ymws.mom .+ int.e1[k].*ymws.frc1 U .= expm.(U, ymws.mom, 2*eps) - end + end end end @@ -86,7 +86,7 @@ function backflow(psi, U, Dt, maxnsave::Int64, gp::GaugeParm, dpar::DiracParam, @timeit "CPU to GPU" copyto!(U,U0) for j in dsave:-1:1 - @timeit "CPU to GPU" copyto!(U,U0) + @timeit "CPU to GPU" copyto!(U,U0) for k in 1:j-1 flw(U, int, 1, eps_all[k], gp, lp, ymws) end @@ -154,83 +154,6 @@ function bflw_step!(psi, U, eps, int::FlowIntr, gp::GaugeParm, dpar::DiracParam return nothing end -""" - - function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) - -Computes /`/` \\nabla^* \\nabla /`/` `si` and stores it in `si`. - -""" -function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) where {B,D} - - @timeit "Laplacian" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Nablanabla(so, U, si, dpar.th, lp) - end - end - - return nothing -end - - - -function krnl_Nablanabla(so, U, si, th, lp::SpaceParm{4,6,B,D}) where {B,D} - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - @inbounds begin - - so[b,r] = -4*si[b,r] - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - so[b,r] += 0.5*( th[1] * (U[b,1,r]*si[bu1,ru1]) +conj(th[1]) * (U[bd1,1,rd1]\si[bd1,rd1]) + - th[2] * (U[b,2,r]*si[bu2,ru2]) +conj(th[2]) * (U[bd2,2,rd2]\si[bd2,rd2]) + - th[3] * (U[b,3,r]*si[bu3,ru3]) +conj(th[3]) * (U[bd3,3,rd3]\si[bd3,rd3]) + - th[4] * (U[b,4,r]*si[bu4,ru4]) +conj(th[4]) * (U[bd4,4,rd4]\si[bd4,rd4]) ) - end - - return nothing -end - - -function krnl_Nablanabla(so, U, si, th, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - @inbounds begin - - if (point_time((b,r),lp) != 1) - - so[b,r] = -4*si[b,r] - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - so[b,r] += 0.5*( th[1] * (U[b,1,r]*si[bu1,ru1]) +conj(th[1]) * (U[bd1,1,rd1]\si[bd1,rd1]) + - th[2] * (U[b,2,r]*si[bu2,ru2]) +conj(th[2]) * (U[bd2,2,rd2]\si[bd2,rd2]) + - th[3] * (U[b,3,r]*si[bu3,ru3]) +conj(th[3]) * (U[bd3,3,rd3]\si[bd3,rd3]) + - th[4] * (U[b,4,r]*si[bu4,ru4]) +conj(th[4]) * (U[bd4,4,rd4]\si[bd4,rd4]) ) - end - end - - return nothing -end - - function flw_adapt(U, psi, int::FlowIntr{NI,T}, tend::T, epsini::T, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, ymws::YMworkspace, dws::DiracWorkspace) where {NI,T} @@ -278,13 +201,123 @@ end flw_adapt(U, psi, int::FlowIntr{NI,T}, tend::T, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, ymws::YMworkspace, dws::DiracWorkspace) where {NI,T} = flw_adapt(U, psi, int, tend, int.eps_ini, gp, dpar, lp, ymws, dws) +""" + + function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) + +Computes /`/` \\nabla^* \\nabla /`/` `si` and stores it in `si`. + +""" +function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + @timeit "Laplacian" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Nablanabla(so, U, si, dpar.th, lp) + end + end + return nothing +end +function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D},SpaceParm{4,6,BC_OPEN,D}}) where {D} + SF_bndfix!(si,lp) + @timeit "Laplacian" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Nablanabla(so, U, si, dpar.th, lp) + end + end + SF_bndfix!(so,lp) + return nothing +end + + +function krnl_Nablanabla(so, U, si, th, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + @inbounds begin + + if ((point_time((b,r),lp) != 1) && (point_time((b,r),lp) != lp.iL[end])) + + so[b,r] = -4*si[b,r] + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + so[b,r] += 0.5*( th[1] * (U[b,1,r]*si[bu1,ru1]) +conj(th[1]) * (U[bd1,1,rd1]\si[bd1,rd1]) + + th[2] * (U[b,2,r]*si[bu2,ru2]) +conj(th[2]) * (U[bd2,2,rd2]\si[bd2,rd2]) + + th[3] * (U[b,3,r]*si[bu3,ru3]) +conj(th[3]) * (U[bd3,3,rd3]\si[bd3,rd3]) + + th[4] * (U[b,4,r]*si[bu4,ru4]) +conj(th[4]) * (U[bd4,4,rd4]\si[bd4,rd4]) ) + end + end + + return nothing +end + +function krnl_Nablanabla(so, U, si, th, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + @inbounds begin + + so[b,r] = -4*si[b,r] + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + so[b,r] += 0.5*( th[1] * (U[b,1,r]*si[bu1,ru1]) +conj(th[1]) * (U[bd1,1,rd1]\si[bd1,rd1]) + + th[2] * (U[b,2,r]*si[bu2,ru2]) +conj(th[2]) * (U[bd2,2,rd2]\si[bd2,rd2]) + + th[3] * (U[b,3,r]*si[bu3,ru3]) +conj(th[3]) * (U[bd3,3,rd3]\si[bd3,rd3]) + + th[4] * (U[b,4,r]*si[bu4,ru4]) +conj(th[4]) * (U[bd4,4,rd4]\si[bd4,rd4]) ) + end + + return nothing +end + +function krnl_Nablanabla(so, U, si, th, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + @inbounds begin + + if (point_time((b,r),lp) != 1) + + so[b,r] = -4*si[b,r] + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + so[b,r] += 0.5*( th[1] * (U[b,1,r]*si[bu1,ru1]) +conj(th[1]) * (U[bd1,1,rd1]\si[bd1,rd1]) + + th[2] * (U[b,2,r]*si[bu2,ru2]) +conj(th[2]) * (U[bd2,2,rd2]\si[bd2,rd2]) + + th[3] * (U[b,3,r]*si[bu3,ru3]) +conj(th[3]) * (U[bd3,3,rd3]\si[bd3,rd3]) + + th[4] * (U[b,4,r]*si[bu4,ru4]) +conj(th[4]) * (U[bd4,4,rd4]\si[bd4,rd4]) ) + end + end + + return nothing +end + export Nablanabla!, flw, backflow, flw_adapt, bflw_step! """ - function Dslash_sq!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) Computes /`/` //slashed{D}^2 si /`/` ans stores it in `si`. @@ -292,40 +325,40 @@ Computes /`/` //slashed{D}^2 si /`/` ans stores it in `si`. """ function Dslash_sq!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) where {B,D} - @timeit "DwdagDw" begin + @timeit "DwdagDw" begin - @timeit "g5Dslsh" begin + @timeit "g5Dslsh" begin CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dslsh!(dws.st, U, si, dpar.th, lp) end - end + end - if abs(dpar.csw) > 1.0E-10 - @timeit "Dw_improvement" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dslsh_impr!(dws.st, dws.csw, dpar.csw, si, lp) - end + if abs(dpar.csw) > 1.0E-10 + @timeit "Dw_improvement" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dslsh_impr!(dws.st, dws.csw, dpar.csw, si, lp) end end + end - @timeit "g5Dslsh" begin + @timeit "g5Dslsh" begin CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dslsh!(so, U, dws.st, dpar.th, lp) end - end + end - if abs(dpar.csw) > 1.0E-10 - @timeit "Dw_improvement" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dslsh_impr!(so, dws.csw, dpar.csw, dws.st, lp) - end + if abs(dpar.csw) > 1.0E-10 + @timeit "Dw_improvement" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dslsh_impr!(so, dws.csw, dpar.csw, dws.st, lp) end end - - end + + end + return nothing end @@ -349,12 +382,12 @@ function krnl_g5Dslsh!(so, U, si, th, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},Spac bu4, ru4 = up((b,r), 4, lp) bd4, rd4 = dw((b,r), 4, lp) - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - so[b,r] = dmul(Gamma{5}, so[b,r]) + so[b,r] = dmul(Gamma{5}, so[b,r]) end end return nothing @@ -369,19 +402,19 @@ function krnl_g5Dslsh!(so, U, si, th, lp::SpaceParm{4,6,B,D}) where {D,B} so[b,r] = 4*si[b,r] - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) so[b,r] = dmul(Gamma{5}, so[b,r]) end @@ -393,11 +426,11 @@ function krnl_g5Dslsh_impr!(so, Fcsw, csw, si, lp::SpaceParm{4,6,B,D}) where {B, @inbounds begin - b = Int64(CUDA.threadIdx().x); - r = Int64(CUDA.blockIdx().x) + b = Int64(CUDA.threadIdx().x); + r = Int64(CUDA.blockIdx().x) so[b,r] += 0.5*csw*im*dmul(Gamma{5},( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) - -Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) - Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) - Fcsw[b,6,r]*dmul(Gamma{13},si[b,r]))) + -Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) - Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) - Fcsw[b,6,r]*dmul(Gamma{13},si[b,r]))) end return nothing @@ -409,15 +442,15 @@ function krnl_g5Dslsh_impr!(so, Fcsw, csw, si, lp::Union{SpaceParm{4,6,BC_SF_ORB @inbounds begin - b = Int64(CUDA.threadIdx().x); - r = Int64(CUDA.blockIdx().x) + b = Int64(CUDA.threadIdx().x); + r = Int64(CUDA.blockIdx().x) - if (point_time((b,r),lp) != 1) + if (point_time((b,r),lp) != 1) - so[b,r] += 0.5*csw*im*dmul(Gamma{5},( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) - -Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) - Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) - Fcsw[b,6,r]*dmul(Gamma{13},si[b,r]))) - end + so[b,r] += 0.5*csw*im*dmul(Gamma{5},( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) + -Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) - Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) - Fcsw[b,6,r]*dmul(Gamma{13},si[b,r]))) + end - return nothing + return nothing end end diff --git a/src/Dirac/Diracoper.jl b/src/Dirac/Diracoper.jl new file mode 100644 index 0000000..ba363e1 --- /dev/null +++ b/src/Dirac/Diracoper.jl @@ -0,0 +1,664 @@ + + + + +## OPEN + +""" + function Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) + +Computes the Dirac operator (with the Wilson term) `\`\``D_w``\`\` with gauge field U and parameters `dpar` of the field `si` and stores it in `so`. +If `dpar.csw` is different from zero, the clover term should be stored in `dws.csw` via the Csw! function and is automatically included in the operator. + +""" +function Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + + SF_bndfix!(si,lp) + if abs(dpar.csw) > 1.0E-10 + @timeit "Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) + end + end + else + @timeit "Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) + end + end + end + SF_bndfix!(so,lp) + + return nothing +end + +function krnl_Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, ct, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + + # The field si is assumed to be zero at t = 0,T + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + if ((point_time((b,r),lp) != 1) && (point_time((b,r),lp) != lp.iL[end])) + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + im*tm*dmul(Gamma{5},si[b,r]) + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) + +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) + + + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == (lp.iL[4]-1)) + so[b,r] += (ct-1.0)*si[b,r] + end + end + end + + return nothing +end + +function krnl_Dw!(so, U, si, m0, tm, th, ct, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + + # The field si is assumed to be zero at t = 0,T + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + if ((point_time((b,r),lp) != 1) && (point_time((b,r),lp) != lp.iL[end])) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + im*tm*dmul(Gamma{5},si[b,r]) + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == (lp.iL[4]-1)) + so[b,r] += (ct-1.0)*si[b,r] + end + end + end + + return nothing +end + + +""" + function g5Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) + +Computes \`\` \\gamma_5 \`\` times the Dirac operator (with the Wilson term) with gauge field U and parameters `dpar` of the field `si` and stores it in `so`. +If `dpar.csw` is different from zero, the clover term should be stored in `dws.csw` via the Csw! function and is automatically included in the operator. +""" +function g5Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + + SF_bndfix!(si,lp) + if abs(dpar.csw) > 1.0E-10 + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) + end + end + else + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) + end + end + end + SF_bndfix!(so,lp) + + return nothing +end + +function krnl_g5Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, ct, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + + # The field si is assumed to be zero at t = 0,T + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + if ((point_time((b,r),lp) != 1) && (point_time((b,r),lp) != lp.iL[end])) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) + +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) + + + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == (lp.iL[4]-1)) + so[b,r] += (ct-1.0)*si[b,r] + end + end + end + + so[b,r] = dmul(Gamma{5}, so[b,r])+ im*tm*si[b,r] + + return nothing +end + +function krnl_g5Dw!(so, U, si, m0, tm, th, ct, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + + # The field si is assumed to be zero at t = 0,T + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + if ((point_time((b,r),lp) != 1) && (point_time((b,r),lp) != lp.iL[end])) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == (lp.iL[4]-1)) + so[b,r] += (ct-1.0)*si[b,r] + end + end + end + + so[b,r] = dmul(Gamma{5}, so[b,r]) + im*tm*si[b,r] + + return nothing +end + +""" + function DwdagDw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) + +Applies the operator \`\` \\gamma_5 D_w \`\` twice to `si` and stores the result in `so`. This is equivalent to appling the operator \`\` D_w^\\dagger D_w \`\` +The Dirac operator is the same as in the functions `Dw!` and `g5Dw!` +""" +function DwdagDw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + + SF_bndfix!(si,lp) + if abs(dpar.csw) > 1.0E-10 + @timeit "DwdagDw" begin + + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(dws.st, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) + end + end + SF_bndfix!(dws.st,lp) + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, dws.st, dws.csw, dpar.m0, -dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) + end + end + SF_bndfix!(so,lp) + end + else + @timeit "DwdagDw" begin + + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(dws.st, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) + end + end + SF_bndfix!(dws.st,lp) + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, dws.st, dpar.m0, -dpar.tm, dpar.th, dpar.ct, lp) + end + end + SF_bndfix!(so,lp) + end + end + + return nothing +end + +## PERDIODIC + +function Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + + if abs(dpar.csw) > 1.0E-10 + @timeit "Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) + end + end + else + @timeit "Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, lp) + end + end + end + + return nothing +end + +function krnl_Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r]+ im*tm*dmul(Gamma{5},si[b,r]) + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) + +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) + + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + end + + return nothing +end + +function krnl_Dw!(so, U, si, m0, tm, th, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + im*tm*dmul(Gamma{5},si[b,r]) + + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + end + + return nothing +end + +function g5Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + + if abs(dpar.csw) > 1.0E-10 + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) + end + end + else + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, lp) + end + end + end + + return nothing +end + +function krnl_g5Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) + +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) + + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + so[b,r] = dmul(Gamma{5}, so[b,r])+ im*tm*si[b,r] + end + + return nothing +end + +function krnl_g5Dw!(so, U, si, m0, tm, th, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + so[b,r] = dmul(Gamma{5}, so[b,r]) + im*tm*si[b,r] + end + + return nothing +end + +function DwdagDw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + + if abs(dpar.csw) > 1.0E-10 + @timeit "DwdagDw" begin + + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(dws.st, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) + end + end + + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, dws.st, dws.csw, dpar.m0, -dpar.tm, dpar.th, dpar.csw, lp) + end + end + end + else + @timeit "DwdagDw" begin + + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(dws.st, U, si, dpar.m0, dpar.tm, dpar.th, lp) + end + end + + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, dws.st, dpar.m0, -dpar.tm, dpar.th, lp) + end + end + end end + + return nothing +end + +## SF + +function Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + + SF_bndfix!(si,lp) + if abs(dpar.csw) > 1.0E-10 + @timeit "Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) + end + end + else + @timeit "Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) + end + end + end + + return nothing +end + +function krnl_Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, ct, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + + # The field si is assumed to be zero at t = 0 + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + if (point_time((b,r),lp) != 1) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + im*tm*dmul(Gamma{5},si[b,r]) + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) + +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) + + + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == lp.iL[4]) + so[b,r] += (ct-1.0)*si[b,r] + end + end + end + + return nothing +end + +function krnl_Dw!(so, U, si, m0, tm, th, ct, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + + # The field si is assumed to be zero at t = 0 + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + if (point_time((b,r),lp) != 1) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + im*tm*dmul(Gamma{5},si[b,r]) + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == lp.iL[4]) + so[b,r] += (ct-1.0)*si[b,r] + end + end + end + + return nothing +end + + +function g5Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + + SF_bndfix!(si,lp) + if abs(dpar.csw) > 1.0E-10 + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) + end + end + else + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) + end + end + end + + return nothing +end + +function krnl_g5Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, ct, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + + # The field si is assumed to be zero at t = 0 + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + if (point_time((b,r),lp) != 1) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) + +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) + + + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == lp.iL[4]) + so[b,r] += (ct-1.0)*si[b,r] + end + end + end + + so[b,r] = dmul(Gamma{5}, so[b,r])+ im*tm*si[b,r] + + return nothing +end + +function krnl_g5Dw!(so, U, si, m0, tm, th, ct, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + + # The field si is assumed to be zero at t = 0 + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + if (point_time((b,r),lp) != 1) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == lp.iL[4]) + so[b,r] += (ct-1.0)*si[b,r] + end + end + end + + so[b,r] = dmul(Gamma{5}, so[b,r]) + im*tm*si[b,r] + + return nothing +end + +function DwdagDw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + + if abs(dpar.csw) > 1.0E-10 + @timeit "DwdagDw" begin + + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(dws.st, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) + end + end + SF_bndfix!(dws.st,lp) + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, dws.st, dws.csw, dpar.m0, -dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) + end + end + SF_bndfix!(so,lp) + end + else + @timeit "DwdagDw" begin + + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(dws.st, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) + end + end + SF_bndfix!(dws.st,lp) + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, dws.st, dpar.m0, -dpar.tm, dpar.th, dpar.ct, lp) + end + end + SF_bndfix!(so,lp) + end + end + + return nothing +end diff --git a/src/YM/YMact.jl b/src/YM/YMact.jl index 35e965b..f9849bd 100644 --- a/src/YM/YMact.jl +++ b/src/YM/YMact.jl @@ -9,7 +9,322 @@ ### created: Mon Jul 12 18:31:19 2021 ### -function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, ztw, lp::SpaceParm{N,M,B,D}) where {T,NB,N,M,B,D} + +## +## OPEN +## +function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, ztw, lp::SpaceParm{N,M,BC_OPEN,D}) where {T,NB,N,M,D} + + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + ipl = 0 + S = zero(eltype(plx)) + @inbounds begin + for id1 in N:-1:1 + bu1, ru1 = up((b, r), id1, lp) + TOBC = (id1==N) + + for id2 = 1:id1-1 + bu2, ru2 = up((b, r), id2, lp) + ipl = ipl + 1 + + TWP = (I[id1]==1) && (I[id2]==1) + TWH2 = TWP || ( (I[id1]==lp.iL[id1]) && (I[id2]==1) ) + TWH3 = TWP || ( (I[id1]==1) && (I[id2]==lp.iL[id2]) ) + + # H2 staple + (b1, r1) = up((b,r), id1, lp) + (b2, r2) = up((b1,r1), id1, lp) + gb = U[b2,id2,r2] + + (b2, r2) = up((b1,r1), id2, lp) + h2 = (U[b1,id1,r1]*gb)/U[b2,id1,r2] + + # H3 staple + (b1, r1) = up((b,r), id2, lp) + (b2, r2) = up((b1,r1), id2, lp) + + (b3, r3) = up((b1,r1), id1, lp) + + gc = U[b3,id2,r3] + + h3 = (U[b1,id2,r1]*U[b2,id1,r2])/gc + # END staples + + ga = U[bu1,id2,ru1] + + g2 = U[b,id2,r]\U[b,id1,r] + + if ( (it == lp.iL[end]) || (it == 1) ) && !TOBC + S += 0.5*cG*(c0*tr(g2*ga/U[bu2,id1,ru2]) + c1*tr(g2*ga/h3) + c1*tr(g2*h2/U[bu2,id1,ru2])) + elseif (it == lp.iL[end]-1) && TOBC + S += c0*tr(g2*ga/U[bu2,id1,ru2]) + c1*tr(g2*ga/h3) + elseif (it == lp.iL[end]) && TOBC + nothing + else + if TWP + S += (ztw[ipl]*c0)*tr(g2*ga/U[bu2,id1,ru2]) + else + S += c0*tr(g2*ga/U[bu2,id1,ru2]) + end + if TWH2 + S += (ztw[ipl]*c1)*tr(g2*h2/U[bu2,id1,ru2]) + else + S += c1*tr(g2*h2/U[bu2,id1,ru2]) + end + if TWH3 + S += (ztw[ipl]*c1)*tr(g2*ga/h3) + else + S += c1*tr(g2*ga/h3) + end + end + + end + end + + plx[I] = S + end + + return nothing +end + +function krnl_plaq!(plx, U::AbstractArray{T}, Ubnd, cG, ztw, lp::SpaceParm{N,M,BC_OPEN,D}) where {T,N,M,D} + + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + S = zero(eltype(plx)) + ipl = 0 + for id1 in N:-1:1 + bu1, ru1 = up((b, r), id1, lp) + TOBC = (id1==N) + + for id2 = 1:id1-1 + bu2, ru2 = up((b, r), id2, lp) + ipl = ipl + 1 + TWP = (I[id1]==1) && (I[id2]==1) + + gt1 = U[bu1,id2,ru1] + + if ( (it == lp.iL[end]) || (it == 1)) && !TOBC + S += 0.5*cG*(tr(U[b,id1,r]*gt1 / (U[b,id2,r]*U[bu2,id1,ru2]))) + elseif (it == lp.iL[end]) && TOBC + nothing + else + if TWP + S += ztw[ipl]*tr(U[b,id1,r]*gt1 / (U[b,id2,r]*U[bu2,id1,ru2])) + else + S += tr(U[b,id1,r]*gt1 / (U[b,id2,r]*U[bu2,id1,ru2])) + end + end + end + end + + plx[I] = S + end + + return nothing +end + +function krnl_force_wilson_pln!(frc1, frc2, U::AbstractArray{T}, Ubnd, cG, ztw, ipl, lp::SpaceParm{N,M,BC_OPEN,D}) where {T,N,M,D} + + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + @inbounds begin + id1, id2 = lp.plidx[ipl] + bu1, ru1 = up((b, r), id1, lp) + bu2, ru2 = up((b, r), id2, lp) + TWP = (I[id1]==1)&&(I[id2]==1) + + TOBC = (id1 == N) + + gt1 = U[bu1,id2,ru1] + + g1 = gt1/U[bu2,id1,ru2] + g2 = U[b,id2,r]\U[b,id1,r] + + if !TOBC && ( (it == 1) || (it == lp.iL[end]) ) + X = 0.5*cG*projalg(U[b,id1,r]*g1/U[b,id2,r]) + + frc1[b ,id1, r ] -= X + frc1[b ,id2, r ] += X + frc2[bu1,id2,ru1] -= 0.5*cG*projalg(g1*g2) + frc2[bu2,id1,ru2] += 0.5*cG*projalg(g2*g1) + elseif TOBC && (it == lp.iL[end]) + nothing + else + if TWP + X = projalg(ztw,U[b,id1,r]*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= projalg(ztw,g1*g2) + frc2[bu2,id1,ru2] += projalg(ztw,g2*g1) + else + X = projalg(U[b,id1,r]*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= projalg(g1*g2) + frc2[bu2,id1,ru2] += projalg(g2*g1) + end + frc1[b ,id1, r ] -= X + frc1[b ,id2, r ] += X + end + end + + return nothing +end + +function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, ztw, ipl, lp::SpaceParm{N,M,BC_OPEN,D}) where {T,N,M,D} + + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + @inbounds begin + id1, id2 = lp.plidx[ipl] + bu1, ru1 = up((b, r), id1, lp) + bu2, ru2 = up((b, r), id2, lp) + + TOBC = (id1 == N) + TWP = (I[id1]==1) && (I[id2]==1) + TWH1 = TWP || ( (I[id1]==1) && (I[id2]==2) ) + TWH2 = TWP || ( (I[id1]==lp.iL[id1]) && (I[id2]==1) ) + TWH3 = TWP || ( (I[id1]==1) && (I[id2]==lp.iL[id2]) ) + TWH4 = TWP || ( (I[id1]==2) && (I[id2]==1) ) + + # H1 staple + (b1, r1) = dw((b,r), id2, lp) + (b2, r2) = up((b1,r1), id1, lp) + gc = U[b2,id2,r2] + h1 = (U[b1,id2,r1]\U[b1,id1,r1])*gc + + # H2 staple + (b1, r1) = up((b,r), id1, lp) + (b2, r2) = up((b1,r1), id1, lp) + gb = U[b2,id2,r2] + + (b2, r2) = up((b1,r1), id2, lp) + h2 = (U[b1,id1,r1]*gb)/U[b2,id1,r2] + + # H3 staple + (b1, r1) = up((b,r), id2, lp) + (b2, r2) = up((b1,r1), id2, lp) + (b3, r3) = up((b1,r1), id1, lp) + gc = U[b3,id2,r3] + h3 = (U[b1,id2,r1]*U[b2,id1,r2])/gc + + # H4 staple + (b1, r1) = dw((b,r), id1, lp) + (b2, r2) = up((b1,r1), id2, lp) + h4 = (U[b1,id1,r1]\U[b1,id2,r1])*U[b2,id1,r2] + # END staples + + ga = U[bu1,id2,ru1] + + g1 = ga/U[bu2,id1,ru2] + g2 = U[b,id2,r]\U[b,id1,r] + + if !TOBC && ( (it == 1) || (it == lp.iL[end]) ) + X = 0.5*cG*(c0*projalg(U[b,id1,r]*g1/U[b,id2,r]) + c1*projalg(U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + c1*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) ) + + frc1[b,id1,r] -= X + 0.5*cG*c1*projalg(U[b,id1,r]*g1/h4) + frc1[b,id2,r] += X + 0.5*cG*c1*projalg(h1*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= 0.5*cG*c0*projalg(g1*g2) + frc2[bu2,id1,ru2] += 0.5*cG*c0*projalg(g2*g1) + frc2[bu1,id2,ru1] -= 0.5*cG*c1*projalg((g1/U[b,id2,r])*h1) + frc2[bu2,id1,ru2] += 0.5*cG*c1*projalg((U[b,id2,r]\h1)*g1) + frc2[bu2,id1,ru2] += 0.5*cG*c1*projalg(g2*h2/U[bu2,id1,ru2]) + frc2[bu1,id2,ru1] -= 0.5*cG*c1*projalg((ga/h3)*g2) + frc2[bu1,id2,ru1] -= 0.5*cG*c1*projalg((g1/h4)*U[b,id1,r]) + frc2[bu2,id1,ru2] += 0.5*cG*c1*projalg(h4\U[b,id1,r]*g1) + elseif TOBC && (it == lp.iL[end]) + nothing + elseif TOBC && (it == 1) + X = c0*projalg(U[b,id1,r]*g1/U[b,id2,r]) + c1*projalg(U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + c1*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) + + frc1[b,id1,r] -= X + frc1[b,id2,r] += X + c1*projalg(h1*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= c0*projalg(g1*g2) + frc2[bu2,id1,ru2] += c0*projalg(g2*g1) + frc2[bu1,id2,ru1] -= c1*projalg((g1/U[b,id2,r])*h1) + frc2[bu2,id1,ru2] += c1*projalg((U[b,id2,r]\h1)*g1) + frc2[bu2,id1,ru2] += c1*projalg(g2*h2/U[bu2,id1,ru2]) + frc2[bu1,id2,ru1] -= c1*projalg((ga/h3)*g2) + elseif TOBC && (it == (lp.iL[end]-1) ) + X = c0*projalg(U[b,id1,r]*g1/U[b,id2,r]) + c1*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) + + frc1[b,id1,r] -= X + c1*projalg(U[b,id1,r]*g1/h4) + frc1[b,id2,r] += X + c1*projalg(h1*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= c0*projalg(g1*g2) + frc2[bu2,id1,ru2] += c0*projalg(g2*g1) + frc2[bu1,id2,ru1] -= c1*projalg((g1/U[b,id2,r])*h1) + frc2[bu2,id1,ru2] += c1*projalg((U[b,id2,r]\h1)*g1) + frc2[bu1,id2,ru1] -= c1*projalg((ga/h3)*g2) + frc2[bu1,id2,ru1] -= c1*projalg((g1/h4)*U[b,id1,r]) + frc2[bu2,id1,ru2] += c1*projalg(h4\U[b,id1,r]*g1) + else + if TWP + X = projalg(c0*ztw,U[b,id1,r]*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= projalg(c0*ztw,g1*g2) + frc2[bu2,id1,ru2] += projalg(c0*ztw,g2*g1) + else + X = c0*projalg(U[b,id1,r]*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= c0*projalg(g1*g2) + frc2[bu2,id1,ru2] += c0*projalg(g2*g1) + end + if TWH1 + frc1[b,id2,r] += projalg(ztw*c1,h1*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= projalg(ztw*c1,(g1/U[b,id2,r])*h1) + frc2[bu2,id1,ru2] += projalg(ztw*c1,(U[b,id2,r]\h1)*g1) + else + frc1[b,id2,r] += c1*projalg(h1*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= c1*projalg((g1/U[b,id2,r])*h1) + frc2[bu2,id1,ru2] += c1*projalg((U[b,id2,r]\h1)*g1) + end + if TWH2 + X += projalg(ztw*c1,U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + frc2[bu2,id1,ru2] += projalg(ztw*c1,g2*h2/U[bu2,id1,ru2]) + else + X += c1*projalg(U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + frc2[bu2,id1,ru2] += c1*projalg(g2*h2/U[bu2,id1,ru2]) + end + if TWH3 + X += projalg(ztw*c1,U[b,id1,r]*ga/(U[b,id2,r]*h3)) + frc2[bu1,id2,ru1] -= projalg(ztw*c1,(ga/h3)*g2) + else + X += c1*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) + frc2[bu1,id2,ru1] -= c1*projalg((ga/h3)*g2) + end + if TWH4 + frc1[b,id1,r] -= projalg(ztw*c1,U[b,id1,r]*g1/h4) + frc2[bu1,id2,ru1] -= projalg(ztw*c1,(g1/h4)*U[b,id1,r]) + frc2[bu2,id1,ru2] += projalg(ztw*c1,h4\U[b,id1,r]*g1) + else + frc1[b,id1,r] -= c1*projalg(U[b,id1,r]*g1/h4) + frc2[bu1,id2,ru1] -= c1*projalg((g1/h4)*U[b,id1,r]) + frc2[bu2,id1,ru2] += c1*projalg(h4\U[b,id1,r]*g1) + end + frc1[b,id1,r] -= X + frc1[b,id2,r] += X + + end + + end + + return nothing +end + + +## +## SF +## +function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, ztw, lp::Union{SpaceParm{N,M,BC_SF_ORBI,D},SpaceParm{N,M,BC_SF_AFWB,D}}) where {T,NB,N,M,D} b = Int64(CUDA.threadIdx().x) r = Int64(CUDA.blockIdx().x) @@ -21,8 +336,8 @@ function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, zt @inbounds begin for id1 in N:-1:1 bu1, ru1 = up((b, r), id1, lp) - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1==N) - + SFBC = (id1==N) + for id2 = 1:id1-1 bu2, ru2 = up((b, r), id2, lp) ipl = ipl + 1 @@ -30,7 +345,7 @@ function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, zt TWP = (I[id1]==1) && (I[id2]==1) TWH2 = TWP || ( (I[id1]==lp.iL[id1]) && (I[id2]==1) ) TWH3 = TWP || ( (I[id1]==1) && (I[id2]==lp.iL[id2]) ) - + # H2 staple (b1, r1) = up((b,r), id1, lp) (b2, r2) = up((b1,r1), id1, lp) @@ -39,14 +354,14 @@ function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, zt else gb = U[b2,id2,r2] end - + (b2, r2) = up((b1,r1), id2, lp) h2 = (U[b1,id1,r1]*gb)/U[b2,id1,r2] - + # H3 staple (b1, r1) = up((b,r), id2, lp) (b2, r2) = up((b1,r1), id2, lp) - + (b3, r3) = up((b1,r1), id1, lp) if SFBC && (it == lp.iL[end]) gc = Ubnd[id2] @@ -55,15 +370,15 @@ function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, zt end h3 = (U[b1,id2,r1]*U[b2,id1,r2])/gc # END staples - + if SFBC && (it == lp.iL[end]) ga = Ubnd[id2] else ga = U[bu1,id2,ru1] end - + g2 = U[b,id2,r]\U[b,id1,r] - + if (it == lp.iL[end]) && SFBC S += cG*(c0*tr(g2*ga/U[bu2,id1,ru2]) + (3*c1/2)*tr(g2*ga/h3)) elseif (it == 1) && SFBC @@ -85,17 +400,17 @@ function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, zt S += c1*tr(g2*ga/h3) end end - + end end - + plx[I] = S end - + return nothing end -function krnl_plaq!(plx, U::AbstractArray{T}, Ubnd, cG, ztw, lp::SpaceParm{N,M,B,D}) where {T,N,M,B,D} +function krnl_plaq!(plx, U::AbstractArray{T}, Ubnd, cG, ztw, lp::Union{SpaceParm{N,M,BC_SF_ORBI,D},SpaceParm{N,M,BC_SF_AFWB,D}}) where {T,N,M,D} @inbounds begin @@ -103,21 +418,20 @@ function krnl_plaq!(plx, U::AbstractArray{T}, Ubnd, cG, ztw, lp::SpaceParm{N,M,B r = Int64(CUDA.blockIdx().x) I = point_coord((b,r), lp) it = I[N] - IBND = ( ( (B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && - ( (it == 1) || (it == lp.iL[end])) ) - + IBND = ( (it == 1) || (it == lp.iL[end])) + S = zero(eltype(plx)) ipl = 0 for id1 in N:-1:1 bu1, ru1 = up((b, r), id1, lp) - SFBND = IBND && (id1 == N) + SFBND = IBND && (id1 == N) for id2 = 1:id1-1 bu2, ru2 = up((b, r), id2, lp) ipl = ipl + 1 TWP = (I[id1]==1) && (I[id2]==1) - - if SFBND && (it == lp.iL[end]) + + if SFBND && (it == lp.iL[end]) gt1 = Ubnd[id2] else gt1 = U[bu1,id2,ru1] @@ -134,46 +448,46 @@ function krnl_plaq!(plx, U::AbstractArray{T}, Ubnd, cG, ztw, lp::SpaceParm{N,M,B end end end - + plx[I] = S end - + return nothing end -function krnl_force_wilson_pln!(frc1, frc2, U::AbstractArray{T}, Ubnd, cG, ztw, ipl, lp::SpaceParm{N,M,B,D}) where {T,N,M,B,D} +function krnl_force_wilson_pln!(frc1, frc2, U::AbstractArray{T}, Ubnd, cG, ztw, ipl, lp::Union{SpaceParm{N,M,BC_SF_ORBI,D},SpaceParm{N,M,BC_SF_AFWB,D}}) where {T,N,M,D} b = Int64(CUDA.threadIdx().x) r = Int64(CUDA.blockIdx().x) I = point_coord((b,r), lp) it = I[N] - + @inbounds begin id1, id2 = lp.plidx[ipl] bu1, ru1 = up((b, r), id1, lp) bu2, ru2 = up((b, r), id2, lp) TWP = (I[id1]==1)&&(I[id2]==1) - - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == N) - + + SFBC = (id1 == N) + if SFBC && (it == lp.iL[end]) gt1 = Ubnd[id2] else gt1 = U[bu1,id2,ru1] end - + g1 = gt1/U[bu2,id1,ru2] g2 = U[b,id2,r]\U[b,id1,r] - + if SFBC && (it == 1) X = cG*projalg(U[b,id1,r]*g1/U[b,id2,r]) - + frc1[b ,id1, r ] -= X frc2[bu1,id2,ru1] -= cG*projalg(g1*g2) frc2[bu2,id1,ru2] += cG*projalg(g2*g1) elseif SFBC && (it == lp.iL[end]) X = cG*projalg(U[b,id1,r]*g1/U[b,id2,r]) - + frc1[b ,id1, r ] -= X frc1[b ,id2, r ] += X frc2[bu2,id1,ru2] += cG*projalg(g2*g1) @@ -191,29 +505,29 @@ function krnl_force_wilson_pln!(frc1, frc2, U::AbstractArray{T}, Ubnd, cG, ztw, frc1[b ,id2, r ] += X end end - + return nothing end -function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, ztw, ipl, lp::SpaceParm{N,M,B,D}) where {T,N,M,B,D} +function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, ztw, ipl, lp::Union{SpaceParm{N,M,BC_SF_ORBI,D},SpaceParm{N,M,BC_SF_AFWB,D}}) where {T,N,M,D} b = Int64(CUDA.threadIdx().x) r = Int64(CUDA.blockIdx().x) I = point_coord((b,r), lp) it = I[N] - + @inbounds begin id1, id2 = lp.plidx[ipl] bu1, ru1 = up((b, r), id1, lp) bu2, ru2 = up((b, r), id2, lp) - - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == N) + + SFBC = (id1 == N) TWP = (I[id1]==1) && (I[id2]==1) TWH1 = TWP || ( (I[id1]==1) && (I[id2]==2) ) TWH2 = TWP || ( (I[id1]==lp.iL[id1]) && (I[id2]==1) ) TWH3 = TWP || ( (I[id1]==1) && (I[id2]==lp.iL[id2]) ) TWH4 = TWP || ( (I[id1]==2) && (I[id2]==1) ) - + # H1 staple (b1, r1) = dw((b,r), id2, lp) (b2, r2) = up((b1,r1), id1, lp) @@ -223,7 +537,7 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, gc = U[b2,id2,r2] end h1 = (U[b1,id2,r1]\U[b1,id1,r1])*gc - + # H2 staple (b1, r1) = up((b,r), id1, lp) (b2, r2) = up((b1,r1), id1, lp) @@ -232,10 +546,10 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, else gb = U[b2,id2,r2] end - + (b2, r2) = up((b1,r1), id2, lp) h2 = (U[b1,id1,r1]*gb)/U[b2,id1,r2] - + # H3 staple (b1, r1) = up((b,r), id2, lp) (b2, r2) = up((b1,r1), id2, lp) @@ -246,42 +560,42 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, gc = U[b3,id2,r3] end h3 = (U[b1,id2,r1]*U[b2,id1,r2])/gc - + # H4 staple (b1, r1) = dw((b,r), id1, lp) (b2, r2) = up((b1,r1), id2, lp) h4 = (U[b1,id1,r1]\U[b1,id2,r1])*U[b2,id1,r2] # END staples - + if SFBC && (it == lp.iL[end]) ga = Ubnd[id2] else ga = U[bu1,id2,ru1] end - + g1 = ga/U[bu2,id1,ru2] g2 = U[b,id2,r]\U[b,id1,r] - + if SFBC && (it == 1) X = (cG*c0)*projalg(U[b,id1,r]*g1/U[b,id2,r]) + c1*projalg(U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + - (3*c1*cG/2)*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) - + (3*c1*cG/2)*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) + frc1[b,id1,r] -= X - + frc2[bu1,id2,ru1] -= (cG*c0)*projalg(g1*g2) + (3*c1*cG/2)*projalg((ga/h3)*g2) + (3*c1*cG/2)*projalg((g1/U[b,id2,r])*h1) - + frc2[bu2,id1,ru2] += (cG*c0)*projalg(g2*g1) + (3*c1*cG/2) * projalg((U[b,id2,r]\h1)*g1) + - c1*projalg(g2*h2/U[bu2,id1,ru2]) + c1*projalg(g2*h2/U[bu2,id1,ru2]) elseif SFBC && (it == lp.iL[end]) X = (cG*c0)*projalg(U[b,id1,r]*g1/U[b,id2,r]) + - (3*c1*cG/2) * (projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3))) - - frc1[b,id1,r] -= X + c1*projalg(U[b,id1,r]*g1/h4) - frc1[b,id2,r] += X + (3*c1*cG/2)*projalg(h1*g1/U[b,id2,r]) - + (3*c1*cG/2) * (projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3))) + + frc1[b,id1,r] -= X + c1*projalg(U[b,id1,r]*g1/h4) + frc1[b,id2,r] += X + (3*c1*cG/2)*projalg(h1*g1/U[b,id2,r]) + frc2[bu2,id1,ru2] += (cG*c0)*projalg(g2*g1) + (3*c1*cG/2) * projalg((U[b,id2,r]\h1)*g1) + - c1 * projalg(h4\U[b,id1,r]*g1) + c1 * projalg(h4\U[b,id1,r]*g1) else if TWP X = projalg(c0*ztw,U[b,id1,r]*g1/U[b,id2,r]) @@ -294,11 +608,11 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, end if TWH1 frc1[b,id2,r] += projalg(ztw*c1,h1*g1/U[b,id2,r]) - frc2[bu1,id2,ru1] -= projalg(ztw*c1,(g1/U[b,id2,r])*h1) + frc2[bu1,id2,ru1] -= projalg(ztw*c1,(g1/U[b,id2,r])*h1) frc2[bu2,id1,ru2] += projalg(ztw*c1,(U[b,id2,r]\h1)*g1) else frc1[b,id2,r] += c1*projalg(h1*g1/U[b,id2,r]) - frc2[bu1,id2,ru1] -= c1*projalg((g1/U[b,id2,r])*h1) + frc2[bu1,id2,ru1] -= c1*projalg((g1/U[b,id2,r])*h1) frc2[bu2,id1,ru2] += c1*projalg((U[b,id2,r]\h1)*g1) end if TWH2 @@ -310,27 +624,274 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, end if TWH3 X += projalg(ztw*c1,U[b,id1,r]*ga/(U[b,id2,r]*h3)) - frc2[bu1,id2,ru1] -= projalg(ztw*c1,(ga/h3)*g2) + frc2[bu1,id2,ru1] -= projalg(ztw*c1,(ga/h3)*g2) else X += c1*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) - frc2[bu1,id2,ru1] -= c1*projalg((ga/h3)*g2) + frc2[bu1,id2,ru1] -= c1*projalg((ga/h3)*g2) end if TWH4 - frc1[b,id1,r] -= projalg(ztw*c1,U[b,id1,r]*g1/h4) + frc1[b,id1,r] -= projalg(ztw*c1,U[b,id1,r]*g1/h4) frc2[bu1,id2,ru1] -= projalg(ztw*c1,(g1/h4)*U[b,id1,r]) - frc2[bu2,id1,ru2] += projalg(ztw*c1,h4\U[b,id1,r]*g1) + frc2[bu2,id1,ru2] += projalg(ztw*c1,h4\U[b,id1,r]*g1) else - frc1[b,id1,r] -= c1*projalg(U[b,id1,r]*g1/h4) + frc1[b,id1,r] -= c1*projalg(U[b,id1,r]*g1/h4) frc2[bu1,id2,ru1] -= c1*projalg((g1/h4)*U[b,id1,r]) - frc2[bu2,id1,ru2] += c1*projalg(h4\U[b,id1,r]*g1) + frc2[bu2,id1,ru2] += c1*projalg(h4\U[b,id1,r]*g1) + end + frc1[b,id1,r] -= X + frc1[b,id2,r] += X + + end + + end + + return nothing +end + + + +## +## PERIODIC +## +function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, ztw, lp::SpaceParm{N,M,BC_PERIODIC,D}) where {T,NB,N,M,D} + + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + ipl = 0 + S = zero(eltype(plx)) + @inbounds begin + for id1 in N:-1:1 + bu1, ru1 = up((b, r), id1, lp) + + for id2 = 1:id1-1 + bu2, ru2 = up((b, r), id2, lp) + ipl = ipl + 1 + + TWP = (I[id1]==1) && (I[id2]==1) + TWH2 = TWP || ( (I[id1]==lp.iL[id1]) && (I[id2]==1) ) + TWH3 = TWP || ( (I[id1]==1) && (I[id2]==lp.iL[id2]) ) + + # H2 staple + (b1, r1) = up((b,r), id1, lp) + (b2, r2) = up((b1,r1), id1, lp) + gb = U[b2,id2,r2] + + (b2, r2) = up((b1,r1), id2, lp) + h2 = (U[b1,id1,r1]*gb)/U[b2,id1,r2] + + # H3 staple + (b1, r1) = up((b,r), id2, lp) + (b2, r2) = up((b1,r1), id2, lp) + + (b3, r3) = up((b1,r1), id1, lp) + + gc = U[b3,id2,r3] + + h3 = (U[b1,id2,r1]*U[b2,id1,r2])/gc + # END staples + + ga = U[bu1,id2,ru1] + + g2 = U[b,id2,r]\U[b,id1,r] + + if TWP + S += (ztw[ipl]*c0)*tr(g2*ga/U[bu2,id1,ru2]) + else + S += c0*tr(g2*ga/U[bu2,id1,ru2]) + end + if TWH2 + S += (ztw[ipl]*c1)*tr(g2*h2/U[bu2,id1,ru2]) + else + S += c1*tr(g2*h2/U[bu2,id1,ru2]) + end + if TWH3 + S += (ztw[ipl]*c1)*tr(g2*ga/h3) + else + S += c1*tr(g2*ga/h3) + end + end - frc1[b,id1,r] -= X - frc1[b,id2,r] += X - end + plx[I] = S end + + return nothing +end + +function krnl_plaq!(plx, U::AbstractArray{T}, Ubnd, cG, ztw, lp::SpaceParm{N,M,BC_PERIODIC,D}) where {T,N,M,D} + + + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + S = zero(eltype(plx)) + ipl = 0 + for id1 in N:-1:1 + bu1, ru1 = up((b, r), id1, lp) + + for id2 = 1:id1-1 + bu2, ru2 = up((b, r), id2, lp) + ipl = ipl + 1 + TWP = (I[id1]==1) && (I[id2]==1) + + gt1 = U[bu1,id2,ru1] + + if TWP + S += ztw[ipl]*tr(U[b,id1,r]*gt1 / (U[b,id2,r]*U[bu2,id1,ru2])) + else + S += tr(U[b,id1,r]*gt1 / (U[b,id2,r]*U[bu2,id1,ru2])) + end + end + end + plx[I] = S + end + + return nothing +end + +function krnl_force_wilson_pln!(frc1, frc2, U::AbstractArray{T}, Ubnd, cG, ztw, ipl, lp::SpaceParm{N,M,BC_PERIODIC,D}) where {T,N,M,D} + + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + @inbounds begin + id1, id2 = lp.plidx[ipl] + bu1, ru1 = up((b, r), id1, lp) + bu2, ru2 = up((b, r), id2, lp) + TWP = (I[id1]==1)&&(I[id2]==1) + + gt1 = U[bu1,id2,ru1] + + g1 = gt1/U[bu2,id1,ru2] + g2 = U[b,id2,r]\U[b,id1,r] + + if TWP + X = projalg(ztw,U[b,id1,r]*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= projalg(ztw,g1*g2) + frc2[bu2,id1,ru2] += projalg(ztw,g2*g1) + else + X = projalg(U[b,id1,r]*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= projalg(g1*g2) + frc2[bu2,id1,ru2] += projalg(g2*g1) + end + frc1[b ,id1, r ] -= X + frc1[b ,id2, r ] += X + end + + return nothing +end + +function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, ztw, ipl, lp::SpaceParm{N,M,BC_PERIODIC,D}) where {T,N,M,D} + + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + @inbounds begin + id1, id2 = lp.plidx[ipl] + bu1, ru1 = up((b, r), id1, lp) + bu2, ru2 = up((b, r), id2, lp) + + TWP = (I[id1]==1) && (I[id2]==1) + TWH1 = TWP || ( (I[id1]==1) && (I[id2]==2) ) + TWH2 = TWP || ( (I[id1]==lp.iL[id1]) && (I[id2]==1) ) + TWH3 = TWP || ( (I[id1]==1) && (I[id2]==lp.iL[id2]) ) + TWH4 = TWP || ( (I[id1]==2) && (I[id2]==1) ) + + # H1 staple + (b1, r1) = dw((b,r), id2, lp) + (b2, r2) = up((b1,r1), id1, lp) + + gc = U[b2,id2,r2] + + h1 = (U[b1,id2,r1]\U[b1,id1,r1])*gc + + # H2 staple + (b1, r1) = up((b,r), id1, lp) + (b2, r2) = up((b1,r1), id1, lp) + + gb = U[b2,id2,r2] + + (b2, r2) = up((b1,r1), id2, lp) + h2 = (U[b1,id1,r1]*gb)/U[b2,id1,r2] + + # H3 staple + (b1, r1) = up((b,r), id2, lp) + (b2, r2) = up((b1,r1), id2, lp) + (b3, r3) = up((b1,r1), id1, lp) + + gc = U[b3,id2,r3] + h3 = (U[b1,id2,r1]*U[b2,id1,r2])/gc + + # H4 staple + (b1, r1) = dw((b,r), id1, lp) + (b2, r2) = up((b1,r1), id2, lp) + h4 = (U[b1,id1,r1]\U[b1,id2,r1])*U[b2,id1,r2] + # END staples + + ga = U[bu1,id2,ru1] + + g1 = ga/U[bu2,id1,ru2] + g2 = U[b,id2,r]\U[b,id1,r] + + if TWP + X = projalg(c0*ztw,U[b,id1,r]*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= projalg(c0*ztw,g1*g2) + frc2[bu2,id1,ru2] += projalg(c0*ztw,g2*g1) + else + X = c0*projalg(U[b,id1,r]*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= c0*projalg(g1*g2) + frc2[bu2,id1,ru2] += c0*projalg(g2*g1) + end + if TWH1 + frc1[b,id2,r] += projalg(ztw*c1,h1*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= projalg(ztw*c1,(g1/U[b,id2,r])*h1) + frc2[bu2,id1,ru2] += projalg(ztw*c1,(U[b,id2,r]\h1)*g1) + else + frc1[b,id2,r] += c1*projalg(h1*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= c1*projalg((g1/U[b,id2,r])*h1) + frc2[bu2,id1,ru2] += c1*projalg((U[b,id2,r]\h1)*g1) + end + if TWH2 + X += projalg(ztw*c1,U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + frc2[bu2,id1,ru2] += projalg(ztw*c1,g2*h2/U[bu2,id1,ru2]) + else + X += c1*projalg(U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + frc2[bu2,id1,ru2] += c1*projalg(g2*h2/U[bu2,id1,ru2]) + end + if TWH3 + X += projalg(ztw*c1,U[b,id1,r]*ga/(U[b,id2,r]*h3)) + frc2[bu1,id2,ru1] -= projalg(ztw*c1,(ga/h3)*g2) + else + X += c1*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) + frc2[bu1,id2,ru1] -= c1*projalg((ga/h3)*g2) + end + if TWH4 + frc1[b,id1,r] -= projalg(ztw*c1,U[b,id1,r]*g1/h4) + frc2[bu1,id2,ru1] -= projalg(ztw*c1,(g1/h4)*U[b,id1,r]) + frc2[bu2,id1,ru2] += projalg(ztw*c1,h4\U[b,id1,r]*g1) + else + frc1[b,id1,r] -= c1*projalg(U[b,id1,r]*g1/h4) + frc2[bu1,id2,ru1] -= c1*projalg((g1/h4)*U[b,id1,r]) + frc2[bu2,id1,ru2] += c1*projalg(h4\U[b,id1,r]*g1) + end + frc1[b,id1,r] -= X + frc1[b,id2,r] += X + + + end + return nothing end @@ -388,4 +949,3 @@ function force_pln!(frc1, ftmp, U, Ubnd, cG, ztw, lp::SpaceParm, c0=1) return nothing end - diff --git a/src/YM/YMfields.jl b/src/YM/YMfields.jl index e22625a..95d6777 100644 --- a/src/YM/YMfields.jl +++ b/src/YM/YMfields.jl @@ -15,7 +15,7 @@ Given an algebra field with natural indexing, this routine sets the components to random Gaussian distributed values. If SF boundary conditions are used, the force at the boundaries is set to zero. """ function randomize!(f, lp::SpaceParm, ymws::YMworkspace) - + if ymws.ALG == SU2alg @timeit "Randomize SU(2) algebra field" begin m = CUDA.randn(ymws.PRC, lp.bsz,lp.ndim,3,lp.rsz) @@ -54,31 +54,44 @@ function krnl_assign_SU3!(frc::AbstractArray{T}, m, lp::SpaceParm{N,M,BC_PERIODI return nothing end -function krnl_assign_SU3!(frc::AbstractArray{T}, m, lp::SpaceParm{N,M,B,D}) where {T,N,M,B,D} +function krnl_assign_SU3!(frc::AbstractArray{T}, m, lp::SpaceParm{N,M,BC_OPEN,D}) where {T,N,M,D} + + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + for id in 1:lp.ndim + frc[b,id,r] = SU3alg(m[b,id,1,r], m[b,id,2,r], m[b,id,3,r], + m[b,id,4,r], m[b,id,5,r], m[b,id,6,r], + m[b,id,7,r], m[b,id,8,r]) + end + end + + return nothing +end + +function krnl_assign_SU3!(frc::AbstractArray{T}, m, lp::Union{SpaceParm{N,M,BC_SF_ORBI,D},SpaceParm{N,M,BC_SF_AFWB,D}}) where {T,N,M,D} @inbounds begin b = Int64(CUDA.threadIdx().x) r = Int64(CUDA.blockIdx().x) it = point_time((b,r), lp) - if ((B==BC_SF_AFWB)||(B==BC_SF_ORBI)) - if it == 1 - for id in 1:lp.ndim-1 - frc[b,id,r] = zero(T) - end - frc[b,N,r] = SU3alg(m[b,N,1,r], m[b,N,2,r], m[b,N,3,r], - m[b,N,4,r], m[b,N,5,r], m[b,N,6,r], - m[b,N,7,r], m[b,N,8,r]) - else - for id in 1:lp.ndim - frc[b,id,r] = SU3alg(m[b,id,1,r], m[b,id,2,r], m[b,id,3,r], - m[b,id,4,r], m[b,id,5,r], m[b,id,6,r], - m[b,id,7,r], m[b,id,8,r]) - end + if it == 1 + for id in 1:lp.ndim-1 + frc[b,id,r] = zero(T) + end + frc[b,N,r] = SU3alg(m[b,N,1,r], m[b,N,2,r], m[b,N,3,r], + m[b,N,4,r], m[b,N,5,r], m[b,N,6,r], + m[b,N,7,r], m[b,N,8,r]) + else + for id in 1:lp.ndim + frc[b,id,r] = SU3alg(m[b,id,1,r], m[b,id,2,r], m[b,id,3,r], + m[b,id,4,r], m[b,id,5,r], m[b,id,6,r], + m[b,id,7,r], m[b,id,8,r]) end end end - + return nothing end diff --git a/src/YM/YMflow.jl b/src/YM/YMflow.jl index 370c113..4f841c3 100644 --- a/src/YM/YMflow.jl +++ b/src/YM/YMflow.jl @@ -134,7 +134,8 @@ function krnl_add_zth!(frc, frc2::AbstractArray{TA}, U::AbstractArray{TG}, lp::S r = Int64(CUDA.blockIdx().x) it = point_time((b, r), lp) - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) + SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) + OBC = (B == BC_OPEN) @inbounds for id in 1:N bu, ru = up((b,r), id, lp) @@ -152,13 +153,21 @@ function krnl_add_zth!(frc, frc2::AbstractArray{TA}, U::AbstractArray{TG}, lp::S frc2[b,id,r] = (5/6)*frc[b,id,r] + (1/6)*(projalg(Ud\Y*Ud) + projalg(U[b,id,r]*X/U[b,id,r])) end - else + end + if OBC + if (it > 1) && (it < lp.iL[end]) + frc2[b,id,r] = (5/6)*frc[b,id,r] + (1/6)*(projalg(Ud\Y*Ud) + + projalg(U[b,id,r]*X/U[b,id,r])) + elseif ((it == lp.iL[end]) || (it == 1)) && (id < N) + frc2[b,id,r] = (5/6)*frc[b,id,r] + (1/6)*(projalg(Ud\Y*Ud) + + projalg(U[b,id,r]*X/U[b,id,r])) + end + else frc2[b,id,r] = (5/6)*frc[b,id,r] + (1/6)*(projalg(Ud\Y*Ud) + projalg(U[b,id,r]*X/U[b,id,r])) end end end - return nothing end @@ -264,7 +273,8 @@ function Eoft_plaq(Eslc, U, gp::GaugeParm{T,G,NN}, lp::SpaceParm{N,M,B,D}, ymws: @timeit "E(t) plaquette measurement" begin ztw = ztwist(gp, lp) - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) + SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) + OBC = (B == BC_OPEN) tp = ntuple(i->i, N-1) V3 = prod(lp.iL[1:end-1]) @@ -285,6 +295,10 @@ function Eoft_plaq(Eslc, U, gp::GaugeParm{T,G,NN}, lp::SpaceParm{N,M,B,D}, ymws: if !SFBC Eslc[1,ipl] = Etmp[1] + Etmp[end] end + if OBC ## Check normalization of timelike boundary plaquettes + Eslc[end,ipl] = Etmp[end-1] + Eslc[1,ipl] = Etmp[1] + end else for it in 1:lp.iL[end] Eslc[it,ipl] = 2*Etmp[it] @@ -327,7 +341,6 @@ function krnl_plaq_pln!(plx, U::AbstractArray{T}, Ubnd, ztw, ipl, lp::SpaceParm{ plx[I] = tr(U[b,id1,r]*gt / (U[b,id2,r]*U[bu2,id1,ru2])) end end - return nothing end @@ -350,21 +363,18 @@ function Qtop(Qslc, U, gp::GaugeParm, lp::SpaceParm{4,M,B,D}, ymws::YMworkspace) CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_add_qd!(ymws.rm, -, ymws.frc1, ymws.frc2, lp) end - CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_field_tensor!(ymws.frc1, ymws.frc2, U, gp.Ubnd, 2,4, ztw[2], ztw[4], lp) end CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_add_qd!(ymws.rm, +, ymws.frc1, ymws.frc2, lp) end - CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_field_tensor!(ymws.frc1, ymws.frc2, U, gp.Ubnd, 3,6, ztw[3], ztw[6], lp) end CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_add_qd!(ymws.rm, -, ymws.frc1, ymws.frc2, lp) end - Qslc .= reshape(Array(CUDA.reduce(+, ymws.rm; dims=tp)),lp.iL[end])./(32*pi^2) end @@ -445,7 +455,7 @@ function krnl_add_et!(rm, frc1, lp::SpaceParm{4,M,B,D}) where {M,B,D} I = point_coord((b,r), lp) rm[I] = dot(X1,X1) end - + return nothing end @@ -474,6 +484,7 @@ function krnl_field_tensor!(frc1::AbstractArray{TA}, frc2, U::AbstractArray{T}, #First plane id1, id2 = lp.plidx[ipl1] SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == 4) + OBC = ((B == BC_OPEN) && (id1 == 4)) TWP = ((I[id1]==1)&&(I[id2]==1)) bu1, ru1 = up((b, r), id1, lp) @@ -493,6 +504,11 @@ function krnl_field_tensor!(frc1::AbstractArray{TA}, frc2, U::AbstractArray{T}, frc1[bu1,2,ru1] = zero(TA) frc1[bd,3,rd] = zero(TA) frc1[bu2,4,ru2] = projalg(l2*l1) + elseif OBC && (it == lp.iL[end]) + frc1[b,1,r] = projalg(U[b,id1,r]*l1/U[b,id2,r]) + frc1[bu1,2,ru1] = zero(TA) + frc1[bd,3,rd] = zero(TA) + frc1[bu2,4,ru2] = projalg(l2*l1) else if TWP frc1[b,1,r] = projalg(ztw1, U[b,id1,r]*l1/U[b,id2,r]) @@ -510,6 +526,7 @@ function krnl_field_tensor!(frc1::AbstractArray{TA}, frc2, U::AbstractArray{T}, # Second plane id1, id2 = lp.plidx[ipl2] SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == 4) + OBC = ((B == BC_OPEN) && (id1 == 4)) TWP = ((I[id1]==1)&&(I[id2]==1)) bu1, ru1 = up((b, r), id1, lp) @@ -529,6 +546,11 @@ function krnl_field_tensor!(frc1::AbstractArray{TA}, frc2, U::AbstractArray{T}, frc2[bu1,2,ru1] = zero(TA) frc2[bd,3,rd] = zero(TA) frc2[bu2,4,ru2] = projalg(l2*l1) + elseif OBC && (it == lp.iL[end]) + frc1[b,1,r] = projalg(U[b,id1,r]*l1/U[b,id2,r]) + frc1[bu1,2,ru1] = zero(TA) + frc1[bd,3,rd] = zero(TA) + frc1[bu2,4,ru2] = projalg(l2*l1) else if TWP frc2[b,1,r] = projalg(ztw2, U[b,id1,r]*l1/U[b,id2,r]) @@ -543,7 +565,5 @@ function krnl_field_tensor!(frc1::AbstractArray{TA}, frc2, U::AbstractArray{T}, end end end - return nothing end -