From 48b3bf45378b961e378b913375ac2c7f317e2194 Mon Sep 17 00:00:00 2001 From: "Fernando P.Panadero" Date: Thu, 16 May 2024 14:43:29 +0200 Subject: [PATCH 01/14] YM OBC first test --- src/YM/YMact.jl | 686 ++++++++++++++++++++++++++++++++++++++++++----- src/YM/YMflow.jl | 42 ++- 2 files changed, 650 insertions(+), 78 deletions(-) diff --git a/src/YM/YMact.jl b/src/YM/YMact.jl index 35e965b..e0ec614 100644 --- a/src/YM/YMact.jl +++ b/src/YM/YMact.jl @@ -9,7 +9,314 @@ ### created: Mon Jul 12 18:31:19 2021 ### -function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, ztw, lp::SpaceParm{N,M,B,D}) where {T,NB,N,M,B,D} + +## +## OPEN DEBERIA ESTAR BIEN LAS ACCIONES, CHECKEAR. LAS FUERZAS FALTAN +## +function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, ztw, lp::SpaceParm{N,M,BC_OPEN,D}) where {T,NB,N,M,D} + + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + ipl = 0 + S = zero(eltype(plx)) + @inbounds begin + for id1 in N:-1:1 + bu1, ru1 = up((b, r), id1, lp) + TOBC = (id1==N) + + for id2 = 1:id1-1 + bu2, ru2 = up((b, r), id2, lp) + ipl = ipl + 1 + + TWP = (I[id1]==1) && (I[id2]==1) + TWH2 = TWP || ( (I[id1]==lp.iL[id1]) && (I[id2]==1) ) + TWH3 = TWP || ( (I[id1]==1) && (I[id2]==lp.iL[id2]) ) + + # H2 staple + (b1, r1) = up((b,r), id1, lp) + (b2, r2) = up((b1,r1), id1, lp) + gb = U[b2,id2,r2] + + (b2, r2) = up((b1,r1), id2, lp) + h2 = (U[b1,id1,r1]*gb)/U[b2,id1,r2] + + # H3 staple + (b1, r1) = up((b,r), id2, lp) + (b2, r2) = up((b1,r1), id2, lp) + + (b3, r3) = up((b1,r1), id1, lp) + + gc = U[b3,id2,r3] + + h3 = (U[b1,id2,r1]*U[b2,id1,r2])/gc + # END staples + + ga = U[bu1,id2,ru1] + + g2 = U[b,id2,r]\U[b,id1,r] + + if ( (it == lp.iL[end]) || (it == 1) ) && !TOBC + S += 0.5*cG*(c0*tr(g2*ga/U[bu2,id1,ru2]) + c1*tr(g2*ga/h3) + c1*tr(g2*h2/U[bu2,id1,ru2])) + elseif (it == lp.iL[end]-1) && TOBC + S += c0*tr(g2*ga/U[bu2,id1,ru2]) + c1*tr(g2*ga/h3) + elseif (it == lp.iL[end]) && TOBC + nothing + else + if TWP + S += (ztw[ipl]*c0)*tr(g2*ga/U[bu2,id1,ru2]) + else + S += c0*tr(g2*ga/U[bu2,id1,ru2]) + end + if TWH2 + S += (ztw[ipl]*c1)*tr(g2*h2/U[bu2,id1,ru2]) + else + S += c1*tr(g2*h2/U[bu2,id1,ru2]) + end + if TWH3 + S += (ztw[ipl]*c1)*tr(g2*ga/h3) + else + S += c1*tr(g2*ga/h3) + end + end + + end + end + + plx[I] = S + end + + return nothing +end + +function krnl_plaq!(plx, U::AbstractArray{T}, Ubnd, cG, ztw, lp::SpaceParm{N,M,BC_OPEN,D}) where {T,N,M,D} + + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + S = zero(eltype(plx)) + ipl = 0 + for id1 in N:-1:1 + bu1, ru1 = up((b, r), id1, lp) + TOBC = (id1==N) + + for id2 = 1:id1-1 + bu2, ru2 = up((b, r), id2, lp) + ipl = ipl + 1 + TWP = (I[id1]==1) && (I[id2]==1) + + gt1 = U[bu1,id2,ru1] + + if ( (it == lp.iL[end]) || (it == 1)) && !TOBC + S += 0.5*cG*(c0*tr(g2*ga/U[bu2,id1,ru2])) + elseif (it == lp.iL[end]) && TOBC + nothing + else + if TWP + S += ztw[ipl]*tr(U[b,id1,r]*gt1 / (U[b,id2,r]*U[bu2,id1,ru2])) + else + S += tr(U[b,id1,r]*gt1 / (U[b,id2,r]*U[bu2,id1,ru2])) + end + end + end + end + + plx[I] = S + end + + return nothing +end + +function krnl_force_wilson_pln!(frc1, frc2, U::AbstractArray{T}, Ubnd, cG, ztw, ipl, lp::SpaceParm{N,M,BC_OPEN,D}) where {T,N,M,D} + + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + @inbounds begin + id1, id2 = lp.plidx[ipl] + bu1, ru1 = up((b, r), id1, lp) + bu2, ru2 = up((b, r), id2, lp) + TWP = (I[id1]==1)&&(I[id2]==1) + + TOBC = (id1 == N) + + gt1 = U[bu1,id2,ru1] + + g1 = gt1/U[bu2,id1,ru2] + g2 = U[b,id2,r]\U[b,id1,r] + + if !TOBC && ( (it == 1) || (it == lp.iL[end]) ) + X = 0.5*cG*projalg(U[b,id1,r]*g1/U[b,id2,r]) + + frc1[b ,id1, r ] -= X + frc1[b ,id2, r ] += X + frc2[bu1,id2,ru1] -= 0.5*cG*projalg(g1*g2) + frc2[bu2,id1,ru2] += 0.5*cG*projalg(g2*g1) + elseif TOBC && (it == lp.iL[end]) + nothing + else + if TWP + X = projalg(ztw,U[b,id1,r]*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= projalg(ztw,g1*g2) + frc2[bu2,id1,ru2] += projalg(ztw,g2*g1) + else + X = projalg(U[b,id1,r]*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= projalg(g1*g2) + frc2[bu2,id1,ru2] += projalg(g2*g1) + end + frc1[b ,id1, r ] -= X + frc1[b ,id2, r ] += X + end + end + + return nothing +end + +function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, ztw, ipl, lp::SpaceParm{N,M,BC_OPEN,D}) where {T,N,M,D} + + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + @inbounds begin + id1, id2 = lp.plidx[ipl] + bu1, ru1 = up((b, r), id1, lp) + bu2, ru2 = up((b, r), id2, lp) + + TOBC = (id1 == N) + TWP = (I[id1]==1) && (I[id2]==1) + TWH1 = TWP || ( (I[id1]==1) && (I[id2]==2) ) + TWH2 = TWP || ( (I[id1]==lp.iL[id1]) && (I[id2]==1) ) + TWH3 = TWP || ( (I[id1]==1) && (I[id2]==lp.iL[id2]) ) + TWH4 = TWP || ( (I[id1]==2) && (I[id2]==1) ) + + # H1 staple + (b1, r1) = dw((b,r), id2, lp) + (b2, r2) = up((b1,r1), id1, lp) + gc = U[b2,id2,r2] + h1 = (U[b1,id2,r1]\U[b1,id1,r1])*gc + + # H2 staple + (b1, r1) = up((b,r), id1, lp) + (b2, r2) = up((b1,r1), id1, lp) + gb = U[b2,id2,r2] + + (b2, r2) = up((b1,r1), id2, lp) + h2 = (U[b1,id1,r1]*gb)/U[b2,id1,r2] + + # H3 staple + (b1, r1) = up((b,r), id2, lp) + (b2, r2) = up((b1,r1), id2, lp) + (b3, r3) = up((b1,r1), id1, lp) + gc = U[b3,id2,r3] + h3 = (U[b1,id2,r1]*U[b2,id1,r2])/gc + + # H4 staple + (b1, r1) = dw((b,r), id1, lp) + (b2, r2) = up((b1,r1), id2, lp) + h4 = (U[b1,id1,r1]\U[b1,id2,r1])*U[b2,id1,r2] + # END staples + + ga = U[bu1,id2,ru1] + + g1 = ga/U[bu2,id1,ru2] + g2 = U[b,id2,r]\U[b,id1,r] + + if !TOBC && ( (it == 1) || (it == lp.iL[end]) ) + X = 0.5*cG*(c0*projalg(U[b,id1,r]*g1/U[b,id2,r]) + c1*projalg(U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + + c1*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) + c1*projalg(h1*g1/U[b,id2,r]) + c1*projalg(U[b,id1,r]*g1/h4)) + + frc1[b,id1,r] -= X + frc1[b,id2,r] += X + frc2[bu1,id2,ru1] -= 0.5*cG*c0*projalg(g1*g2) + frc2[bu2,id1,ru2] += 0.5*cG*c0*projalg(g2*g1) + frc2[bu1,id2,ru1] -= 0.5*cG*c1*projalg((g1/U[b,id2,r])*h1) + frc2[bu2,id1,ru2] += 0.5*cG*c1*projalg((U[b,id2,r]\h1)*g1) + frc2[bu2,id1,ru2] += 0.5*cG*c1*projalg(g2*h2/U[bu2,id1,ru2]) + frc2[bu1,id2,ru1] -= 0.5*cG*c1*projalg((ga/h3)*g2) + frc2[bu1,id2,ru1] -= 0.5*cG*c1*projalg((g1/h4)*U[b,id1,r]) + frc2[bu2,id1,ru2] += 0.5*cG*c1*projalg(h4\U[b,id1,r]*g1) + elseif TOBC && (it == lp.iL[end]) + nothing + elseif TOBC && (it == (lp.iL[end]-1) ) + X = (c0*projalg(U[b,id1,r]*g1/U[b,id2,r]) + c1*projalg(U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + + c1*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) + c1*projalg(h1*g1/U[b,id2,r]) + c1*projalg(U[b,id1,r]*g1/h4)) + + frc1[b,id1,r] -= X + frc1[b,id2,r] += X + frc2[bu1,id2,ru1] -= c0*projalg(g1*g2) + frc2[bu2,id1,ru2] += c0*projalg(g2*g1) + frc2[bu1,id2,ru1] -= c1*projalg((g1/U[b,id2,r])*h1) + frc2[bu2,id1,ru2] += c1*projalg((U[b,id2,r]\h1)*g1) + frc2[bu1,id2,ru1] -= c1*projalg((ga/h3)*g2) + frc2[bu1,id2,ru1] -= c1*projalg((g1/h4)*U[b,id1,r]) + frc2[bu2,id1,ru2] += c1*projalg(h4\U[b,id1,r]*g1) + + else + if TWP + X = projalg(c0*ztw,U[b,id1,r]*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= projalg(c0*ztw,g1*g2) + frc2[bu2,id1,ru2] += projalg(c0*ztw,g2*g1) + else + X = c0*projalg(U[b,id1,r]*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= c0*projalg(g1*g2) + frc2[bu2,id1,ru2] += c0*projalg(g2*g1) + end + if TWH1 + frc1[b,id2,r] += projalg(ztw*c1,h1*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= projalg(ztw*c1,(g1/U[b,id2,r])*h1) + frc2[bu2,id1,ru2] += projalg(ztw*c1,(U[b,id2,r]\h1)*g1) + else + frc1[b,id2,r] += c1*projalg(h1*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= c1*projalg((g1/U[b,id2,r])*h1) + frc2[bu2,id1,ru2] += c1*projalg((U[b,id2,r]\h1)*g1) + end + if TWH2 + X += projalg(ztw*c1,U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + frc2[bu2,id1,ru2] += projalg(ztw*c1,g2*h2/U[bu2,id1,ru2]) + else + X += c1*projalg(U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + frc2[bu2,id1,ru2] += c1*projalg(g2*h2/U[bu2,id1,ru2]) + end + if TWH3 + X += projalg(ztw*c1,U[b,id1,r]*ga/(U[b,id2,r]*h3)) + frc2[bu1,id2,ru1] -= projalg(ztw*c1,(ga/h3)*g2) + else + X += c1*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) + frc2[bu1,id2,ru1] -= c1*projalg((ga/h3)*g2) + end + if TWH4 + frc1[b,id1,r] -= projalg(ztw*c1,U[b,id1,r]*g1/h4) + frc2[bu1,id2,ru1] -= projalg(ztw*c1,(g1/h4)*U[b,id1,r]) + frc2[bu2,id1,ru2] += projalg(ztw*c1,h4\U[b,id1,r]*g1) + else + frc1[b,id1,r] -= c1*projalg(U[b,id1,r]*g1/h4) + frc2[bu1,id2,ru1] -= c1*projalg((g1/h4)*U[b,id1,r]) + frc2[bu2,id1,ru2] += c1*projalg(h4\U[b,id1,r]*g1) + end + frc1[b,id1,r] -= X + frc1[b,id2,r] += X + + end + + end + + return nothing +end + + +## +## SF +## +function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, ztw, lp::Union{SpaceParm{N,M,BC_SF_ORBI,D},SpaceParm{N,M,BC_SF_AFWB,D}}) where {T,NB,N,M,D} b = Int64(CUDA.threadIdx().x) r = Int64(CUDA.blockIdx().x) @@ -21,8 +328,8 @@ function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, zt @inbounds begin for id1 in N:-1:1 bu1, ru1 = up((b, r), id1, lp) - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1==N) - + SFBC = (id1==N) + for id2 = 1:id1-1 bu2, ru2 = up((b, r), id2, lp) ipl = ipl + 1 @@ -30,7 +337,7 @@ function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, zt TWP = (I[id1]==1) && (I[id2]==1) TWH2 = TWP || ( (I[id1]==lp.iL[id1]) && (I[id2]==1) ) TWH3 = TWP || ( (I[id1]==1) && (I[id2]==lp.iL[id2]) ) - + # H2 staple (b1, r1) = up((b,r), id1, lp) (b2, r2) = up((b1,r1), id1, lp) @@ -39,14 +346,14 @@ function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, zt else gb = U[b2,id2,r2] end - + (b2, r2) = up((b1,r1), id2, lp) h2 = (U[b1,id1,r1]*gb)/U[b2,id1,r2] - + # H3 staple (b1, r1) = up((b,r), id2, lp) (b2, r2) = up((b1,r1), id2, lp) - + (b3, r3) = up((b1,r1), id1, lp) if SFBC && (it == lp.iL[end]) gc = Ubnd[id2] @@ -55,15 +362,15 @@ function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, zt end h3 = (U[b1,id2,r1]*U[b2,id1,r2])/gc # END staples - + if SFBC && (it == lp.iL[end]) ga = Ubnd[id2] else ga = U[bu1,id2,ru1] end - + g2 = U[b,id2,r]\U[b,id1,r] - + if (it == lp.iL[end]) && SFBC S += cG*(c0*tr(g2*ga/U[bu2,id1,ru2]) + (3*c1/2)*tr(g2*ga/h3)) elseif (it == 1) && SFBC @@ -85,17 +392,17 @@ function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, zt S += c1*tr(g2*ga/h3) end end - + end end - + plx[I] = S end - + return nothing end -function krnl_plaq!(plx, U::AbstractArray{T}, Ubnd, cG, ztw, lp::SpaceParm{N,M,B,D}) where {T,N,M,B,D} +function krnl_plaq!(plx, U::AbstractArray{T}, Ubnd, cG, ztw, lp::Union{SpaceParm{N,M,BC_SF_ORBI,D},SpaceParm{N,M,BC_SF_AFWB,D}}) where {T,N,M,D} @inbounds begin @@ -103,21 +410,20 @@ function krnl_plaq!(plx, U::AbstractArray{T}, Ubnd, cG, ztw, lp::SpaceParm{N,M,B r = Int64(CUDA.blockIdx().x) I = point_coord((b,r), lp) it = I[N] - IBND = ( ( (B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && - ( (it == 1) || (it == lp.iL[end])) ) - + IBND = ( (it == 1) || (it == lp.iL[end])) + S = zero(eltype(plx)) ipl = 0 for id1 in N:-1:1 bu1, ru1 = up((b, r), id1, lp) - SFBND = IBND && (id1 == N) + SFBND = IBND && (id1 == N) for id2 = 1:id1-1 bu2, ru2 = up((b, r), id2, lp) ipl = ipl + 1 TWP = (I[id1]==1) && (I[id2]==1) - - if SFBND && (it == lp.iL[end]) + + if SFBND && (it == lp.iL[end]) gt1 = Ubnd[id2] else gt1 = U[bu1,id2,ru1] @@ -134,46 +440,46 @@ function krnl_plaq!(plx, U::AbstractArray{T}, Ubnd, cG, ztw, lp::SpaceParm{N,M,B end end end - + plx[I] = S end - + return nothing end -function krnl_force_wilson_pln!(frc1, frc2, U::AbstractArray{T}, Ubnd, cG, ztw, ipl, lp::SpaceParm{N,M,B,D}) where {T,N,M,B,D} +function krnl_force_wilson_pln!(frc1, frc2, U::AbstractArray{T}, Ubnd, cG, ztw, ipl, lp::Union{SpaceParm{N,M,BC_SF_ORBI,D},SpaceParm{N,M,BC_SF_AFWB,D}}) where {T,N,M,D} b = Int64(CUDA.threadIdx().x) r = Int64(CUDA.blockIdx().x) I = point_coord((b,r), lp) it = I[N] - + @inbounds begin id1, id2 = lp.plidx[ipl] bu1, ru1 = up((b, r), id1, lp) bu2, ru2 = up((b, r), id2, lp) TWP = (I[id1]==1)&&(I[id2]==1) - - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == N) - + + SFBC = (id1 == N) + if SFBC && (it == lp.iL[end]) gt1 = Ubnd[id2] else gt1 = U[bu1,id2,ru1] end - + g1 = gt1/U[bu2,id1,ru2] g2 = U[b,id2,r]\U[b,id1,r] - + if SFBC && (it == 1) X = cG*projalg(U[b,id1,r]*g1/U[b,id2,r]) - + frc1[b ,id1, r ] -= X frc2[bu1,id2,ru1] -= cG*projalg(g1*g2) frc2[bu2,id1,ru2] += cG*projalg(g2*g1) elseif SFBC && (it == lp.iL[end]) X = cG*projalg(U[b,id1,r]*g1/U[b,id2,r]) - + frc1[b ,id1, r ] -= X frc1[b ,id2, r ] += X frc2[bu2,id1,ru2] += cG*projalg(g2*g1) @@ -191,29 +497,29 @@ function krnl_force_wilson_pln!(frc1, frc2, U::AbstractArray{T}, Ubnd, cG, ztw, frc1[b ,id2, r ] += X end end - + return nothing end -function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, ztw, ipl, lp::SpaceParm{N,M,B,D}) where {T,N,M,B,D} +function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, ztw, ipl, lp::Union{SpaceParm{N,M,BC_SF_ORBI,D},SpaceParm{N,M,BC_SF_AFWB,D}}) where {T,N,M,D} b = Int64(CUDA.threadIdx().x) r = Int64(CUDA.blockIdx().x) I = point_coord((b,r), lp) it = I[N] - + @inbounds begin id1, id2 = lp.plidx[ipl] bu1, ru1 = up((b, r), id1, lp) bu2, ru2 = up((b, r), id2, lp) - - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == N) + + SFBC = (id1 == N) TWP = (I[id1]==1) && (I[id2]==1) TWH1 = TWP || ( (I[id1]==1) && (I[id2]==2) ) TWH2 = TWP || ( (I[id1]==lp.iL[id1]) && (I[id2]==1) ) TWH3 = TWP || ( (I[id1]==1) && (I[id2]==lp.iL[id2]) ) TWH4 = TWP || ( (I[id1]==2) && (I[id2]==1) ) - + # H1 staple (b1, r1) = dw((b,r), id2, lp) (b2, r2) = up((b1,r1), id1, lp) @@ -223,7 +529,7 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, gc = U[b2,id2,r2] end h1 = (U[b1,id2,r1]\U[b1,id1,r1])*gc - + # H2 staple (b1, r1) = up((b,r), id1, lp) (b2, r2) = up((b1,r1), id1, lp) @@ -232,10 +538,10 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, else gb = U[b2,id2,r2] end - + (b2, r2) = up((b1,r1), id2, lp) h2 = (U[b1,id1,r1]*gb)/U[b2,id1,r2] - + # H3 staple (b1, r1) = up((b,r), id2, lp) (b2, r2) = up((b1,r1), id2, lp) @@ -246,42 +552,42 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, gc = U[b3,id2,r3] end h3 = (U[b1,id2,r1]*U[b2,id1,r2])/gc - + # H4 staple (b1, r1) = dw((b,r), id1, lp) (b2, r2) = up((b1,r1), id2, lp) h4 = (U[b1,id1,r1]\U[b1,id2,r1])*U[b2,id1,r2] # END staples - + if SFBC && (it == lp.iL[end]) ga = Ubnd[id2] else ga = U[bu1,id2,ru1] end - + g1 = ga/U[bu2,id1,ru2] g2 = U[b,id2,r]\U[b,id1,r] - + if SFBC && (it == 1) X = (cG*c0)*projalg(U[b,id1,r]*g1/U[b,id2,r]) + c1*projalg(U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + - (3*c1*cG/2)*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) - + (3*c1*cG/2)*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) + frc1[b,id1,r] -= X - + frc2[bu1,id2,ru1] -= (cG*c0)*projalg(g1*g2) + (3*c1*cG/2)*projalg((ga/h3)*g2) + (3*c1*cG/2)*projalg((g1/U[b,id2,r])*h1) - + frc2[bu2,id1,ru2] += (cG*c0)*projalg(g2*g1) + (3*c1*cG/2) * projalg((U[b,id2,r]\h1)*g1) + - c1*projalg(g2*h2/U[bu2,id1,ru2]) + c1*projalg(g2*h2/U[bu2,id1,ru2]) elseif SFBC && (it == lp.iL[end]) X = (cG*c0)*projalg(U[b,id1,r]*g1/U[b,id2,r]) + - (3*c1*cG/2) * (projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3))) - - frc1[b,id1,r] -= X + c1*projalg(U[b,id1,r]*g1/h4) - frc1[b,id2,r] += X + (3*c1*cG/2)*projalg(h1*g1/U[b,id2,r]) - + (3*c1*cG/2) * (projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3))) + + frc1[b,id1,r] -= X + c1*projalg(U[b,id1,r]*g1/h4) + frc1[b,id2,r] += X + (3*c1*cG/2)*projalg(h1*g1/U[b,id2,r]) + frc2[bu2,id1,ru2] += (cG*c0)*projalg(g2*g1) + (3*c1*cG/2) * projalg((U[b,id2,r]\h1)*g1) + - c1 * projalg(h4\U[b,id1,r]*g1) + c1 * projalg(h4\U[b,id1,r]*g1) else if TWP X = projalg(c0*ztw,U[b,id1,r]*g1/U[b,id2,r]) @@ -294,11 +600,11 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, end if TWH1 frc1[b,id2,r] += projalg(ztw*c1,h1*g1/U[b,id2,r]) - frc2[bu1,id2,ru1] -= projalg(ztw*c1,(g1/U[b,id2,r])*h1) + frc2[bu1,id2,ru1] -= projalg(ztw*c1,(g1/U[b,id2,r])*h1) frc2[bu2,id1,ru2] += projalg(ztw*c1,(U[b,id2,r]\h1)*g1) else frc1[b,id2,r] += c1*projalg(h1*g1/U[b,id2,r]) - frc2[bu1,id2,ru1] -= c1*projalg((g1/U[b,id2,r])*h1) + frc2[bu1,id2,ru1] -= c1*projalg((g1/U[b,id2,r])*h1) frc2[bu2,id1,ru2] += c1*projalg((U[b,id2,r]\h1)*g1) end if TWH2 @@ -310,27 +616,274 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, end if TWH3 X += projalg(ztw*c1,U[b,id1,r]*ga/(U[b,id2,r]*h3)) - frc2[bu1,id2,ru1] -= projalg(ztw*c1,(ga/h3)*g2) + frc2[bu1,id2,ru1] -= projalg(ztw*c1,(ga/h3)*g2) else X += c1*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) - frc2[bu1,id2,ru1] -= c1*projalg((ga/h3)*g2) + frc2[bu1,id2,ru1] -= c1*projalg((ga/h3)*g2) end if TWH4 - frc1[b,id1,r] -= projalg(ztw*c1,U[b,id1,r]*g1/h4) + frc1[b,id1,r] -= projalg(ztw*c1,U[b,id1,r]*g1/h4) frc2[bu1,id2,ru1] -= projalg(ztw*c1,(g1/h4)*U[b,id1,r]) - frc2[bu2,id1,ru2] += projalg(ztw*c1,h4\U[b,id1,r]*g1) + frc2[bu2,id1,ru2] += projalg(ztw*c1,h4\U[b,id1,r]*g1) else - frc1[b,id1,r] -= c1*projalg(U[b,id1,r]*g1/h4) + frc1[b,id1,r] -= c1*projalg(U[b,id1,r]*g1/h4) frc2[bu1,id2,ru1] -= c1*projalg((g1/h4)*U[b,id1,r]) - frc2[bu2,id1,ru2] += c1*projalg(h4\U[b,id1,r]*g1) + frc2[bu2,id1,ru2] += c1*projalg(h4\U[b,id1,r]*g1) + end + frc1[b,id1,r] -= X + frc1[b,id2,r] += X + + end + + end + + return nothing +end + + + +## +## PERIODIC +## +function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, ztw, lp::SpaceParm{N,M,BC_PERIODIC,D}) where {T,NB,N,M,D} + + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + ipl = 0 + S = zero(eltype(plx)) + @inbounds begin + for id1 in N:-1:1 + bu1, ru1 = up((b, r), id1, lp) + + for id2 = 1:id1-1 + bu2, ru2 = up((b, r), id2, lp) + ipl = ipl + 1 + + TWP = (I[id1]==1) && (I[id2]==1) + TWH2 = TWP || ( (I[id1]==lp.iL[id1]) && (I[id2]==1) ) + TWH3 = TWP || ( (I[id1]==1) && (I[id2]==lp.iL[id2]) ) + + # H2 staple + (b1, r1) = up((b,r), id1, lp) + (b2, r2) = up((b1,r1), id1, lp) + gb = U[b2,id2,r2] + + (b2, r2) = up((b1,r1), id2, lp) + h2 = (U[b1,id1,r1]*gb)/U[b2,id1,r2] + + # H3 staple + (b1, r1) = up((b,r), id2, lp) + (b2, r2) = up((b1,r1), id2, lp) + + (b3, r3) = up((b1,r1), id1, lp) + + gc = U[b3,id2,r3] + + h3 = (U[b1,id2,r1]*U[b2,id1,r2])/gc + # END staples + + ga = U[bu1,id2,ru1] + + g2 = U[b,id2,r]\U[b,id1,r] + + if TWP + S += (ztw[ipl]*c0)*tr(g2*ga/U[bu2,id1,ru2]) + else + S += c0*tr(g2*ga/U[bu2,id1,ru2]) + end + if TWH2 + S += (ztw[ipl]*c1)*tr(g2*h2/U[bu2,id1,ru2]) + else + S += c1*tr(g2*h2/U[bu2,id1,ru2]) + end + if TWH3 + S += (ztw[ipl]*c1)*tr(g2*ga/h3) + else + S += c1*tr(g2*ga/h3) + end + end - frc1[b,id1,r] -= X - frc1[b,id2,r] += X - end + plx[I] = S end + + return nothing +end + +function krnl_plaq!(plx, U::AbstractArray{T}, Ubnd, cG, ztw, lp::SpaceParm{N,M,BC_PERIODIC,D}) where {T,N,M,D} + + + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + S = zero(eltype(plx)) + ipl = 0 + for id1 in N:-1:1 + bu1, ru1 = up((b, r), id1, lp) + + for id2 = 1:id1-1 + bu2, ru2 = up((b, r), id2, lp) + ipl = ipl + 1 + TWP = (I[id1]==1) && (I[id2]==1) + + gt1 = U[bu1,id2,ru1] + + if TWP + S += ztw[ipl]*tr(U[b,id1,r]*gt1 / (U[b,id2,r]*U[bu2,id1,ru2])) + else + S += tr(U[b,id1,r]*gt1 / (U[b,id2,r]*U[bu2,id1,ru2])) + end + end + end + plx[I] = S + end + + return nothing +end + +function krnl_force_wilson_pln!(frc1, frc2, U::AbstractArray{T}, Ubnd, cG, ztw, ipl, lp::SpaceParm{N,M,BC_PERIODIC,D}) where {T,N,M,D} + + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + @inbounds begin + id1, id2 = lp.plidx[ipl] + bu1, ru1 = up((b, r), id1, lp) + bu2, ru2 = up((b, r), id2, lp) + TWP = (I[id1]==1)&&(I[id2]==1) + + gt1 = U[bu1,id2,ru1] + + g1 = gt1/U[bu2,id1,ru2] + g2 = U[b,id2,r]\U[b,id1,r] + + if TWP + X = projalg(ztw,U[b,id1,r]*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= projalg(ztw,g1*g2) + frc2[bu2,id1,ru2] += projalg(ztw,g2*g1) + else + X = projalg(U[b,id1,r]*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= projalg(g1*g2) + frc2[bu2,id1,ru2] += projalg(g2*g1) + end + frc1[b ,id1, r ] -= X + frc1[b ,id2, r ] += X + end + + return nothing +end + +function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, ztw, ipl, lp::SpaceParm{N,M,BC_PERIODIC,D}) where {T,N,M,D} + + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + @inbounds begin + id1, id2 = lp.plidx[ipl] + bu1, ru1 = up((b, r), id1, lp) + bu2, ru2 = up((b, r), id2, lp) + + TWP = (I[id1]==1) && (I[id2]==1) + TWH1 = TWP || ( (I[id1]==1) && (I[id2]==2) ) + TWH2 = TWP || ( (I[id1]==lp.iL[id1]) && (I[id2]==1) ) + TWH3 = TWP || ( (I[id1]==1) && (I[id2]==lp.iL[id2]) ) + TWH4 = TWP || ( (I[id1]==2) && (I[id2]==1) ) + + # H1 staple + (b1, r1) = dw((b,r), id2, lp) + (b2, r2) = up((b1,r1), id1, lp) + + gc = U[b2,id2,r2] + + h1 = (U[b1,id2,r1]\U[b1,id1,r1])*gc + + # H2 staple + (b1, r1) = up((b,r), id1, lp) + (b2, r2) = up((b1,r1), id1, lp) + + gb = U[b2,id2,r2] + + (b2, r2) = up((b1,r1), id2, lp) + h2 = (U[b1,id1,r1]*gb)/U[b2,id1,r2] + + # H3 staple + (b1, r1) = up((b,r), id2, lp) + (b2, r2) = up((b1,r1), id2, lp) + (b3, r3) = up((b1,r1), id1, lp) + + gc = U[b3,id2,r3] + h3 = (U[b1,id2,r1]*U[b2,id1,r2])/gc + + # H4 staple + (b1, r1) = dw((b,r), id1, lp) + (b2, r2) = up((b1,r1), id2, lp) + h4 = (U[b1,id1,r1]\U[b1,id2,r1])*U[b2,id1,r2] + # END staples + + ga = U[bu1,id2,ru1] + + g1 = ga/U[bu2,id1,ru2] + g2 = U[b,id2,r]\U[b,id1,r] + + if TWP + X = projalg(c0*ztw,U[b,id1,r]*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= projalg(c0*ztw,g1*g2) + frc2[bu2,id1,ru2] += projalg(c0*ztw,g2*g1) + else + X = c0*projalg(U[b,id1,r]*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= c0*projalg(g1*g2) + frc2[bu2,id1,ru2] += c0*projalg(g2*g1) + end + if TWH1 + frc1[b,id2,r] += projalg(ztw*c1,h1*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= projalg(ztw*c1,(g1/U[b,id2,r])*h1) + frc2[bu2,id1,ru2] += projalg(ztw*c1,(U[b,id2,r]\h1)*g1) + else + frc1[b,id2,r] += c1*projalg(h1*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= c1*projalg((g1/U[b,id2,r])*h1) + frc2[bu2,id1,ru2] += c1*projalg((U[b,id2,r]\h1)*g1) + end + if TWH2 + X += projalg(ztw*c1,U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + frc2[bu2,id1,ru2] += projalg(ztw*c1,g2*h2/U[bu2,id1,ru2]) + else + X += c1*projalg(U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + frc2[bu2,id1,ru2] += c1*projalg(g2*h2/U[bu2,id1,ru2]) + end + if TWH3 + X += projalg(ztw*c1,U[b,id1,r]*ga/(U[b,id2,r]*h3)) + frc2[bu1,id2,ru1] -= projalg(ztw*c1,(ga/h3)*g2) + else + X += c1*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) + frc2[bu1,id2,ru1] -= c1*projalg((ga/h3)*g2) + end + if TWH4 + frc1[b,id1,r] -= projalg(ztw*c1,U[b,id1,r]*g1/h4) + frc2[bu1,id2,ru1] -= projalg(ztw*c1,(g1/h4)*U[b,id1,r]) + frc2[bu2,id1,ru2] += projalg(ztw*c1,h4\U[b,id1,r]*g1) + else + frc1[b,id1,r] -= c1*projalg(U[b,id1,r]*g1/h4) + frc2[bu1,id2,ru1] -= c1*projalg((g1/h4)*U[b,id1,r]) + frc2[bu2,id1,ru2] += c1*projalg(h4\U[b,id1,r]*g1) + end + frc1[b,id1,r] -= X + frc1[b,id2,r] += X + + + end + return nothing end @@ -388,4 +941,3 @@ function force_pln!(frc1, ftmp, U, Ubnd, cG, ztw, lp::SpaceParm, c0=1) return nothing end - diff --git a/src/YM/YMflow.jl b/src/YM/YMflow.jl index 370c113..4f841c3 100644 --- a/src/YM/YMflow.jl +++ b/src/YM/YMflow.jl @@ -134,7 +134,8 @@ function krnl_add_zth!(frc, frc2::AbstractArray{TA}, U::AbstractArray{TG}, lp::S r = Int64(CUDA.blockIdx().x) it = point_time((b, r), lp) - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) + SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) + OBC = (B == BC_OPEN) @inbounds for id in 1:N bu, ru = up((b,r), id, lp) @@ -152,13 +153,21 @@ function krnl_add_zth!(frc, frc2::AbstractArray{TA}, U::AbstractArray{TG}, lp::S frc2[b,id,r] = (5/6)*frc[b,id,r] + (1/6)*(projalg(Ud\Y*Ud) + projalg(U[b,id,r]*X/U[b,id,r])) end - else + end + if OBC + if (it > 1) && (it < lp.iL[end]) + frc2[b,id,r] = (5/6)*frc[b,id,r] + (1/6)*(projalg(Ud\Y*Ud) + + projalg(U[b,id,r]*X/U[b,id,r])) + elseif ((it == lp.iL[end]) || (it == 1)) && (id < N) + frc2[b,id,r] = (5/6)*frc[b,id,r] + (1/6)*(projalg(Ud\Y*Ud) + + projalg(U[b,id,r]*X/U[b,id,r])) + end + else frc2[b,id,r] = (5/6)*frc[b,id,r] + (1/6)*(projalg(Ud\Y*Ud) + projalg(U[b,id,r]*X/U[b,id,r])) end end end - return nothing end @@ -264,7 +273,8 @@ function Eoft_plaq(Eslc, U, gp::GaugeParm{T,G,NN}, lp::SpaceParm{N,M,B,D}, ymws: @timeit "E(t) plaquette measurement" begin ztw = ztwist(gp, lp) - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) + SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) + OBC = (B == BC_OPEN) tp = ntuple(i->i, N-1) V3 = prod(lp.iL[1:end-1]) @@ -285,6 +295,10 @@ function Eoft_plaq(Eslc, U, gp::GaugeParm{T,G,NN}, lp::SpaceParm{N,M,B,D}, ymws: if !SFBC Eslc[1,ipl] = Etmp[1] + Etmp[end] end + if OBC ## Check normalization of timelike boundary plaquettes + Eslc[end,ipl] = Etmp[end-1] + Eslc[1,ipl] = Etmp[1] + end else for it in 1:lp.iL[end] Eslc[it,ipl] = 2*Etmp[it] @@ -327,7 +341,6 @@ function krnl_plaq_pln!(plx, U::AbstractArray{T}, Ubnd, ztw, ipl, lp::SpaceParm{ plx[I] = tr(U[b,id1,r]*gt / (U[b,id2,r]*U[bu2,id1,ru2])) end end - return nothing end @@ -350,21 +363,18 @@ function Qtop(Qslc, U, gp::GaugeParm, lp::SpaceParm{4,M,B,D}, ymws::YMworkspace) CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_add_qd!(ymws.rm, -, ymws.frc1, ymws.frc2, lp) end - CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_field_tensor!(ymws.frc1, ymws.frc2, U, gp.Ubnd, 2,4, ztw[2], ztw[4], lp) end CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_add_qd!(ymws.rm, +, ymws.frc1, ymws.frc2, lp) end - CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_field_tensor!(ymws.frc1, ymws.frc2, U, gp.Ubnd, 3,6, ztw[3], ztw[6], lp) end CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_add_qd!(ymws.rm, -, ymws.frc1, ymws.frc2, lp) end - Qslc .= reshape(Array(CUDA.reduce(+, ymws.rm; dims=tp)),lp.iL[end])./(32*pi^2) end @@ -445,7 +455,7 @@ function krnl_add_et!(rm, frc1, lp::SpaceParm{4,M,B,D}) where {M,B,D} I = point_coord((b,r), lp) rm[I] = dot(X1,X1) end - + return nothing end @@ -474,6 +484,7 @@ function krnl_field_tensor!(frc1::AbstractArray{TA}, frc2, U::AbstractArray{T}, #First plane id1, id2 = lp.plidx[ipl1] SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == 4) + OBC = ((B == BC_OPEN) && (id1 == 4)) TWP = ((I[id1]==1)&&(I[id2]==1)) bu1, ru1 = up((b, r), id1, lp) @@ -493,6 +504,11 @@ function krnl_field_tensor!(frc1::AbstractArray{TA}, frc2, U::AbstractArray{T}, frc1[bu1,2,ru1] = zero(TA) frc1[bd,3,rd] = zero(TA) frc1[bu2,4,ru2] = projalg(l2*l1) + elseif OBC && (it == lp.iL[end]) + frc1[b,1,r] = projalg(U[b,id1,r]*l1/U[b,id2,r]) + frc1[bu1,2,ru1] = zero(TA) + frc1[bd,3,rd] = zero(TA) + frc1[bu2,4,ru2] = projalg(l2*l1) else if TWP frc1[b,1,r] = projalg(ztw1, U[b,id1,r]*l1/U[b,id2,r]) @@ -510,6 +526,7 @@ function krnl_field_tensor!(frc1::AbstractArray{TA}, frc2, U::AbstractArray{T}, # Second plane id1, id2 = lp.plidx[ipl2] SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == 4) + OBC = ((B == BC_OPEN) && (id1 == 4)) TWP = ((I[id1]==1)&&(I[id2]==1)) bu1, ru1 = up((b, r), id1, lp) @@ -529,6 +546,11 @@ function krnl_field_tensor!(frc1::AbstractArray{TA}, frc2, U::AbstractArray{T}, frc2[bu1,2,ru1] = zero(TA) frc2[bd,3,rd] = zero(TA) frc2[bu2,4,ru2] = projalg(l2*l1) + elseif OBC && (it == lp.iL[end]) + frc1[b,1,r] = projalg(U[b,id1,r]*l1/U[b,id2,r]) + frc1[bu1,2,ru1] = zero(TA) + frc1[bd,3,rd] = zero(TA) + frc1[bu2,4,ru2] = projalg(l2*l1) else if TWP frc2[b,1,r] = projalg(ztw2, U[b,id1,r]*l1/U[b,id2,r]) @@ -543,7 +565,5 @@ function krnl_field_tensor!(frc1::AbstractArray{TA}, frc2, U::AbstractArray{T}, end end end - return nothing end - From 182fa82d1348e9a2e308129cb6660d452ac8aca7 Mon Sep 17 00:00:00 2001 From: "Fernando P.Panadero" Date: Fri, 17 May 2024 12:10:42 +0200 Subject: [PATCH 02/14] OBC for fermions. --- src/Dirac/Dirac.jl | 595 +---------------------------------- src/Dirac/Diracfields.jl | 185 +++++++++++ src/Dirac/Diracflow.jl | 189 ++++++----- src/Dirac/Diracoper.jl | 664 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 966 insertions(+), 667 deletions(-) create mode 100644 src/Dirac/Diracfields.jl create mode 100644 src/Dirac/Diracoper.jl diff --git a/src/Dirac/Dirac.jl b/src/Dirac/Dirac.jl index c378419..c11dc74 100644 --- a/src/Dirac/Dirac.jl +++ b/src/Dirac/Dirac.jl @@ -105,500 +105,6 @@ struct DiracWorkspace{T} end -export DiracWorkspace, DiracParam - - -""" - function Csw!(dws, U, gp, lp::SpaceParm) - -Computes the clover and stores it in dws.csw. - -""" -function Csw!(dws, U, gp, lp::SpaceParm{4,6,B,D}) where {B,D} - - @timeit "Csw computation" begin - - for i in 1:Int(lp.npls) - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_csw!(dws.csw, U, gp.Ubnd, i, lp) - end - end - end - - return nothing -end - -function krnl_csw!(csw::AbstractArray{T}, U, Ubnd, ipl, lp::SpaceParm{4,M,B,D}) where {T,M,B,D} - - @inbounds begin - b = Int64(CUDA.threadIdx().x) - r = Int64(CUDA.blockIdx().x) - I = point_coord((b,r), lp) - it = I[4] - - id1, id2 = lp.plidx[ipl] - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == 4) - - bu1, ru1 = up((b, r), id1, lp) - bu2, ru2 = up((b, r), id2, lp) - bd1, rd1 = dw((b, r), id1, lp) - bd2, rd2 = dw((b, r), id2, lp) - bdd, rdd = dw((bd1, rd1), id2, lp) - bud, rud = dw((bu1, ru1), id2, lp) - bdu, rdu = up((bd1, rd1), id2, lp) - - if SFBC && (it == lp.iL[end]) - gt1 = Ubnd[id2] - gt2 = Ubnd[id2] - else - gt1 = U[bu1,id2,ru1] - gt2 = U[bud,id2,rud] - end - - M1 = U[b,id1,r]*gt1/(U[b,id2,r]*U[bu2,id1,ru2]) - M2 = (U[bd2,id2,rd2]\(U[bd2,id1,rd2]*gt2))/U[b,id1,r] - M3 = (U[bdd,id2,rdd]*U[bd1,id1,rd1])\(U[bdd,id1,rdd]*U[bd2,id2,rd2]) - M4 = (U[b,id2,r]/(U[bd1,id2,rd1]*U[bdu,id1,rdu]))*U[bd1,id1,rd1] - - - if !(SFBC && (it == 1)) - csw[b,ipl,r] = 0.125*(antsym(M1)+antsym(M2)+antsym(M3)+antsym(M4)) - end - - end - - return nothing -end - - -""" - function Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) - -Computes the Dirac operator (with the Wilson term) `\`\``D_w``\`\` with gauge field U and parameters `dpar` of the field `si` and stores it in `so`. -If `dpar.csw` is different from zero, the clover term should be stored in `dws.csw` via the Csw! function and is automatically included in the operator. - -""" -function Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) where {B,D} - - if abs(dpar.csw) > 1.0E-10 - @timeit "Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) - end - end - else - @timeit "Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, lp) - end - end - end - - return nothing -end - -function krnl_Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, lp::SpaceParm{4,6,B,D}) where {B,D} - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - @inbounds begin - - so[b,r] = (4+m0)*si[b,r]+ im*tm*dmul(Gamma{5},si[b,r]) + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) - +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) - - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - - end - - return nothing -end - -function krnl_Dw!(so, U, si, m0, tm, th, lp::SpaceParm{4,6,B,D}) where {B,D} - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - @inbounds begin - - so[b,r] = (4+m0)*si[b,r] + im*tm*dmul(Gamma{5},si[b,r]) - - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - - end - - return nothing -end - -function Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - - if abs(dpar.csw) > 1.0E-10 - @timeit "Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) - end - end - else - @timeit "Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) - end - end - end - - return nothing -end - -function krnl_Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, ct, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - - # The field si is assumed to be zero at t = 0 - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - if (point_time((b,r),lp) != 1) - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - @inbounds begin - - so[b,r] = (4+m0)*si[b,r] + im*tm*dmul(Gamma{5},si[b,r]) + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) - +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) - - - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - - if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == lp.iL[4]) - so[b,r] += (ct-1.0)*si[b,r] - end - end - end - - return nothing -end - -function krnl_Dw!(so, U, si, m0, tm, th, ct, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - - # The field si is assumed to be zero at t = 0 - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - if (point_time((b,r),lp) != 1) - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - @inbounds begin - - so[b,r] = (4+m0)*si[b,r] + im*tm*dmul(Gamma{5},si[b,r]) - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - - if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == lp.iL[4]) - so[b,r] += (ct-1.0)*si[b,r] - end - end - end - - return nothing -end - -""" - function g5Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) - -Computes \`\` \\gamma_5 \`\` times the Dirac operator (with the Wilson term) with gauge field U and parameters `dpar` of the field `si` and stores it in `so`. -If `dpar.csw` is different from zero, the clover term should be stored in `dws.csw` via the Csw! function and is automatically included in the operator. -""" -function g5Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) where {B,D} - - if abs(dpar.csw) > 1.0E-10 - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) - end - end - else - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, lp) - end - end - end - - return nothing -end - -function krnl_g5Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, lp::SpaceParm{4,6,B,D}) where {B,D} - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - @inbounds begin - - so[b,r] = (4+m0)*si[b,r] + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) - +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) - - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - - so[b,r] = dmul(Gamma{5}, so[b,r])+ im*tm*si[b,r] - end - - return nothing -end - -function krnl_g5Dw!(so, U, si, m0, tm, th, lp::SpaceParm{4,6,B,D}) where {B,D} - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - @inbounds begin - - so[b,r] = (4+m0)*si[b,r] - - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - - so[b,r] = dmul(Gamma{5}, so[b,r]) + im*tm*si[b,r] - end - - return nothing -end - -function g5Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - - if abs(dpar.csw) > 1.0E-10 - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) - end - end - else - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) - end - end - end - - return nothing -end - -function krnl_g5Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, ct, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - - # The field si is assumed to be zero at t = 0 - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - if (point_time((b,r),lp) != 1) - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - @inbounds begin - - so[b,r] = (4+m0)*si[b,r] + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) - +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) - - - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - - if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == lp.iL[4]) - so[b,r] += (ct-1.0)*si[b,r] - end - end - end - - so[b,r] = dmul(Gamma{5}, so[b,r])+ im*tm*si[b,r] - - return nothing -end - -function krnl_g5Dw!(so, U, si, m0, tm, th, ct, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - - # The field si is assumed to be zero at t = 0 - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - if (point_time((b,r),lp) != 1) - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - @inbounds begin - - so[b,r] = (4+m0)*si[b,r] - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - - if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == lp.iL[4]) - so[b,r] += (ct-1.0)*si[b,r] - end - end - end - - so[b,r] = dmul(Gamma{5}, so[b,r]) + im*tm*si[b,r] - - return nothing -end - -""" - function DwdagDw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) - -Applies the operator \`\` \\gamma_5 D_w \`\` twice to `si` and stores the result in `so`. This is equivalent to appling the operator \`\` D_w^\\dagger D_w \`\` -The Dirac operator is the same as in the functions `Dw!` and `g5Dw!` -""" -function DwdagDw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - - if abs(dpar.csw) > 1.0E-10 - @timeit "DwdagDw" begin - - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(dws.st, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) - end - end - SF_bndfix!(dws.st,lp) - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, dws.st, dws.csw, dpar.m0, -dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) - end - end - SF_bndfix!(so,lp) - end - else - @timeit "DwdagDw" begin - - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(dws.st, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) - end - end - SF_bndfix!(dws.st,lp) - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, dws.st, dpar.m0, -dpar.tm, dpar.th, dpar.ct, lp) - end - end - SF_bndfix!(so,lp) - end - end - - return nothing -end - -function DwdagDw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) where {B,D} - - if abs(dpar.csw) > 1.0E-10 - @timeit "DwdagDw" begin - - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(dws.st, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) - end - end - - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, dws.st, dws.csw, dpar.m0, -dpar.tm, dpar.th, dpar.csw, lp) - end - end - end - else - @timeit "DwdagDw" begin - - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(dws.st, U, si, dpar.m0, dpar.tm, dpar.th, lp) - end - end - - @timeit "g5Dw" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, dws.st, dpar.m0, -dpar.tm, dpar.th, lp) - end - end - end - end - - return nothing -end """ function mtwmdpar(dpar::DiracParam) @@ -610,108 +116,19 @@ function mtwmdpar(dpar::DiracParam{P,R}) where {P,R} end -""" - SF_bndfix!(sp, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) +export DiracWorkspace, DiracParam, mtwmdpar -Sets all the values of `sp` in the first time slice to zero. -""" -function SF_bndfix!(sp, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - @timeit "SF boundary fix" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_sfbndfix!(sp, lp) - end - end - return nothing -end - -function krnl_sfbndfix!(sp,lp::SpaceParm) - b=Int64(CUDA.threadIdx().x) - r=Int64(CUDA.blockIdx().x) - - if (point_time((b,r),lp) == 1) - sp[b,r] = 0.0*sp[b,r] - end - return nothing -end - - -""" - function pfrandomize!(f::AbstractArray{Spinor{4, SU3fund / SU2fund {T}}}, lp::SpaceParm, t::Int64 = 0) - -Randomizes the SU2fund / SU3fund fermion field. If the argument t is present, it only randomizes that time-slice. -""" -function pfrandomize!(f::AbstractArray{Spinor{4, SU3fund{T}}}, lp::SpaceParm, t::Int64 = 0) where {T} - - @timeit "Randomize pseudofermion field" begin - p = ntuple(i->CUDA.randn(T, lp.bsz, 3, lp.rsz,2),4) # complex generation not suported for Julia 1.5.4 - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_assign_pf_su3!(f,p,lp,t) - end - end - - return nothing -end - -function krnl_assign_pf_su3!(f::AbstractArray, p , lp::SpaceParm, t::Int64) - - @inbounds begin - b = Int64(CUDA.threadIdx().x) - r = Int64(CUDA.blockIdx().x) - - if t == 0 - f[b,r] = Spinor(map(x->SU3fund(x[b,1,r,1] + im* x[b,1,r,2], - x[b,2,r,1] + im* x[b,2,r,2], - x[b,3,r,1] + im* x[b,3,r,2]),p)) - elseif point_time((b,r),lp) == t - f[b,r] = Spinor(map(x->SU3fund(x[b,1,r,1] + im* x[b,1,r,2], - x[b,2,r,1] + im* x[b,2,r,2], - x[b,3,r,1] + im* x[b,3,r,2]),p)) - end - - end - - return nothing -end - -function pfrandomize!(f::AbstractArray{Spinor{4, SU2fund{T}}},lp::SpaceParm, t::Int64=0) where {T} - - @timeit "Randomize pseudofermion field" begin - p = ntuple(i->CUDA.randn(T, lp.bsz, 2, lp.rsz,2),4) # complex generation not suported for Julia 1.5.4 - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_assign_pf_su2!(f,p,lp,t) - end - end - - return nothing -end - -function krnl_assign_pf_su2!(f::AbstractArray, p , lp::SpaceParm, t::Int64) - - @inbounds begin - b = Int64(CUDA.threadIdx().x) - r = Int64(CUDA.blockIdx().x) - - if t == 0 - f[b,r] = Spinor(map(x->SU2fund(x[b,1,r,1] + im* x[b,1,r,2], - x[b,2,r,1] + im* x[b,2,r,2]),p)) - elseif point_time((b,r),lp) == t - f[b,r] = Spinor(map(x->SU2fund(x[b,1,r,1] + im* x[b,1,r,2], - x[b,2,r,1] + im* x[b,2,r,2]),p)) - end - - end - - return nothing -end - -export Dw!, g5Dw!, DwdagDw!, SF_bndfix!, Csw!, pfrandomize!, mtwmdpar +include("Diracfields.jl") +export SF_bndfix!, Csw!, pfrandomize! +include("Diracoper.jl") +export Dw!, g5Dw!, DwdagDw! include("DiracIO.jl") export read_prop, save_prop, read_dpar include("Diracflow.jl") -export Dslash_sq!, flw, backflow +export Nablanabla!, Dslash_sq!, flw, backflow end diff --git a/src/Dirac/Diracfields.jl b/src/Dirac/Diracfields.jl new file mode 100644 index 0000000..f7e3edd --- /dev/null +++ b/src/Dirac/Diracfields.jl @@ -0,0 +1,185 @@ + + + +""" + function Csw!(dws, U, gp, lp::SpaceParm) + +Computes the clover and stores it in dws.csw. + +""" +function Csw!(dws, U, gp, lp::SpaceParm{4,6,B,D}) where {B,D} + + @timeit "Csw computation" begin + + for i in 1:Int(lp.npls) + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_csw!(dws.csw, U, gp.Ubnd, i, lp) + end + end + end + + return nothing +end + +function krnl_csw!(csw::AbstractArray{T}, U, Ubnd, ipl, lp::SpaceParm{4,M,B,D}) where {T,M,B,D} + + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[4] + + id1, id2 = lp.plidx[ipl] + SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == 4) + OBC = (B == BC_OPEN) && ((it == 1) || (it == lp.iL[end])) + + bu1, ru1 = up((b, r), id1, lp) + bu2, ru2 = up((b, r), id2, lp) + bd1, rd1 = dw((b, r), id1, lp) + bd2, rd2 = dw((b, r), id2, lp) + bdd, rdd = dw((bd1, rd1), id2, lp) + bud, rud = dw((bu1, ru1), id2, lp) + bdu, rdu = up((bd1, rd1), id2, lp) + + if SFBC && (it == lp.iL[end]) + gt1 = Ubnd[id2] + gt2 = Ubnd[id2] + else + gt1 = U[bu1,id2,ru1] + gt2 = U[bud,id2,rud] + end + + M1 = U[b,id1,r]*gt1/(U[b,id2,r]*U[bu2,id1,ru2]) + M2 = (U[bd2,id2,rd2]\(U[bd2,id1,rd2]*gt2))/U[b,id1,r] + M3 = (U[bdd,id2,rdd]*U[bd1,id1,rd1])\(U[bdd,id1,rdd]*U[bd2,id2,rd2]) + M4 = (U[b,id2,r]/(U[bd1,id2,rd1]*U[bdu,id1,rdu]))*U[bd1,id1,rd1] + + + if !(SFBC && (it == 1)) && !OBC + csw[b,ipl,r] = 0.125*(antsym(M1)+antsym(M2)+antsym(M3)+antsym(M4)) + end + + end + + return nothing +end + + + +""" + SF_bndfix!(sp, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) + +Sets all the values of `sp` in the first time slice to zero. +""" +function SF_bndfix!(sp, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + @timeit "SF boundary fix" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_sfbndfix!(sp, lp) + end + end + return nothing +end + +function krnl_sfbndfix!(sp,lp::SpaceParm) + b=Int64(CUDA.threadIdx().x) + r=Int64(CUDA.blockIdx().x) + + if (point_time((b,r),lp) == 1) + sp[b,r] = 0.0*sp[b,r] + end + return nothing +end + +""" + SF_bndfix!(sp, lp::SpaceParm{4,6,BC_OPEN,D}) + +Sets all the values of `sp` in the first and last time slice to zero. +""" +function SF_bndfix!(sp, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + @timeit "SF boundary fix" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_opbndfix!(sp, lp) + end + end + return nothing +end + +function krnl_opbndfix!(sp,lp::SpaceParm) + b=Int64(CUDA.threadIdx().x) + r=Int64(CUDA.blockIdx().x) + + if ((point_time((b,r),lp) == 1) || (point_time((b,r),lp) == lp.iL[end])) + sp[b,r] = 0.0*sp[b,r] + end + return nothing +end + + +""" + function pfrandomize!(f::AbstractArray{Spinor{4, SU3fund / SU2fund {T}}}, lp::SpaceParm, t::Int64 = 0) + +Randomizes the SU2fund / SU3fund fermion field. If the argument t is present, it only randomizes that time-slice. +""" +function pfrandomize!(f::AbstractArray{Spinor{4, SU3fund{T}}}, lp::SpaceParm, t::Int64 = 0) where {T} + + @timeit "Randomize pseudofermion field" begin + p = ntuple(i->CUDA.randn(T, lp.bsz, 3, lp.rsz,2),4) # complex generation not suported for Julia 1.5.4 + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_assign_pf_su3!(f,p,lp,t) + end + end + + return nothing +end + +function krnl_assign_pf_su3!(f::AbstractArray, p , lp::SpaceParm, t::Int64) + + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + + if t == 0 + f[b,r] = Spinor(map(x->SU3fund(x[b,1,r,1] + im* x[b,1,r,2], + x[b,2,r,1] + im* x[b,2,r,2], + x[b,3,r,1] + im* x[b,3,r,2]),p)) + elseif point_time((b,r),lp) == t + f[b,r] = Spinor(map(x->SU3fund(x[b,1,r,1] + im* x[b,1,r,2], + x[b,2,r,1] + im* x[b,2,r,2], + x[b,3,r,1] + im* x[b,3,r,2]),p)) + end + + end + + return nothing +end + +function pfrandomize!(f::AbstractArray{Spinor{4, SU2fund{T}}},lp::SpaceParm, t::Int64=0) where {T} + + @timeit "Randomize pseudofermion field" begin + p = ntuple(i->CUDA.randn(T, lp.bsz, 2, lp.rsz,2),4) # complex generation not suported for Julia 1.5.4 + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_assign_pf_su2!(f,p,lp,t) + end + end + + return nothing +end + +function krnl_assign_pf_su2!(f::AbstractArray, p , lp::SpaceParm, t::Int64) + + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + + if t == 0 + f[b,r] = Spinor(map(x->SU2fund(x[b,1,r,1] + im* x[b,1,r,2], + x[b,2,r,1] + im* x[b,2,r,2]),p)) + elseif point_time((b,r),lp) == t + f[b,r] = Spinor(map(x->SU2fund(x[b,1,r,1] + im* x[b,1,r,2], + x[b,2,r,1] + im* x[b,2,r,2]),p)) + end + + end + + return nothing +end diff --git a/src/Dirac/Diracflow.jl b/src/Dirac/Diracflow.jl index c3706c3..2bbac09 100644 --- a/src/Dirac/Diracflow.jl +++ b/src/Dirac/Diracflow.jl @@ -154,83 +154,6 @@ function bflw_step!(psi, U, eps, int::FlowIntr, gp::GaugeParm, dpar::DiracParam return nothing end -""" - - function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) - -Computes /`/` \\nabla^* \\nabla /`/` `si` and stores it in `si`. - -""" -function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) where {B,D} - - @timeit "Laplacian" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Nablanabla(so, U, si, dpar.th, lp) - end - end - - return nothing -end - - - -function krnl_Nablanabla(so, U, si, th, lp::SpaceParm{4,6,B,D}) where {B,D} - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - @inbounds begin - - so[b,r] = -4*si[b,r] - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - so[b,r] += 0.5*( th[1] * (U[b,1,r]*si[bu1,ru1]) +conj(th[1]) * (U[bd1,1,rd1]\si[bd1,rd1]) + - th[2] * (U[b,2,r]*si[bu2,ru2]) +conj(th[2]) * (U[bd2,2,rd2]\si[bd2,rd2]) + - th[3] * (U[b,3,r]*si[bu3,ru3]) +conj(th[3]) * (U[bd3,3,rd3]\si[bd3,rd3]) + - th[4] * (U[b,4,r]*si[bu4,ru4]) +conj(th[4]) * (U[bd4,4,rd4]\si[bd4,rd4]) ) - end - - return nothing -end - - -function krnl_Nablanabla(so, U, si, th, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} - - b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) - - @inbounds begin - - if (point_time((b,r),lp) != 1) - - so[b,r] = -4*si[b,r] - - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) - - so[b,r] += 0.5*( th[1] * (U[b,1,r]*si[bu1,ru1]) +conj(th[1]) * (U[bd1,1,rd1]\si[bd1,rd1]) + - th[2] * (U[b,2,r]*si[bu2,ru2]) +conj(th[2]) * (U[bd2,2,rd2]\si[bd2,rd2]) + - th[3] * (U[b,3,r]*si[bu3,ru3]) +conj(th[3]) * (U[bd3,3,rd3]\si[bd3,rd3]) + - th[4] * (U[b,4,r]*si[bu4,ru4]) +conj(th[4]) * (U[bd4,4,rd4]\si[bd4,rd4]) ) - end - end - - return nothing -end - - function flw_adapt(U, psi, int::FlowIntr{NI,T}, tend::T, epsini::T, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, ymws::YMworkspace, dws::DiracWorkspace) where {NI,T} @@ -278,13 +201,123 @@ end flw_adapt(U, psi, int::FlowIntr{NI,T}, tend::T, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, ymws::YMworkspace, dws::DiracWorkspace) where {NI,T} = flw_adapt(U, psi, int, tend, int.eps_ini, gp, dpar, lp, ymws, dws) +""" + + function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) + +Computes /`/` \\nabla^* \\nabla /`/` `si` and stores it in `si`. + +""" +function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + @timeit "Laplacian" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Nablanabla(so, U, si, dpar.th, lp) + end + end + return nothing +end +function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D},SpaceParm{4,6,BC_OPEN,D}}) where {D} + SF_bndfix!(si,lp) + @timeit "Laplacian" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Nablanabla(so, U, si, dpar.th, lp) + end + end + SF_bndfix!(so,lp) + return nothing +end + + +function krnl_Nablanabla(so, U, si, th, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + @inbounds begin + + if ((point_time((b,r),lp) != 1) && (point_time((b,r),lp) != lp.iL[end]) + + so[b,r] = -4*si[b,r] + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + so[b,r] += 0.5*( th[1] * (U[b,1,r]*si[bu1,ru1]) +conj(th[1]) * (U[bd1,1,rd1]\si[bd1,rd1]) + + th[2] * (U[b,2,r]*si[bu2,ru2]) +conj(th[2]) * (U[bd2,2,rd2]\si[bd2,rd2]) + + th[3] * (U[b,3,r]*si[bu3,ru3]) +conj(th[3]) * (U[bd3,3,rd3]\si[bd3,rd3]) + + th[4] * (U[b,4,r]*si[bu4,ru4]) +conj(th[4]) * (U[bd4,4,rd4]\si[bd4,rd4]) ) + end + end + + return nothing +end + +function krnl_Nablanabla(so, U, si, th, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + @inbounds begin + + so[b,r] = -4*si[b,r] + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + so[b,r] += 0.5*( th[1] * (U[b,1,r]*si[bu1,ru1]) +conj(th[1]) * (U[bd1,1,rd1]\si[bd1,rd1]) + + th[2] * (U[b,2,r]*si[bu2,ru2]) +conj(th[2]) * (U[bd2,2,rd2]\si[bd2,rd2]) + + th[3] * (U[b,3,r]*si[bu3,ru3]) +conj(th[3]) * (U[bd3,3,rd3]\si[bd3,rd3]) + + th[4] * (U[b,4,r]*si[bu4,ru4]) +conj(th[4]) * (U[bd4,4,rd4]\si[bd4,rd4]) ) + end + + return nothing +end + +function krnl_Nablanabla(so, U, si, th, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + @inbounds begin + + if (point_time((b,r),lp) != 1) + + so[b,r] = -4*si[b,r] + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + so[b,r] += 0.5*( th[1] * (U[b,1,r]*si[bu1,ru1]) +conj(th[1]) * (U[bd1,1,rd1]\si[bd1,rd1]) + + th[2] * (U[b,2,r]*si[bu2,ru2]) +conj(th[2]) * (U[bd2,2,rd2]\si[bd2,rd2]) + + th[3] * (U[b,3,r]*si[bu3,ru3]) +conj(th[3]) * (U[bd3,3,rd3]\si[bd3,rd3]) + + th[4] * (U[b,4,r]*si[bu4,ru4]) +conj(th[4]) * (U[bd4,4,rd4]\si[bd4,rd4]) ) + end + end + + return nothing +end + export Nablanabla!, flw, backflow, flw_adapt, bflw_step! """ - function Dslash_sq!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) Computes /`/` //slashed{D}^2 si /`/` ans stores it in `si`. diff --git a/src/Dirac/Diracoper.jl b/src/Dirac/Diracoper.jl new file mode 100644 index 0000000..e6394d1 --- /dev/null +++ b/src/Dirac/Diracoper.jl @@ -0,0 +1,664 @@ + + + + +## OPEN + +""" + function Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) + +Computes the Dirac operator (with the Wilson term) `\`\``D_w``\`\` with gauge field U and parameters `dpar` of the field `si` and stores it in `so`. +If `dpar.csw` is different from zero, the clover term should be stored in `dws.csw` via the Csw! function and is automatically included in the operator. + +""" +function Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + + SF_bndfix!(si,lp) + if abs(dpar.csw) > 1.0E-10 + @timeit "Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) + end + end + else + @timeit "Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, lp) + end + end + end + SF_bndfix!(so,lp) + + return nothing +end + +function krnl_Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + + # The field si is assumed to be zero at t = 0,T + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + if ((point_time((b,r),lp) != 1) && (point_time((b,r),lp) != lp.iL[end])) + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + im*tm*dmul(Gamma{5},si[b,r]) + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) + +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) + + + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == (lp.iL[4]-1)) + so[b,r] += (ct-1.0)*si[b,r] + end + end + end + + return nothing +end + +function krnl_Dw!(so, U, si, m0, tm, th, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + + # The field si is assumed to be zero at t = 0,T + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + if ((point_time((b,r),lp) != 1) && (point_time((b,r),lp) != lp.iL[end])) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + im*tm*dmul(Gamma{5},si[b,r]) + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == (lp.iL[4]-1)) + so[b,r] += (ct-1.0)*si[b,r] + end + end + end + + return nothing +end + + +""" + function g5Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) + +Computes \`\` \\gamma_5 \`\` times the Dirac operator (with the Wilson term) with gauge field U and parameters `dpar` of the field `si` and stores it in `so`. +If `dpar.csw` is different from zero, the clover term should be stored in `dws.csw` via the Csw! function and is automatically included in the operator. +""" +function g5Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + + SF_bndfix!(si,lp) + if abs(dpar.csw) > 1.0E-10 + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) + end + end + else + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, lp) + end + end + end + SF_bndfix!(so,lp) + + return nothing +end + +function krnl_g5Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + + # The field si is assumed to be zero at t = 0,T + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + if ((point_time((b,r),lp) != 1) && (point_time((b,r),lp) != lp.iL[end])) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) + +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) + + + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == (lp.iL[4]-1)) + so[b,r] += (ct-1.0)*si[b,r] + end + end + end + + so[b,r] = dmul(Gamma{5}, so[b,r])+ im*tm*si[b,r] + + return nothing +end + +function krnl_g5Dw!(so, U, si, m0, tm, th, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + + # The field si is assumed to be zero at t = 0,T + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + if ((point_time((b,r),lp) != 1) && (point_time((b,r),lp) != lp.iL[end])) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == (lp.iL[4]-1)) + so[b,r] += (ct-1.0)*si[b,r] + end + end + end + + so[b,r] = dmul(Gamma{5}, so[b,r]) + im*tm*si[b,r] + + return nothing +end + +""" + function DwdagDw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) + +Applies the operator \`\` \\gamma_5 D_w \`\` twice to `si` and stores the result in `so`. This is equivalent to appling the operator \`\` D_w^\\dagger D_w \`\` +The Dirac operator is the same as in the functions `Dw!` and `g5Dw!` +""" +function DwdagDw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} + + SF_bndfix!(si,lp) + if abs(dpar.csw) > 1.0E-10 + @timeit "DwdagDw" begin + + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(dws.st, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) + end + end + SF_bndfix!(dws.st,lp) + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, dws.st, dws.csw, dpar.m0, -dpar.tm, dpar.th, dpar.csw, lp) + end + end + SF_bndfix!(so,lp) + end + else + @timeit "DwdagDw" begin + + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(dws.st, U, si, dpar.m0, dpar.tm, dpar.th, lp) + end + end + SF_bndfix!(dws.st,lp) + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, dws.st, dpar.m0, -dpar.tm, dpar.th, lp) + end + end + SF_bndfix!(so,lp) + end + end + + return nothing +end + +## PERDIODIC + +function Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + + if abs(dpar.csw) > 1.0E-10 + @timeit "Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) + end + end + else + @timeit "Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, lp) + end + end + end + + return nothing +end + +function krnl_Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r]+ im*tm*dmul(Gamma{5},si[b,r]) + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) + +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) + + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + end + + return nothing +end + +function krnl_Dw!(so, U, si, m0, tm, th, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + im*tm*dmul(Gamma{5},si[b,r]) + + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + end + + return nothing +end + +function g5Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + + if abs(dpar.csw) > 1.0E-10 + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) + end + end + else + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, lp) + end + end + end + + return nothing +end + +function krnl_g5Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) + +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) + + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + so[b,r] = dmul(Gamma{5}, so[b,r])+ im*tm*si[b,r] + end + + return nothing +end + +function krnl_g5Dw!(so, U, si, m0, tm, th, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + so[b,r] = dmul(Gamma{5}, so[b,r]) + im*tm*si[b,r] + end + + return nothing +end + +function DwdagDw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} + + if abs(dpar.csw) > 1.0E-10 + @timeit "DwdagDw" begin + + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(dws.st, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) + end + end + + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, dws.st, dws.csw, dpar.m0, -dpar.tm, dpar.th, dpar.csw, lp) + end + end + end + else + @timeit "DwdagDw" begin + + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(dws.st, U, si, dpar.m0, dpar.tm, dpar.th, lp) + end + end + + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, dws.st, dpar.m0, -dpar.tm, dpar.th, lp) + end + end + end end + + return nothing +end + +## SF + +function Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + + SF_bndfix!(si,lp) + if abs(dpar.csw) > 1.0E-10 + @timeit "Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) + end + end + else + @timeit "Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) + end + end + end + + return nothing +end + +function krnl_Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, ct, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + + # The field si is assumed to be zero at t = 0 + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + if (point_time((b,r),lp) != 1) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + im*tm*dmul(Gamma{5},si[b,r]) + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) + +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) + + + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == lp.iL[4]) + so[b,r] += (ct-1.0)*si[b,r] + end + end + end + + return nothing +end + +function krnl_Dw!(so, U, si, m0, tm, th, ct, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + + # The field si is assumed to be zero at t = 0 + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + if (point_time((b,r),lp) != 1) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + im*tm*dmul(Gamma{5},si[b,r]) + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == lp.iL[4]) + so[b,r] += (ct-1.0)*si[b,r] + end + end + end + + return nothing +end + + +function g5Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + + SF_bndfix!(si,lp) + if abs(dpar.csw) > 1.0E-10 + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) + end + end + else + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) + end + end + end + + return nothing +end + +function krnl_g5Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, ct, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + + # The field si is assumed to be zero at t = 0 + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + if (point_time((b,r),lp) != 1) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + 0.5*csw*im*( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) + +Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) + Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) + Fcsw[b,6,r]*dmul(Gamma{13},si[b,r])) + + + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == lp.iL[4]) + so[b,r] += (ct-1.0)*si[b,r] + end + end + end + + so[b,r] = dmul(Gamma{5}, so[b,r])+ im*tm*si[b,r] + + return nothing +end + +function krnl_g5Dw!(so, U, si, m0, tm, th, ct, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + + # The field si is assumed to be zero at t = 0 + + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + + if (point_time((b,r),lp) != 1) + + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) + + @inbounds begin + + so[b,r] = (4+m0)*si[b,r] + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + + if (point_time((b,r),lp) == 2) || (point_time((b,r),lp) == lp.iL[4]) + so[b,r] += (ct-1.0)*si[b,r] + end + end + end + + so[b,r] = dmul(Gamma{5}, so[b,r]) + im*tm*si[b,r] + + return nothing +end + +function DwdagDw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} + + if abs(dpar.csw) > 1.0E-10 + @timeit "DwdagDw" begin + + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(dws.st, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) + end + end + SF_bndfix!(dws.st,lp) + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, dws.st, dws.csw, dpar.m0, -dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) + end + end + SF_bndfix!(so,lp) + end + else + @timeit "DwdagDw" begin + + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(dws.st, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) + end + end + SF_bndfix!(dws.st,lp) + @timeit "g5Dw" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, dws.st, dpar.m0, -dpar.tm, dpar.th, dpar.ct, lp) + end + end + SF_bndfix!(so,lp) + end + end + + return nothing +end From abfd0bd72a144c881129dd2c77cb85803a2513ba Mon Sep 17 00:00:00 2001 From: "Fernando P.Panadero" Date: Tue, 21 May 2024 15:38:28 +0200 Subject: [PATCH 03/14] Some errors --- src/Dirac/Diracfields.jl | 15 +++- src/Dirac/Diracflow.jl | 154 +++++++++++++++++++-------------------- src/Dirac/Diracoper.jl | 24 +++--- src/YM/YMact.jl | 4 +- src/YM/YMfields.jl | 47 +++++++----- 5 files changed, 135 insertions(+), 109 deletions(-) diff --git a/src/Dirac/Diracfields.jl b/src/Dirac/Diracfields.jl index f7e3edd..488e05a 100644 --- a/src/Dirac/Diracfields.jl +++ b/src/Dirac/Diracfields.jl @@ -120,7 +120,7 @@ end Randomizes the SU2fund / SU3fund fermion field. If the argument t is present, it only randomizes that time-slice. """ -function pfrandomize!(f::AbstractArray{Spinor{4, SU3fund{T}}}, lp::SpaceParm, t::Int64 = 0) where {T} +function pfrandomize!(f::AbstractArray{Spinor{4, SU3fund{T}}}, lp::SpaceParm{4,6,BC_PERIODIC,D}, t::Int64 = 0) where {T,D} @timeit "Randomize pseudofermion field" begin p = ntuple(i->CUDA.randn(T, lp.bsz, 3, lp.rsz,2),4) # complex generation not suported for Julia 1.5.4 @@ -132,6 +132,19 @@ function pfrandomize!(f::AbstractArray{Spinor{4, SU3fund{T}}}, lp::SpaceParm, t: return nothing end +function pfrandomize!(f::AbstractArray{Spinor{4, SU3fund{T}}}, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D},SpaceParm{4,6,BC_OPEN,D}}, t::Int64 = 0) where {T,D} + + @timeit "Randomize pseudofermion field" begin + p = ntuple(i->CUDA.randn(T, lp.bsz, 3, lp.rsz,2),4) # complex generation not suported for Julia 1.5.4 + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_assign_pf_su3!(f,p,lp,t) + end + end + SF_bndfix!(f,lp) + + return nothing +end + function krnl_assign_pf_su3!(f::AbstractArray, p , lp::SpaceParm, t::Int64) @inbounds begin diff --git a/src/Dirac/Diracflow.jl b/src/Dirac/Diracflow.jl index 2bbac09..8065840 100644 --- a/src/Dirac/Diracflow.jl +++ b/src/Dirac/Diracflow.jl @@ -30,7 +30,7 @@ function flw(U, psi, int::FlowIntr{NI,T}, ns::Int64, eps, gp::GaugeParm, dpar::D ymws.mom .= int.e0[k].*ymws.mom .+ int.e1[k].*ymws.frc1 U .= expm.(U, ymws.mom, 2*eps) - end + end end end @@ -86,7 +86,7 @@ function backflow(psi, U, Dt, maxnsave::Int64, gp::GaugeParm, dpar::DiracParam, @timeit "CPU to GPU" copyto!(U,U0) for j in dsave:-1:1 - @timeit "CPU to GPU" copyto!(U,U0) + @timeit "CPU to GPU" copyto!(U,U0) for k in 1:j-1 flw(U, int, 1, eps_all[k], gp, lp, ymws) end @@ -209,20 +209,20 @@ Computes /`/` \\nabla^* \\nabla /`/` `si` and stores it in `si`. """ function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} - @timeit "Laplacian" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Nablanabla(so, U, si, dpar.th, lp) - end + @timeit "Laplacian" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Nablanabla(so, U, si, dpar.th, lp) end + end return nothing end function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D},SpaceParm{4,6,BC_OPEN,D}}) where {D} SF_bndfix!(si,lp) - @timeit "Laplacian" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Nablanabla(so, U, si, dpar.th, lp) - end + @timeit "Laplacian" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Nablanabla(so, U, si, dpar.th, lp) end + end SF_bndfix!(so,lp) return nothing end @@ -234,9 +234,9 @@ function krnl_Nablanabla(so, U, si, th, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} @inbounds begin - if ((point_time((b,r),lp) != 1) && (point_time((b,r),lp) != lp.iL[end]) + if ((point_time((b,r),lp) != 1) && (point_time((b,r),lp) != lp.iL[end])) - so[b,r] = -4*si[b,r] + so[b,r] = -4*si[b,r] bu1, ru1 = up((b,r), 1, lp) bd1, rd1 = dw((b,r), 1, lp) @@ -247,10 +247,10 @@ function krnl_Nablanabla(so, U, si, th, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} bu4, ru4 = up((b,r), 4, lp) bd4, rd4 = dw((b,r), 4, lp) - so[b,r] += 0.5*( th[1] * (U[b,1,r]*si[bu1,ru1]) +conj(th[1]) * (U[bd1,1,rd1]\si[bd1,rd1]) + - th[2] * (U[b,2,r]*si[bu2,ru2]) +conj(th[2]) * (U[bd2,2,rd2]\si[bd2,rd2]) + - th[3] * (U[b,3,r]*si[bu3,ru3]) +conj(th[3]) * (U[bd3,3,rd3]\si[bd3,rd3]) + - th[4] * (U[b,4,r]*si[bu4,ru4]) +conj(th[4]) * (U[bd4,4,rd4]\si[bd4,rd4]) ) + so[b,r] += 0.5*( th[1] * (U[b,1,r]*si[bu1,ru1]) +conj(th[1]) * (U[bd1,1,rd1]\si[bd1,rd1]) + + th[2] * (U[b,2,r]*si[bu2,ru2]) +conj(th[2]) * (U[bd2,2,rd2]\si[bd2,rd2]) + + th[3] * (U[b,3,r]*si[bu3,ru3]) +conj(th[3]) * (U[bd3,3,rd3]\si[bd3,rd3]) + + th[4] * (U[b,4,r]*si[bu4,ru4]) +conj(th[4]) * (U[bd4,4,rd4]\si[bd4,rd4]) ) end end @@ -265,19 +265,19 @@ function krnl_Nablanabla(so, U, si, th, lp::SpaceParm{4,6,BC_PERIODIC,D}) where so[b,r] = -4*si[b,r] - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) so[b,r] += 0.5*( th[1] * (U[b,1,r]*si[bu1,ru1]) +conj(th[1]) * (U[bd1,1,rd1]\si[bd1,rd1]) + - th[2] * (U[b,2,r]*si[bu2,ru2]) +conj(th[2]) * (U[bd2,2,rd2]\si[bd2,rd2]) + - th[3] * (U[b,3,r]*si[bu3,ru3]) +conj(th[3]) * (U[bd3,3,rd3]\si[bd3,rd3]) + - th[4] * (U[b,4,r]*si[bu4,ru4]) +conj(th[4]) * (U[bd4,4,rd4]\si[bd4,rd4]) ) + th[2] * (U[b,2,r]*si[bu2,ru2]) +conj(th[2]) * (U[bd2,2,rd2]\si[bd2,rd2]) + + th[3] * (U[b,3,r]*si[bu3,ru3]) +conj(th[3]) * (U[bd3,3,rd3]\si[bd3,rd3]) + + th[4] * (U[b,4,r]*si[bu4,ru4]) +conj(th[4]) * (U[bd4,4,rd4]\si[bd4,rd4]) ) end return nothing @@ -291,9 +291,9 @@ function krnl_Nablanabla(so, U, si, th, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},Sp if (point_time((b,r),lp) != 1) - so[b,r] = -4*si[b,r] + so[b,r] = -4*si[b,r] - bu1, ru1 = up((b,r), 1, lp) + bu1, ru1 = up((b,r), 1, lp) bd1, rd1 = dw((b,r), 1, lp) bu2, ru2 = up((b,r), 2, lp) bd2, rd2 = dw((b,r), 2, lp) @@ -302,10 +302,10 @@ function krnl_Nablanabla(so, U, si, th, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},Sp bu4, ru4 = up((b,r), 4, lp) bd4, rd4 = dw((b,r), 4, lp) - so[b,r] += 0.5*( th[1] * (U[b,1,r]*si[bu1,ru1]) +conj(th[1]) * (U[bd1,1,rd1]\si[bd1,rd1]) + - th[2] * (U[b,2,r]*si[bu2,ru2]) +conj(th[2]) * (U[bd2,2,rd2]\si[bd2,rd2]) + - th[3] * (U[b,3,r]*si[bu3,ru3]) +conj(th[3]) * (U[bd3,3,rd3]\si[bd3,rd3]) + - th[4] * (U[b,4,r]*si[bu4,ru4]) +conj(th[4]) * (U[bd4,4,rd4]\si[bd4,rd4]) ) + so[b,r] += 0.5*( th[1] * (U[b,1,r]*si[bu1,ru1]) +conj(th[1]) * (U[bd1,1,rd1]\si[bd1,rd1]) + + th[2] * (U[b,2,r]*si[bu2,ru2]) +conj(th[2]) * (U[bd2,2,rd2]\si[bd2,rd2]) + + th[3] * (U[b,3,r]*si[bu3,ru3]) +conj(th[3]) * (U[bd3,3,rd3]\si[bd3,rd3]) + + th[4] * (U[b,4,r]*si[bu4,ru4]) +conj(th[4]) * (U[bd4,4,rd4]\si[bd4,rd4]) ) end end @@ -325,40 +325,40 @@ Computes /`/` //slashed{D}^2 si /`/` ans stores it in `si`. """ function Dslash_sq!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) where {B,D} - @timeit "DwdagDw" begin + @timeit "DwdagDw" begin - @timeit "g5Dslsh" begin + @timeit "g5Dslsh" begin CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dslsh!(dws.st, U, si, dpar.th, lp) end - end + end - if abs(dpar.csw) > 1.0E-10 - @timeit "Dw_improvement" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dslsh_impr!(dws.st, dws.csw, dpar.csw, si, lp) - end + if abs(dpar.csw) > 1.0E-10 + @timeit "Dw_improvement" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dslsh_impr!(dws.st, dws.csw, dpar.csw, si, lp) end end + end - @timeit "g5Dslsh" begin + @timeit "g5Dslsh" begin CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dslsh!(so, U, dws.st, dpar.th, lp) end - end + end - if abs(dpar.csw) > 1.0E-10 - @timeit "Dw_improvement" begin - CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dslsh_impr!(so, dws.csw, dpar.csw, dws.st, lp) - end + if abs(dpar.csw) > 1.0E-10 + @timeit "Dw_improvement" begin + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dslsh_impr!(so, dws.csw, dpar.csw, dws.st, lp) end end - - end + + end + return nothing end @@ -382,12 +382,12 @@ function krnl_g5Dslsh!(so, U, si, th, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},Spac bu4, ru4 = up((b,r), 4, lp) bd4, rd4 = dw((b,r), 4, lp) - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) - so[b,r] = dmul(Gamma{5}, so[b,r]) + so[b,r] = dmul(Gamma{5}, so[b,r]) end end return nothing @@ -402,19 +402,19 @@ function krnl_g5Dslsh!(so, U, si, th, lp::SpaceParm{4,6,B,D}) where {D,B} so[b,r] = 4*si[b,r] - bu1, ru1 = up((b,r), 1, lp) - bd1, rd1 = dw((b,r), 1, lp) - bu2, ru2 = up((b,r), 2, lp) - bd2, rd2 = dw((b,r), 2, lp) - bu3, ru3 = up((b,r), 3, lp) - bd3, rd3 = dw((b,r), 3, lp) - bu4, ru4 = up((b,r), 4, lp) - bd4, rd4 = dw((b,r), 4, lp) + bu1, ru1 = up((b,r), 1, lp) + bd1, rd1 = dw((b,r), 1, lp) + bu2, ru2 = up((b,r), 2, lp) + bd2, rd2 = dw((b,r), 2, lp) + bu3, ru3 = up((b,r), 3, lp) + bd3, rd3 = dw((b,r), 3, lp) + bu4, ru4 = up((b,r), 4, lp) + bd4, rd4 = dw((b,r), 4, lp) - so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + - th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + - th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + - th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) + so[b,r] -= 0.5*(th[1]*gpmul(Pgamma{1,-1},U[b,1,r],si[bu1,ru1]) +conj(th[1])*gdagpmul(Pgamma{1,+1},U[bd1,1,rd1],si[bd1,rd1]) + + th[2]*gpmul(Pgamma{2,-1},U[b,2,r],si[bu2,ru2]) +conj(th[2])*gdagpmul(Pgamma{2,+1},U[bd2,2,rd2],si[bd2,rd2]) + + th[3]*gpmul(Pgamma{3,-1},U[b,3,r],si[bu3,ru3]) +conj(th[3])*gdagpmul(Pgamma{3,+1},U[bd3,3,rd3],si[bd3,rd3]) + + th[4]*gpmul(Pgamma{4,-1},U[b,4,r],si[bu4,ru4]) +conj(th[4])*gdagpmul(Pgamma{4,+1},U[bd4,4,rd4],si[bd4,rd4]) ) so[b,r] = dmul(Gamma{5}, so[b,r]) end @@ -426,11 +426,11 @@ function krnl_g5Dslsh_impr!(so, Fcsw, csw, si, lp::SpaceParm{4,6,B,D}) where {B, @inbounds begin - b = Int64(CUDA.threadIdx().x); - r = Int64(CUDA.blockIdx().x) + b = Int64(CUDA.threadIdx().x); + r = Int64(CUDA.blockIdx().x) so[b,r] += 0.5*csw*im*dmul(Gamma{5},( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) - -Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) - Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) - Fcsw[b,6,r]*dmul(Gamma{13},si[b,r]))) + -Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) - Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) - Fcsw[b,6,r]*dmul(Gamma{13},si[b,r]))) end return nothing @@ -442,15 +442,15 @@ function krnl_g5Dslsh_impr!(so, Fcsw, csw, si, lp::Union{SpaceParm{4,6,BC_SF_ORB @inbounds begin - b = Int64(CUDA.threadIdx().x); - r = Int64(CUDA.blockIdx().x) + b = Int64(CUDA.threadIdx().x); + r = Int64(CUDA.blockIdx().x) - if (point_time((b,r),lp) != 1) + if (point_time((b,r),lp) != 1) - so[b,r] += 0.5*csw*im*dmul(Gamma{5},( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) - -Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) - Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) - Fcsw[b,6,r]*dmul(Gamma{13},si[b,r]))) - end + so[b,r] += 0.5*csw*im*dmul(Gamma{5},( Fcsw[b,1,r]*dmul(Gamma{10},si[b,r]) + Fcsw[b,2,r]*dmul(Gamma{11},si[b,r]) + Fcsw[b,3,r]*dmul(Gamma{12},si[b,r]) + -Fcsw[b,4,r]*dmul(Gamma{15},si[b,r]) - Fcsw[b,5,r]*dmul(Gamma{14},si[b,r]) - Fcsw[b,6,r]*dmul(Gamma{13},si[b,r]))) + end - return nothing + return nothing end end diff --git a/src/Dirac/Diracoper.jl b/src/Dirac/Diracoper.jl index e6394d1..ba363e1 100644 --- a/src/Dirac/Diracoper.jl +++ b/src/Dirac/Diracoper.jl @@ -17,13 +17,13 @@ function Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6 if abs(dpar.csw) > 1.0E-10 @timeit "Dw" begin CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) end end else @timeit "Dw" begin CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, lp) + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) end end end @@ -32,7 +32,7 @@ function Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6 return nothing end -function krnl_Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} +function krnl_Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, ct, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} # The field si is assumed to be zero at t = 0,T b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) @@ -67,7 +67,7 @@ function krnl_Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, lp::SpaceParm{4,6,BC_OPE return nothing end -function krnl_Dw!(so, U, si, m0, tm, th, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} +function krnl_Dw!(so, U, si, m0, tm, th, ct, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} # The field si is assumed to be zero at t = 0,T b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) @@ -113,13 +113,13 @@ function g5Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4 if abs(dpar.csw) > 1.0E-10 @timeit "g5Dw" begin CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) end end else @timeit "g5Dw" begin CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, lp) + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) end end end @@ -128,7 +128,7 @@ function g5Dw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4 return nothing end -function krnl_g5Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} +function krnl_g5Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, ct, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} # The field si is assumed to be zero at t = 0,T b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) @@ -166,7 +166,7 @@ function krnl_g5Dwimpr!(so, U, si, Fcsw, m0, tm, th, csw, lp::SpaceParm{4,6,BC_O return nothing end -function krnl_g5Dw!(so, U, si, m0, tm, th, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} +function krnl_g5Dw!(so, U, si, m0, tm, th, ct, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} # The field si is assumed to be zero at t = 0,T b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) @@ -215,13 +215,13 @@ function DwdagDw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpacePar @timeit "g5Dw" begin CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(dws.st, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, lp) + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(dws.st, U, si, dws.csw, dpar.m0, dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) end end SF_bndfix!(dws.st,lp) @timeit "g5Dw" begin CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, dws.st, dws.csw, dpar.m0, -dpar.tm, dpar.th, dpar.csw, lp) + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dwimpr!(so, U, dws.st, dws.csw, dpar.m0, -dpar.tm, dpar.th, dpar.csw, dpar.ct, lp) end end SF_bndfix!(so,lp) @@ -231,13 +231,13 @@ function DwdagDw!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpacePar @timeit "g5Dw" begin CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(dws.st, U, si, dpar.m0, dpar.tm, dpar.th, lp) + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(dws.st, U, si, dpar.m0, dpar.tm, dpar.th, dpar.ct, lp) end end SF_bndfix!(dws.st,lp) @timeit "g5Dw" begin CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, dws.st, dpar.m0, -dpar.tm, dpar.th, lp) + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_g5Dw!(so, U, dws.st, dpar.m0, -dpar.tm, dpar.th, dpar.ct, lp) end end SF_bndfix!(so,lp) diff --git a/src/YM/YMact.jl b/src/YM/YMact.jl index e0ec614..500ae0f 100644 --- a/src/YM/YMact.jl +++ b/src/YM/YMact.jl @@ -11,7 +11,7 @@ ## -## OPEN DEBERIA ESTAR BIEN LAS ACCIONES, CHECKEAR. LAS FUERZAS FALTAN +## OPEN ## function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, ztw, lp::SpaceParm{N,M,BC_OPEN,D}) where {T,NB,N,M,D} @@ -113,7 +113,7 @@ function krnl_plaq!(plx, U::AbstractArray{T}, Ubnd, cG, ztw, lp::SpaceParm{N,M,B gt1 = U[bu1,id2,ru1] if ( (it == lp.iL[end]) || (it == 1)) && !TOBC - S += 0.5*cG*(c0*tr(g2*ga/U[bu2,id1,ru2])) + S += 0.5*cG*(tr(U[b,id1,r]*gt1 / (U[b,id2,r]*U[bu2,id1,ru2]))) elseif (it == lp.iL[end]) && TOBC nothing else diff --git a/src/YM/YMfields.jl b/src/YM/YMfields.jl index e22625a..95d6777 100644 --- a/src/YM/YMfields.jl +++ b/src/YM/YMfields.jl @@ -15,7 +15,7 @@ Given an algebra field with natural indexing, this routine sets the components to random Gaussian distributed values. If SF boundary conditions are used, the force at the boundaries is set to zero. """ function randomize!(f, lp::SpaceParm, ymws::YMworkspace) - + if ymws.ALG == SU2alg @timeit "Randomize SU(2) algebra field" begin m = CUDA.randn(ymws.PRC, lp.bsz,lp.ndim,3,lp.rsz) @@ -54,31 +54,44 @@ function krnl_assign_SU3!(frc::AbstractArray{T}, m, lp::SpaceParm{N,M,BC_PERIODI return nothing end -function krnl_assign_SU3!(frc::AbstractArray{T}, m, lp::SpaceParm{N,M,B,D}) where {T,N,M,B,D} +function krnl_assign_SU3!(frc::AbstractArray{T}, m, lp::SpaceParm{N,M,BC_OPEN,D}) where {T,N,M,D} + + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + for id in 1:lp.ndim + frc[b,id,r] = SU3alg(m[b,id,1,r], m[b,id,2,r], m[b,id,3,r], + m[b,id,4,r], m[b,id,5,r], m[b,id,6,r], + m[b,id,7,r], m[b,id,8,r]) + end + end + + return nothing +end + +function krnl_assign_SU3!(frc::AbstractArray{T}, m, lp::Union{SpaceParm{N,M,BC_SF_ORBI,D},SpaceParm{N,M,BC_SF_AFWB,D}}) where {T,N,M,D} @inbounds begin b = Int64(CUDA.threadIdx().x) r = Int64(CUDA.blockIdx().x) it = point_time((b,r), lp) - if ((B==BC_SF_AFWB)||(B==BC_SF_ORBI)) - if it == 1 - for id in 1:lp.ndim-1 - frc[b,id,r] = zero(T) - end - frc[b,N,r] = SU3alg(m[b,N,1,r], m[b,N,2,r], m[b,N,3,r], - m[b,N,4,r], m[b,N,5,r], m[b,N,6,r], - m[b,N,7,r], m[b,N,8,r]) - else - for id in 1:lp.ndim - frc[b,id,r] = SU3alg(m[b,id,1,r], m[b,id,2,r], m[b,id,3,r], - m[b,id,4,r], m[b,id,5,r], m[b,id,6,r], - m[b,id,7,r], m[b,id,8,r]) - end + if it == 1 + for id in 1:lp.ndim-1 + frc[b,id,r] = zero(T) + end + frc[b,N,r] = SU3alg(m[b,N,1,r], m[b,N,2,r], m[b,N,3,r], + m[b,N,4,r], m[b,N,5,r], m[b,N,6,r], + m[b,N,7,r], m[b,N,8,r]) + else + for id in 1:lp.ndim + frc[b,id,r] = SU3alg(m[b,id,1,r], m[b,id,2,r], m[b,id,3,r], + m[b,id,4,r], m[b,id,5,r], m[b,id,6,r], + m[b,id,7,r], m[b,id,8,r]) end end end - + return nothing end From 794fe15cba66f014378d52908a99e451043290b1 Mon Sep 17 00:00:00 2001 From: "Fernando P.Panadero" Date: Sun, 2 Jun 2024 13:30:19 +0200 Subject: [PATCH 04/14] Forces for YM improved open bc --- src/YM/YMact.jl | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/YM/YMact.jl b/src/YM/YMact.jl index 500ae0f..f9849bd 100644 --- a/src/YM/YMact.jl +++ b/src/YM/YMact.jl @@ -231,11 +231,10 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, g2 = U[b,id2,r]\U[b,id1,r] if !TOBC && ( (it == 1) || (it == lp.iL[end]) ) - X = 0.5*cG*(c0*projalg(U[b,id1,r]*g1/U[b,id2,r]) + c1*projalg(U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + - c1*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) + c1*projalg(h1*g1/U[b,id2,r]) + c1*projalg(U[b,id1,r]*g1/h4)) + X = 0.5*cG*(c0*projalg(U[b,id1,r]*g1/U[b,id2,r]) + c1*projalg(U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + c1*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) ) - frc1[b,id1,r] -= X - frc1[b,id2,r] += X + frc1[b,id1,r] -= X + 0.5*cG*c1*projalg(U[b,id1,r]*g1/h4) + frc1[b,id2,r] += X + 0.5*cG*c1*projalg(h1*g1/U[b,id2,r]) frc2[bu1,id2,ru1] -= 0.5*cG*c0*projalg(g1*g2) frc2[bu2,id1,ru2] += 0.5*cG*c0*projalg(g2*g1) frc2[bu1,id2,ru1] -= 0.5*cG*c1*projalg((g1/U[b,id2,r])*h1) @@ -246,12 +245,22 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, frc2[bu2,id1,ru2] += 0.5*cG*c1*projalg(h4\U[b,id1,r]*g1) elseif TOBC && (it == lp.iL[end]) nothing - elseif TOBC && (it == (lp.iL[end]-1) ) - X = (c0*projalg(U[b,id1,r]*g1/U[b,id2,r]) + c1*projalg(U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + - c1*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) + c1*projalg(h1*g1/U[b,id2,r]) + c1*projalg(U[b,id1,r]*g1/h4)) + elseif TOBC && (it == 1) + X = c0*projalg(U[b,id1,r]*g1/U[b,id2,r]) + c1*projalg(U[b,id1,r]*h2/(U[b,id2,r]*U[bu2,id1,ru2])) + c1*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) frc1[b,id1,r] -= X - frc1[b,id2,r] += X + frc1[b,id2,r] += X + c1*projalg(h1*g1/U[b,id2,r]) + frc2[bu1,id2,ru1] -= c0*projalg(g1*g2) + frc2[bu2,id1,ru2] += c0*projalg(g2*g1) + frc2[bu1,id2,ru1] -= c1*projalg((g1/U[b,id2,r])*h1) + frc2[bu2,id1,ru2] += c1*projalg((U[b,id2,r]\h1)*g1) + frc2[bu2,id1,ru2] += c1*projalg(g2*h2/U[bu2,id1,ru2]) + frc2[bu1,id2,ru1] -= c1*projalg((ga/h3)*g2) + elseif TOBC && (it == (lp.iL[end]-1) ) + X = c0*projalg(U[b,id1,r]*g1/U[b,id2,r]) + c1*projalg(U[b,id1,r]*ga/(U[b,id2,r]*h3)) + + frc1[b,id1,r] -= X + c1*projalg(U[b,id1,r]*g1/h4) + frc1[b,id2,r] += X + c1*projalg(h1*g1/U[b,id2,r]) frc2[bu1,id2,ru1] -= c0*projalg(g1*g2) frc2[bu2,id1,ru2] += c0*projalg(g2*g1) frc2[bu1,id2,ru1] -= c1*projalg((g1/U[b,id2,r])*h1) @@ -259,7 +268,6 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, frc2[bu1,id2,ru1] -= c1*projalg((ga/h3)*g2) frc2[bu1,id2,ru1] -= c1*projalg((g1/h4)*U[b,id1,r]) frc2[bu2,id1,ru2] += c1*projalg(h4\U[b,id1,r]*g1) - else if TWP X = projalg(c0*ztw,U[b,id1,r]*g1/U[b,id2,r]) From 65b00fa0154301ea309b4db0d46843ae4d1ad8fc Mon Sep 17 00:00:00 2001 From: "Fernando P.Panadero" Date: Fri, 14 Jun 2024 11:41:53 +0200 Subject: [PATCH 05/14] Proper normalization of noise sources --- src/Dirac/Diracfields.jl | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/Dirac/Diracfields.jl b/src/Dirac/Diracfields.jl index 488e05a..5559ca3 100644 --- a/src/Dirac/Diracfields.jl +++ b/src/Dirac/Diracfields.jl @@ -123,7 +123,7 @@ Randomizes the SU2fund / SU3fund fermion field. If the argument t is present, it function pfrandomize!(f::AbstractArray{Spinor{4, SU3fund{T}}}, lp::SpaceParm{4,6,BC_PERIODIC,D}, t::Int64 = 0) where {T,D} @timeit "Randomize pseudofermion field" begin - p = ntuple(i->CUDA.randn(T, lp.bsz, 3, lp.rsz,2),4) # complex generation not suported for Julia 1.5.4 + p = ntuple(i->CUDA.randn(T, lp.bsz, 3, lp.rsz,2),4)./sqrt(2) # complex generation not suported for Julia 1.5.4 CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_assign_pf_su3!(f,p,lp,t) end @@ -135,7 +135,7 @@ end function pfrandomize!(f::AbstractArray{Spinor{4, SU3fund{T}}}, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D},SpaceParm{4,6,BC_OPEN,D}}, t::Int64 = 0) where {T,D} @timeit "Randomize pseudofermion field" begin - p = ntuple(i->CUDA.randn(T, lp.bsz, 3, lp.rsz,2),4) # complex generation not suported for Julia 1.5.4 + p = ntuple(i->CUDA.randn(T, lp.bsz, 3, lp.rsz,2),4)./sqrt(2) # complex generation not suported for Julia 1.5.4 CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_assign_pf_su3!(f,p,lp,t) end @@ -166,10 +166,10 @@ function krnl_assign_pf_su3!(f::AbstractArray, p , lp::SpaceParm, t::Int64) return nothing end -function pfrandomize!(f::AbstractArray{Spinor{4, SU2fund{T}}},lp::SpaceParm, t::Int64=0) where {T} +function pfrandomize!(f::AbstractArray{Spinor{4, SU2fund{T}}}, lp::SpaceParm{4,6,BC_PERIODIC,D}, t::Int64 = 0) where {T,D} @timeit "Randomize pseudofermion field" begin - p = ntuple(i->CUDA.randn(T, lp.bsz, 2, lp.rsz,2),4) # complex generation not suported for Julia 1.5.4 + p = ntuple(i->CUDA.randn(T, lp.bsz, 3, lp.rsz,2),4)./sqrt(2) # complex generation not suported for Julia 1.5.4 CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_assign_pf_su2!(f,p,lp,t) end @@ -178,6 +178,19 @@ function pfrandomize!(f::AbstractArray{Spinor{4, SU2fund{T}}},lp::SpaceParm, t:: return nothing end +function pfrandomize!(f::AbstractArray{Spinor{4, SU2fund{T}}}, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D},SpaceParm{4,6,BC_OPEN,D}}, t::Int64 = 0) where {T,D} + + @timeit "Randomize pseudofermion field" begin + p = ntuple(i->CUDA.randn(T, lp.bsz, 3, lp.rsz,2),4)./sqrt(2) # complex generation not suported for Julia 1.5.4 + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_assign_pf_su2!(f,p,lp,t) + end + end + SF_bndfix!(f,lp) + + return nothing +end + function krnl_assign_pf_su2!(f::AbstractArray, p , lp::SpaceParm, t::Int64) @inbounds begin From 5fc81739f8fb87be2ae9b3a036d512b20125db03 Mon Sep 17 00:00:00 2001 From: "Fernando P.Panadero" Date: Mon, 9 Sep 2024 11:21:41 +0200 Subject: [PATCH 06/14] pfrandomize modification and minor typos. --- src/Dirac/Diracfields.jl | 28 ++++++++++++++++------------ src/YM/YMflow.jl | 2 +- src/YM/YMsf.jl | 2 +- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/Dirac/Diracfields.jl b/src/Dirac/Diracfields.jl index 5559ca3..3210460 100644 --- a/src/Dirac/Diracfields.jl +++ b/src/Dirac/Diracfields.jl @@ -151,15 +151,17 @@ function krnl_assign_pf_su3!(f::AbstractArray, p , lp::SpaceParm, t::Int64) b = Int64(CUDA.threadIdx().x) r = Int64(CUDA.blockIdx().x) - if t == 0 + if t == 0 f[b,r] = Spinor(map(x->SU3fund(x[b,1,r,1] + im* x[b,1,r,2], - x[b,2,r,1] + im* x[b,2,r,2], - x[b,3,r,1] + im* x[b,3,r,2]),p)) - elseif point_time((b,r),lp) == t + x[b,2,r,1] + im* x[b,2,r,2], + x[b,3,r,1] + im* x[b,3,r,2]),p)) + elseif point_time((b,r),lp) == t f[b,r] = Spinor(map(x->SU3fund(x[b,1,r,1] + im* x[b,1,r,2], - x[b,2,r,1] + im* x[b,2,r,2], - x[b,3,r,1] + im* x[b,3,r,2]),p)) - end + x[b,2,r,1] + im* x[b,2,r,2], + x[b,3,r,1] + im* x[b,3,r,2]),p)) + else + f[b,r] = 0.0*f[b,r] + end end @@ -197,13 +199,15 @@ function krnl_assign_pf_su2!(f::AbstractArray, p , lp::SpaceParm, t::Int64) b = Int64(CUDA.threadIdx().x) r = Int64(CUDA.blockIdx().x) - if t == 0 + if t == 0 f[b,r] = Spinor(map(x->SU2fund(x[b,1,r,1] + im* x[b,1,r,2], - x[b,2,r,1] + im* x[b,2,r,2]),p)) - elseif point_time((b,r),lp) == t + x[b,2,r,1] + im* x[b,2,r,2]),p)) + elseif point_time((b,r),lp) == t f[b,r] = Spinor(map(x->SU2fund(x[b,1,r,1] + im* x[b,1,r,2], - x[b,2,r,1] + im* x[b,2,r,2]),p)) - end + x[b,2,r,1] + im* x[b,2,r,2]),p)) + else + f[b,r] = 0.0*f[b,r] + end end diff --git a/src/YM/YMflow.jl b/src/YM/YMflow.jl index 42ed545..645da0f 100644 --- a/src/YM/YMflow.jl +++ b/src/YM/YMflow.jl @@ -93,7 +93,7 @@ function Base.show(io::IO, int::FlowIntr{N,T}) where {N,T} if N == 0 println(io, " * Euler schem3") elseif N == 1 - println(io, " * One stage scheme. Coefficients3") + println(io, " * One stage scheme. Coefficients") println(io, " stg 1: ", int.e0[1], " ", int.e1[1]) elseif N == 2 println(io, " * Two stage scheme. Coefficients:") diff --git a/src/YM/YMsf.jl b/src/YM/YMsf.jl index abd93d9..cd00ac7 100644 --- a/src/YM/YMsf.jl +++ b/src/YM/YMsf.jl @@ -92,7 +92,7 @@ end """ function setbndfield(U, phi, lp::SpaceParm) -Sets abelian boundary fields with phases `phi[1]` and `phi[2]` to the configuration `U` at time salice ``x_0=0``. +Sets abelian boundary fields with phases `phi[1]` and `phi[2]` to the configuration `U` at time slice ``x_0=0``. """ function setbndfield(U, phi, lp::SpaceParm{N,M,B,D}) where {N,M,B,D} From 6f1c8a08fd2f5fd6e56c97290d8c23cebb495d78 Mon Sep 17 00:00:00 2001 From: "Fernando P.Panadero" Date: Mon, 9 Sep 2024 15:37:42 +0200 Subject: [PATCH 07/14] Memory optimization in bfl. Fermion adaptive step. --- src/Dirac/Diracflow.jl | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/Dirac/Diracflow.jl b/src/Dirac/Diracflow.jl index 8065840..62529e2 100644 --- a/src/Dirac/Diracflow.jl +++ b/src/Dirac/Diracflow.jl @@ -108,8 +108,7 @@ function bflw_step!(psi, U, eps, int::FlowIntr, gp::GaugeParm, dpar::DiracParam @timeit "Backflow step" begin - V = copy(U) - V .= U + @timeit "GPU to CPU" V = Array(U) force_gauge(ymws, U, int.c0, 1, gp, lp) @@ -131,7 +130,7 @@ function bflw_step!(psi, U, eps, int::FlowIntr, gp::GaugeParm, dpar::DiracParam Nablanabla!(dws.sp, U, 0.75*2*eps*psi, dpar, dws, lp) - U .= V + @timeit "CPU to GPU" copyto!(U,V) force_gauge(ymws, U, int.c0, 1, gp, lp) @@ -144,7 +143,7 @@ function bflw_step!(psi, U, eps, int::FlowIntr, gp::GaugeParm, dpar::DiracParam Nablanabla!(dws.sAp, U, 2*eps*dws.sp, dpar, dws, lp) dws.sAp .= psi + (8/9)*dws.sAp - U .= V + @timeit "CPU to GPU" copyto!(U,V) Nablanabla!(psi, U, 2*eps*(dws.sAp - (8/9)*dws.sp), dpar, dws, lp) psi .= (1/4)*psi + dws.sp + dws.sAp @@ -166,8 +165,9 @@ function flw_adapt(U, psi, int::FlowIntr{NI,T}, tend::T, epsini::T, gp::GaugePar if ns > 10 flw(U, psi, int, 9, eps, gp, dpar, lp, ymws, dws) ymws.U1 .= U + dws.sr .= psi flw(U, psi, int, 1, eps, gp, dpar, lp, ymws, dws) - flw(ymws.U1, int, 2, eps/2, gp, lp, ymws) + flw(ymws.U1,dws.sr, int, 2, eps/2, gp, dpar,lp, ymws,dws) dt = dt - 10*eps nstp = nstp + 10 @@ -175,8 +175,10 @@ function flw_adapt(U, psi, int::FlowIntr{NI,T}, tend::T, epsini::T, gp::GaugePar # adjust step size ymws.U1 .= ymws.U1 ./ U + dws.sr .= dws.sr .- psi maxd = CUDA.mapreduce(dev_one, max, ymws.U1, init=zero(tend)) - eps = min(int.max_eps, 2*eps, int.sft_fac*eps*(int.tol/maxd)^(one(tend)/3)) + pfdist = sqrt(CUDA.mapreduce(norm2, max, dws.sr, init=zero(tend))) + eps = min(int.max_eps, 2*eps, int.sft_fac*eps*(int.tol/maxd)^(one(tend)/3),int.sft_fac*eps*(int.tol/pfdist)^(one(tend)/3)) else flw(U, psi, int, ns, eps, gp, dpar, lp, ymws, dws) @@ -205,7 +207,7 @@ flw_adapt(U, psi, int::FlowIntr{NI,T}, tend::T, gp::GaugeParm, dpar::DiracParam, function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) -Computes /`/` \\nabla^* \\nabla /`/` `si` and stores it in `si`. +Computes /`/` \\nabla^* \\nabla /`/` `si` and stores it in `so`. """ function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D} @@ -216,6 +218,7 @@ function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Space end return nothing end + function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D},SpaceParm{4,6,BC_OPEN,D}}) where {D} SF_bndfix!(si,lp) @timeit "Laplacian" begin @@ -238,7 +241,7 @@ function krnl_Nablanabla(so, U, si, th, lp::SpaceParm{4,6,BC_OPEN,D}) where {D} so[b,r] = -4*si[b,r] - bu1, ru1 = up((b,r), 1, lp) + bu1, ru1 = up((b,r), 1, lp) bd1, rd1 = dw((b,r), 1, lp) bu2, ru2 = up((b,r), 2, lp) bd2, rd2 = dw((b,r), 2, lp) @@ -313,7 +316,6 @@ function krnl_Nablanabla(so, U, si, th, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},Sp end - export Nablanabla!, flw, backflow, flw_adapt, bflw_step! @@ -362,7 +364,6 @@ function Dslash_sq!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceP return nothing end - function krnl_g5Dslsh!(so, U, si, th, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) @@ -393,7 +394,6 @@ function krnl_g5Dslsh!(so, U, si, th, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},Spac return nothing end - function krnl_g5Dslsh!(so, U, si, th, lp::SpaceParm{4,6,B,D}) where {D,B} b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) @@ -436,8 +436,6 @@ function krnl_g5Dslsh_impr!(so, Fcsw, csw, si, lp::SpaceParm{4,6,B,D}) where {B, return nothing end - - function krnl_g5Dslsh_impr!(so, Fcsw, csw, si, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D} @inbounds begin From c047ccf33c0605bc31e1e646a203d4a412a872ee Mon Sep 17 00:00:00 2001 From: "Fernando P.Panadero" Date: Mon, 9 Sep 2024 15:42:38 +0200 Subject: [PATCH 08/14] bfl_error function --- src/Dirac/Diracflow.jl | 21 +++++++++++++++++++++ src/LatticeGPU.jl | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/Dirac/Diracflow.jl b/src/Dirac/Diracflow.jl index 62529e2..980a12a 100644 --- a/src/Dirac/Diracflow.jl +++ b/src/Dirac/Diracflow.jl @@ -318,6 +318,27 @@ end export Nablanabla!, flw, backflow, flw_adapt, bflw_step! +""" + function bfl_error(psi_t, psi_0, U, tend, int::FlowIntr, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, ymws::YMworkspace, dws::DiracWorkspace) + +Estimates the error of the backflow integration of psi_t into psi_0 with a random noise source. +""" +function bfl_error(psi_t, psi_0, U, tend, int::FlowIntr, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, ymws::YMworkspace, dws::DiracWorkspace) + + pfrandomize!(dws.sr,lp) + @timeit "GPU to CPU" V = Array(U) + + R0 = sum(dot.(psi_0,dws.sr)) + + flw_adapt(U, dws.sr, int, tend, int.eps_ini/2, gp, dpar, lp, ymws, dws) + + R1 = sum(dot.(psi_t,dws.sr)) + @timeit "CPU to GPU" copyto!(U,V) + + return abs(R0-R1) +end + +export bfl_error """ function Dslash_sq!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D}) diff --git a/src/LatticeGPU.jl b/src/LatticeGPU.jl index 46f5ac6..0b830ee 100644 --- a/src/LatticeGPU.jl +++ b/src/LatticeGPU.jl @@ -60,7 +60,7 @@ using .Dirac export DiracWorkspace, DiracParam export Dw!, g5Dw!, DwdagDw!, SF_bndfix!, Csw!, pfrandomize!, mtwmdpar export read_prop, save_prop, read_dpar -export Nablanabla!, flw, backflow +export Nablanabla!, flw, backflow, bfl_error include("Solvers/Solvers.jl") using .Solvers From 6e944cc9d833bbb9d04f207145d619f78d03f142 Mon Sep 17 00:00:00 2001 From: "Fernando P.Panadero" Date: Wed, 11 Sep 2024 14:55:00 +0200 Subject: [PATCH 09/14] Integrator added as input in backflow --- src/Dirac/Diracflow.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/Dirac/Diracflow.jl b/src/Dirac/Diracflow.jl index 980a12a..ee4488e 100644 --- a/src/Dirac/Diracflow.jl +++ b/src/Dirac/Diracflow.jl @@ -41,13 +41,13 @@ flw(U, psi, int::FlowIntr{NI,T}, ns::Int64, gp::GaugeParm, dpar::DiracParam, lp: """ function backflow(psi, U, Dt, nsave::Int64, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, ymws::YMworkspace, dws::DiracWorkspace) -Performs one step back in flow time for the fermion field, according to 1302.5246. The fermion field must me that of the time-slice Dt and is flowed back to the first time-slice +Performs the integration of the adjoint flow for the fermion field, according to 1302.5246. The fermion field must me that of the time-slice Dt and is flowed back to the first time-slice nsave is the total number of gauge fields saved in the process """ -function backflow(psi, U, Dt, maxnsave::Int64, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, ymws::YMworkspace, dws::DiracWorkspace) +function backflow(psi, U, Dt, maxnsave::Int64, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm,int::FlowIntr, ymws::YMworkspace, dws::DiracWorkspace) - int = wfl_rk3(Float64,0.01,1.0) # Default integrator, it has to be order 3 rk but in can be zfl + # Default integrator is wfl_rk3(Float64,0.01,1.0), it has to be order 3 rk but in can be zfl @timeit "Backflow integration" begin @timeit "GPU to CPU" U0 = Array(U) @@ -98,6 +98,7 @@ function backflow(psi, U, Dt, maxnsave::Int64, gp::GaugeParm, dpar::DiracParam, return nothing end +backflow(psi, U, Dt, maxnsave::Int64, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, ymws::YMworkspace, dws::DiracWorkspace) = backflow(psi, U, Dt, maxnsave, gp, dpar, lp, wfl_rk3(Float64,0.01,1.0), ymws, dws) """ function bflw_step!(U, psi, eps, int::FlowIntr, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, ymws::YMworkspace, dws::DiracWorkspace) @@ -321,7 +322,7 @@ export Nablanabla!, flw, backflow, flw_adapt, bflw_step! """ function bfl_error(psi_t, psi_0, U, tend, int::FlowIntr, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, ymws::YMworkspace, dws::DiracWorkspace) -Estimates the error of the backflow integration of psi_t into psi_0 with a random noise source. +Estimates the error of the backflow integration of `\\psi\\_t` into `\\psi\\_0` with a random noise source. """ function bfl_error(psi_t, psi_0, U, tend, int::FlowIntr, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, ymws::YMworkspace, dws::DiracWorkspace) From 263ae073617dfc399f59b10337b949eaead4b362 Mon Sep 17 00:00:00 2001 From: "Fernando P. Panadero" Date: Wed, 20 Nov 2024 12:42:15 +0100 Subject: [PATCH 10/14] Updated enviroment --- Manifest.toml | 429 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 344 insertions(+), 85 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 043e57b..fc76e61 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -6,23 +6,35 @@ uuid = "a4c015fc-c6ff-483c-b24f-f7ea428134e9" version = "0.0.1" [[AbstractFFTs]] -deps = ["ChainRulesCore", "LinearAlgebra"] -git-tree-sha1 = "6f1d9bc1c08f9f4a8fa92e3ea3cb50153a1b40d4" +deps = ["ChainRulesCore", "LinearAlgebra", "Test"] +git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.1.0" +version = "1.5.0" + +[[AbstractTrees]] +git-tree-sha1 = "faa260e4cb5aba097a73fab382dd4b5819d8ec8c" +uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +version = "0.4.4" [[Adapt]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "af92965fb30777147966f58acb05da51c5616b5f" +deps = ["LinearAlgebra", "Requires"] +git-tree-sha1 = "cde29ddf7e5726c9fb511f340244ea3481267608" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.3.3" +version = "3.7.2" [[ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" [[Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +[[Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +version = "0.1.0" + [[BDIO]] deps = ["Documenter", "Nettle", "Test"] git-tree-sha1 = "45f43efe91dcda1939cfa5b0b028ea941afdc367" @@ -33,45 +45,97 @@ version = "0.1.0" [[BFloat16s]] deps = ["LinearAlgebra", "Printf", "Random", "Test"] -git-tree-sha1 = "a598ecb0d717092b5539dbbe890c98bac842b072" +git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66" uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" -version = "0.2.0" +version = "0.4.2" [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" [[CEnum]] -git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" +git-tree-sha1 = "389ad5c84de1ae7cf0e28e381131c98ea87d54fc" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.2" +version = "0.5.0" [[CUDA]] -deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] -git-tree-sha1 = "429a1a05348ce948a96adbdd873fbe6d9e5e052f" +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LLVMLoopInfo", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "Statistics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "76582ae19006b1186e87dadd781747f76cead72c" uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" -version = "3.6.2" +version = "5.1.1" + +[[CUDA_Driver_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"] +git-tree-sha1 = "1e42ef1bdb45487ff28de16182c0df4920181dc3" +uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc" +version = "0.7.0+0" + +[[CUDA_Runtime_Discovery]] +deps = ["Libdl"] +git-tree-sha1 = "bcc4a23cbbd99c8535a5318455dcf0f2546ec536" +uuid = "1af6417a-86b4-443c-805f-a4643ffb695f" +version = "0.2.2" + +[[CUDA_Runtime_jll]] +deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "9704e50c9158cf8896c2776b8dbc5edd136caf80" +uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" +version = "0.10.1+0" [[ChainRulesCore]] deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "9489214b993cd42d17f44c36e359bf6a7c919abf" +git-tree-sha1 = "2118cb2765f8197b08e5958cdd17c165427425ee" uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "1.15.0" +version = "1.19.0" -[[ChangesOfVariables]] -deps = ["ChainRulesCore", "LinearAlgebra", "Test"] -git-tree-sha1 = "1e315e3f4b0b7ce40feded39c73049692126cf53" -uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" -version = "0.1.3" +[[ColorTypes]] +deps = ["FixedPointNumbers", "Random"] +git-tree-sha1 = "eb7f0f8307f71fac7c606984ea5fb2817275d6e4" +uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" +version = "0.11.4" + +[[Colors]] +deps = ["ColorTypes", "FixedPointNumbers", "Reexport"] +git-tree-sha1 = "fc08e5930ee9a4e03f84bfb5211cb54e7769758a" +uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" +version = "0.12.10" [[Compat]] deps = ["Dates", "LinearAlgebra", "UUIDs"] -git-tree-sha1 = "924cdca592bc16f14d2f7006754a621735280b74" +git-tree-sha1 = "886826d76ea9e72b35fcd000e535588f7b60f21d" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "4.1.0" +version = "4.10.1" [[CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "1.0.1+0" + +[[Crayons]] +git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" +uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +version = "4.1.1" + +[[DataAPI]] +git-tree-sha1 = "8da84edb865b0b5b0100c0666a9bc9a0b71c553c" +uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +version = "1.15.0" + +[[DataFrames]] +deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] +git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8" +uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +version = "1.6.1" + +[[DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "3dbd312d370723b6bb43ba9d02fc36abade4518d" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.15" + +[[DataValueInterfaces]] +git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" +uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464" +version = "1.0.0" [[Dates]] deps = ["Printf"] @@ -79,88 +143,156 @@ uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" [[DocStringExtensions]] deps = ["LibGit2"] -git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b" +git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.6" +version = "0.9.3" [[Documenter]] -deps = ["ANSIColoredPrinters", "Base64", "Dates", "DocStringExtensions", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] -git-tree-sha1 = "f7809f532671564e48cd81627ddcfb1ba670f87d" +deps = ["ANSIColoredPrinters", "AbstractTrees", "Base64", "Dates", "DocStringExtensions", "Downloads", "Git", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "MarkdownAST", "Pkg", "PrecompileTools", "REPL", "RegistryInstances", "SHA", "Test", "Unicode"] +git-tree-sha1 = "2613dbec8f4748273bbe30ba71fd5cb369966bac" uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "0.27.19" +version = "1.2.1" [[Downloads]] deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" + +[[Expat_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "4558ab818dcceaab612d1bb8c19cee87eda2b83c" +uuid = "2e619515-83b5-522b-bb60-26c02a35a201" +version = "2.5.0+0" [[ExprTools]] -git-tree-sha1 = "56559bbef6ca5ea0c0818fa5c90320398a6fbf8d" +git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.8" +version = "0.1.10" [[FileWatching]] uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" +[[FixedPointNumbers]] +deps = ["Statistics"] +git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc" +uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" +version = "0.8.4" + +[[Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + [[GMP_jll]] deps = ["Artifacts", "Libdl"] uuid = "781609d7-10c4-51f6-84f2-b8444358ff6d" +version = "6.2.1+2" [[GPUArrays]] -deps = ["Adapt", "LLVM", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] -git-tree-sha1 = "c783e8883028bf26fb05ed4022c450ef44edd875" +deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] +git-tree-sha1 = "85d7fb51afb3def5dcb85ad31c3707795c8bccc1" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.3.2" +version = "9.1.0" + +[[GPUArraysCore]] +deps = ["Adapt"] +git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0" +uuid = "46192b85-c4d5-4398-a991-12ede77f4527" +version = "0.1.5" [[GPUCompiler]] -deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "647a54f196b5ffb7c3bc2fec5c9a57fa273354cc" +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "a846f297ce9d09ccba02ead0cae70690e072a119" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.13.14" +version = "0.25.0" + +[[Git]] +deps = ["Git_jll"] +git-tree-sha1 = "51764e6c2e84c37055e846c516e9015b4a291c7d" +uuid = "d7ba0133-e1db-5d97-8f8c-041e4b3a1eb2" +version = "1.3.0" + +[[Git_jll]] +deps = ["Artifacts", "Expat_jll", "JLLWrappers", "LibCURL_jll", "Libdl", "Libiconv_jll", "OpenSSL_jll", "PCRE2_jll", "Zlib_jll"] +git-tree-sha1 = "bb8f7cc77ec1152414b2af6db533d9471cfbb2d1" +uuid = "f8c6e375-362e-5223-8a59-34ff63f689eb" +version = "2.42.0+0" [[IOCapture]] deps = ["Logging", "Random"] -git-tree-sha1 = "f7be53659ab06ddc986428d3a9dcc95f6fa6705a" +git-tree-sha1 = "d75853a0bdbfb1ac815478bacd89cd27b550ace6" uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89" -version = "0.2.2" +version = "0.2.3" + +[[InlineStrings]] +deps = ["Parsers"] +git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461" +uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" +version = "1.4.0" [[InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -[[InverseFunctions]] -deps = ["Test"] -git-tree-sha1 = "b3364212fb5d870f724876ffcd34dd8ec6d98918" -uuid = "3587e190-3f89-42d0-90ee-14403ec27112" -version = "0.1.7" +[[InvertedIndices]] +git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" +uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" +version = "1.3.0" -[[IrrationalConstants]] -git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151" -uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" -version = "0.1.1" +[[IteratorInterfaceExtensions]] +git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" +uuid = "82899510-4779-5014-852e-03e436cf321d" +version = "1.0.0" [[JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1" +deps = ["Artifacts", "Preferences"] +git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.4.1" +version = "1.5.0" [[JSON]] deps = ["Dates", "Mmap", "Parsers", "Unicode"] -git-tree-sha1 = "3c837543ddb02250ef42f4738347454f95079d4e" +git-tree-sha1 = "31e996f0a15c7b280ba9f76636b3ff9e2ae58c9a" uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -version = "0.21.3" +version = "0.21.4" + +[[JuliaNVTXCallbacks_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f" +uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e" +version = "0.2.1+0" + +[[KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "653e0824fc9ab55b3beec67a6dbbe514a65fb954" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.9.15" [[LLVM]] -deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "e7e9184b0bf0158ac4e4aa9daf00041b5909bf1a" +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Preferences", "Printf", "Requires", "Unicode"] +git-tree-sha1 = "0678579657515e88b6632a3a482d39adcbb80445" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.14.0" +version = "6.4.1" [[LLVMExtra_jll]] -deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg", "TOML"] -git-tree-sha1 = "771bfe376249626d3ca12bcd58ba243d3f961576" +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "98eaee04d96d973e79c25d49167668c5c8fb50e2" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.16+0" +version = "0.0.27+1" + +[[LLVMLoopInfo]] +git-tree-sha1 = "2e5c102cfc41f48ae4740c7eca7743cc7e7b75ea" +uuid = "8b046642-f1f6-4319-8d3c-209ddc03c586" +version = "1.0.0" + +[[LaTeXStrings]] +git-tree-sha1 = "50901ebc375ed41dbf8058da26f9de442febbbec" +uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" +version = "1.3.1" + +[[LazilyInitializedFields]] +git-tree-sha1 = "8f7f3cabab0fd1800699663533b6d5cb3fc0e612" +uuid = "0e77f7df-68c5-4e49-93ce-4cd80f5598bf" +version = "1.2.2" [[LazyArtifacts]] deps = ["Artifacts", "Pkg"] @@ -169,10 +301,12 @@ uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" [[LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.3" [[LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "7.84.0+0" [[LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] @@ -181,42 +315,75 @@ uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" [[LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.10.2+0" [[Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" +[[Libiconv_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "f9557a255370125b405568f9767d6d195822a175" +uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" +version = "1.17.0+0" + [[LinearAlgebra]] deps = ["Libdl", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -[[LogExpFunctions]] -deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"] -git-tree-sha1 = "09e4b894ce6a976c354a69041a04748180d43637" -uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.3.15" - [[Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.11" + [[Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" +[[MarkdownAST]] +deps = ["AbstractTrees", "Markdown"] +git-tree-sha1 = "465a70f0fc7d443a00dcdc3267a497397b8a3899" +uuid = "d0879d2d-cac2-40c8-9cee-1863dc0c7391" +version = "0.1.2" + [[MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.0+0" + +[[Missings]] +deps = ["DataAPI"] +git-tree-sha1 = "f66bdc5de519e8f8ae43bdc598782d35a25b1272" +uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +version = "1.1.0" [[Mmap]] uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2022.2.1" + +[[NVTX]] +deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"] +git-tree-sha1 = "8bc9ce4233be3c63f8dcd78ccaf1b63a9c0baa34" +uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f" +version = "0.3.3" + +[[NVTX_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b" +uuid = "e98f9f5b-d649-5603-91fd-7774390e6439" +version = "3.1.0+2" [[Nettle]] deps = ["Libdl", "Nettle_jll"] -git-tree-sha1 = "a68340b9edfd98d0ed96aee8137cb716ea3b6dea" +git-tree-sha1 = "6fa48cbae828267848ee32c1bb31d1652e210d7d" uuid = "49dea1ee-f6fa-5aa6-9a11-8816cee7d4b9" -version = "0.5.1" +version = "1.0.0" [[Nettle_jll]] deps = ["Artifacts", "GMP_jll", "JLLWrappers", "Libdl", "Pkg"] @@ -226,36 +393,63 @@ version = "3.7.2+0" [[NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" [[OpenBLAS_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.20+0" -[[OpenLibm_jll]] +[[OpenSSL_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "cc6e1927ac521b659af340e0ca45828a3ffc748f" +uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95" +version = "3.0.12+0" + +[[OrderedCollections]] +git-tree-sha1 = "dfdf5519f235516220579f949664f1bf44e741c5" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.6.3" + +[[PCRE2_jll]] deps = ["Artifacts", "Libdl"] -uuid = "05823500-19ac-5b8b-9628-191a04bc5112" - -[[OpenSpecFun_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" -uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.5+0" +uuid = "efcefdf7-47ab-520b-bdef-62a2eaa19f15" +version = "10.40.0+0" [[Parsers]] -deps = ["Dates"] -git-tree-sha1 = "1285416549ccfcdf0c50d4997a94331e88d68413" +deps = ["Dates", "PrecompileTools", "UUIDs"] +git-tree-sha1 = "a935806434c9d4c506ba941871b327b96d41f2bf" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "2.3.1" +version = "2.8.0" [[Pkg]] deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.8.0" + +[[PooledArrays]] +deps = ["DataAPI", "Future"] +git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3" +uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" +version = "1.4.3" + +[[PrecompileTools]] +deps = ["Preferences"] +git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f" +uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" +version = "1.2.0" [[Preferences]] deps = ["TOML"] -git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d" +git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e" uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.3.0" +version = "1.4.1" + +[[PrettyTables]] +deps = ["Crayons", "LaTeXStrings", "Markdown", "PrecompileTools", "Printf", "Reexport", "StringManipulation", "Tables"] +git-tree-sha1 = "88b895d13d53b5577fd53379d913b9ab9ac82660" +uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" +version = "2.3.1" [[Printf]] deps = ["Unicode"] @@ -271,9 +465,9 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [[Random123]] deps = ["Random", "RandomNumbers"] -git-tree-sha1 = "afeacaecf4ed1649555a19cb2cad3c141bbc9474" +git-tree-sha1 = "552f30e847641591ba3f39fd1bed559b9deb0ef3" uuid = "74087812-796a-5b5d-8853-05524746bad3" -version = "1.5.0" +version = "1.6.1" [[RandomNumbers]] deps = ["Random", "Requires"] @@ -286,6 +480,12 @@ git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" uuid = "189a3867-3050-52da-a836-e630ba90ab69" version = "1.2.2" +[[RegistryInstances]] +deps = ["LazilyInitializedFields", "Pkg", "TOML", "Tar"] +git-tree-sha1 = "ffd19052caf598b8653b99404058fce14828be51" +uuid = "2792f1a3-b283-48e8-9a74-f99dce5104f3" +version = "0.1.0" + [[Requires]] deps = ["UUIDs"] git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" @@ -294,6 +494,19 @@ version = "1.3.0" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" + +[[Scratch]] +deps = ["Dates"] +git-tree-sha1 = "3bac05bc7e74a75fd9cba4295cde4045d9fe2386" +uuid = "6c6a2e73-6563-6170-7368-637461726353" +version = "1.2.1" + +[[SentinelArrays]] +deps = ["Dates", "Random"] +git-tree-sha1 = "0e7508ff27ba32f26cd459474ca2ede1bc10991f" +uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +version = "1.4.1" [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" @@ -301,27 +514,58 @@ uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" [[Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" +[[SortingAlgorithms]] +deps = ["DataStructures"] +git-tree-sha1 = "5165dfb9fd131cf0c6957a3a7605dede376e7b63" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "1.2.0" + [[SparseArrays]] deps = ["LinearAlgebra", "Random"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -[[SpecialFunctions]] -deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "a9e798cae4867e3a41cae2dd9eb60c047f1212db" -uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "2.1.6" +[[StaticArrays]] +deps = ["LinearAlgebra", "PrecompileTools", "Random", "StaticArraysCore", "Statistics"] +git-tree-sha1 = "fba11dbe2562eecdfcac49a05246af09ee64d055" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.8.1" + +[[StaticArraysCore]] +git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" +uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +version = "1.4.2" [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +[[StringManipulation]] +deps = ["PrecompileTools"] +git-tree-sha1 = "a04cabe79c5f01f4d723cc6704070ada0b9d46d5" +uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e" +version = "0.3.4" + [[TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.0" + +[[TableTraits]] +deps = ["IteratorInterfaceExtensions"] +git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39" +uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" +version = "1.0.1" + +[[Tables]] +deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits"] +git-tree-sha1 = "cb76cf677714c095e535e3501ac7954732aeea2d" +uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +version = "1.11.1" [[Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.1" [[Test]] deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] @@ -340,18 +584,33 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" +[[UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" + +[[UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.3" + [[Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.12+3" [[libblastrampoline_jll]] deps = ["Artifacts", "Libdl", "OpenBLAS_jll"] uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" +version = "5.1.1+0" [[nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.48.0+0" [[p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+0" From 3a251dd1b80da53e0c1d63a414b30f6d366f4876 Mon Sep 17 00:00:00 2001 From: "Fernando P. Panadero" Date: Wed, 20 Nov 2024 12:43:11 +0100 Subject: [PATCH 11/14] Updated tests --- test/dirac/test_backflow.jl | 26 ++++++++++++-------------- test/dirac/test_backflow_tl.jl | 17 ++++++++--------- test/dirac/test_flow_tl.jl | 15 ++++++++------- test/dirac/test_fp_fa.jl | 4 +++- test/dirac/test_solver_plw.jl | 10 ++++++---- test/dirac/test_solver_rand.jl | 5 ++++- test/runtests.jl | 6 +++++- 7 files changed, 46 insertions(+), 37 deletions(-) diff --git a/test/dirac/test_backflow.jl b/test/dirac/test_backflow.jl index 601c276..0700b0e 100644 --- a/test/dirac/test_backflow.jl +++ b/test/dirac/test_backflow.jl @@ -1,13 +1,8 @@ -using CUDA +using CUDA, LatticeGPU -using Pkg - -Pkg.activate("/home/fperez/Git/LGPU_fork_ferflow") - -using LatticeGPU - -lp = SpaceParm{4}((4,4,4,4),(2,2,2,2),0,(0,0,0,0,0,0)); +println(" # Consistency condition for backflow") +lp = SpaceParm{4}((16,16,16,16), (4,4,4,4), BC_PERIODIC, (0,0,0,0,0,0)) pso = scalar_field(Spinor{4,SU3fund{Float64}},lp); psi = scalar_field(Spinor{4,SU3fund{Float64}},lp); psi2 = scalar_field(Spinor{4,SU3fund{Float64}},lp); @@ -19,24 +14,27 @@ int = wfl_rk3(Float64, 0.01, 1.0) gp = GaugeParm{Float64}(SU3{Float64},6.0,1.0,(1.0,0.0),(0.0,0.0),lp.iL) -dpar = DiracParam{Float64}(SU3fund,1.3,0.9,(1.0,1.0,1.0,1.0),0.0) +dpar = DiracParam{Float64}(SU3fund,1.3,0.9,(1.0,1.0,1.0,1.0),0.0,0.0) randomize!(ymws.mom, lp, ymws) U = exp.(ymws.mom); pfrandomize!(psi,lp) -for L in 4:19 +for L in 10:20:210 pso .= psi V = Array(U) - a,b = flw_adapt(U, psi, int, L*int.eps, gp,dpar, lp, ymws,dws) + #a,b = flw_adapt(U, psi, int, L*int.eps, gp,dpar, lp, ymws,dws) + flw(U, psi, int, L,int.eps, gp,dpar, lp, ymws,dws) # for i in 1:a # flw(U, psi, int, 1 ,b[i], gp, dpar, lp, ymws, dws) # end pfrandomize!(psi2,lp) - foo = sum(dot.(psi,psi2))# field_dot(psi,psi2,sumf,lp) + foo = sum(dot.(psi,psi2)) copyto!(U,V); - backflow(psi2,U,L*int.eps,7,gp,dpar,lp, ymws,dws) - println("Error:",(sum(dot.(pso,psi2))-foo)/foo) + backflow(psi2,U,L*int.eps,20,gp,dpar,lp, ymws,dws) + println("# Consistency backflow test for t=",L*int.eps) + println("Relative error:",abs((sum(dot.(pso,psi2))-foo)/foo)) psi .= pso end + diff --git a/test/dirac/test_backflow_tl.jl b/test/dirac/test_backflow_tl.jl index 2bf0d18..9330147 100644 --- a/test/dirac/test_backflow_tl.jl +++ b/test/dirac/test_backflow_tl.jl @@ -3,13 +3,14 @@ using LatticeGPU, CUDA, TimerOutputs #Test for the relation K(t,y;0,n)^+ Dw(n|m)^{-1} e^(ipm) = D(p)^{-1} exp(4t sin^2(p/2)) e^{ipn} with a given momenta (if p=0 its randomized), spin and color #Kernel en 1207.2096 +println(" # Free fermion propagator for backflow") @timeit "Plw backflow test" begin function Dwpw_test(;p=0,s=1,c=1) lp = SpaceParm{4}((16,16,16,16), (4,4,4,4), 0, (0,0,0,0,0,0)) gp = GaugeParm{Float64}(SU3{Float64}, 6.0, 1.0) - dpar = DiracParam{Float64}(SU3fund,1.3,0.0,(1.0,1.0,1.0,1.0),0.0) + dpar = DiracParam{Float64}(SU3fund,1.3,0.0,(1.0,1.0,1.0,1.0),0.0,0.0) dws = DiracWorkspace(SU3fund,Float64,lp); ymws = YMworkspace(SU3,Float64,lp); @@ -89,9 +90,7 @@ using LatticeGPU, CUDA, TimerOutputs g5Dw!(prop,U,pwave,dpar,dws,lp) CG!(prop,U,DwdagDw!,dpar,lp,dws,10000,1.0e-14) - for _ in 1:Nsteps - backflow(U,prop,1,int.eps,gp,dpar,lp, ymws,dws) - end + backflow(prop,U,Nsteps*int.eps,20,gp,dpar,lp, ymws,dws) end @@ -103,15 +102,15 @@ using LatticeGPU, CUDA, TimerOutputs begin - dif = 0.0 + global diff = 0.0 for i in 1:3 for j in 1:4 - dif += Dwpw_test(c=i,s=j) + global diff += Dwpw_test(c=i,s=j) end end - if dif < 1.0e-5 - print("Backflow_tl test passed with average error ", dif/12,"!\n") + if diff < 1.0e-5 + print("Backflow_tl test passed with average error ", diff/12,"\n") else - error("Backflow_tl test failed with difference: ",dif,"\n") + error("Backflow_tl test failed with difference: ",diff,"\n") end diff --git a/test/dirac/test_flow_tl.jl b/test/dirac/test_flow_tl.jl index 65ed678..b73be6b 100644 --- a/test/dirac/test_flow_tl.jl +++ b/test/dirac/test_flow_tl.jl @@ -1,15 +1,16 @@ using LatticeGPU, CUDA, TimerOutputs #Test for the relation K(t,y;0,n) Dw(n|m)^{-1} e^(ipm) = D(p)^{-1} exp(-4t sin^2(p/2)) e^{ipn} with a given momenta (if p=0 its randomized), spin and color -#Kernel en 1207.2096 +#Kernel from 1207.2096 +println(" # Free fermion propagator for frontflow") @timeit "Plw flow test" begin function Dwpw_test(;p=0,s=1,c=1) lp = SpaceParm{4}((16,16,16,16), (4,4,4,4), 0, (0,0,0,0,0,0)) gp = GaugeParm{Float64}(SU3{Float64}, 6.0, 1.0) - dpar = DiracParam{Float64}(SU3fund,1.3,0.0,(1.0,1.0,1.0,1.0),0.0) + dpar = DiracParam{Float64}(SU3fund,1.3,0.0,(1.0,1.0,1.0,1.0),0.0,0.0) dws = DiracWorkspace(SU3fund,Float64,lp); ymws = YMworkspace(SU3,Float64,lp); @@ -103,15 +104,15 @@ using LatticeGPU, CUDA, TimerOutputs begin - dif = 0.0 + global diff = 0.0 for i in 1:3 for j in 1:4 - dif += Dwpw_test(c=i,s=j) + global diff += Dwpw_test(c=i,s=j) end end - if dif < 1.0e-4 - print("Flow_tl test passed with average error ", dif/12,"!\n") + if diff < 1.0e-4 + print("Flow_tl test passed with average error ", diff/12,"\n") else - error("Flow_tl test failed with difference: ",dif,"\n") + error("Flow_tl test failed with difference: ",diff,"\n") end diff --git a/test/dirac/test_fp_fa.jl b/test/dirac/test_fp_fa.jl index 5551da5..a7fa00c 100644 --- a/test/dirac/test_fp_fa.jl +++ b/test/dirac/test_fp_fa.jl @@ -2,6 +2,8 @@ using LatticeGPU using CUDA using TimerOutputs +println(" # Free solution for SF correlation functions") + @timeit "fA_fP test" begin @@ -115,7 +117,7 @@ using TimerOutputs elseif difP > 1.0e-15 error("fP test failed with error ", difP) else - print("fA & fP tests passed with errors: ", difA," and ",difP,"!\n") + print("fA & fP tests passed with errors: ", difA," and ",difP,"\n") end end diff --git a/test/dirac/test_solver_plw.jl b/test/dirac/test_solver_plw.jl index a4de0c7..7abc62a 100644 --- a/test/dirac/test_solver_plw.jl +++ b/test/dirac/test_solver_plw.jl @@ -2,6 +2,8 @@ using LatticeGPU, CUDA, TimerOutputs #Test for the relation Dw(n|m)^{-1} e^(ipm) = D(p)^{-1} e^{ipn} with a given momenta (if p=0 its randomized), spin and color +println(" # Test for free fermion propagator") + @timeit "Plw test" begin function Dwpw_test(;p=0,s=1,c=1) @@ -84,12 +86,12 @@ end dif = sum(norm2.(prop - prop_th)) -if dif > 1.0e-15 +if dif > 1.0e-7 error("Dwpl test for s=",s,", c=",c," failed with difference: ",dif,"\n") end -return dif +return sqrt(dif) end @@ -101,8 +103,8 @@ for i in 1:3 for j in 1:4 global diff += Dwpw_test(c=i,s=j) end end -if diff < 1.0e-15 - print("Dwpl test passed with average error ", diff/12,"!\n") +if diff < 1.0e-7 + print("Dwpl test passed with average error ", diff/12,"\n") else error("Dwpl test failed with difference: ",diff,"\n") end diff --git a/test/dirac/test_solver_rand.jl b/test/dirac/test_solver_rand.jl index 0714d8f..3370e67 100644 --- a/test/dirac/test_solver_rand.jl +++ b/test/dirac/test_solver_rand.jl @@ -2,6 +2,9 @@ using CUDA, LatticeGPU, TimerOutputs #Check that Dw ( (DwdagDw)^{-1} g5 Dw g5 ) psi = psi for random fields +println(" # Test for the consistency of the solver") + + @timeit "Rand solver test" begin @timeit "Generate random fields" begin @@ -46,7 +49,7 @@ res = sum(norm2.(rpsi-dws.sp)) if res < 1.0e-6 - print("Drand test passed with ",res,"% error!\n") + print("Drand test passed with ",res,"% error\n") else error("Drand test failed with difference: ",res,"\n") diff --git a/test/runtests.jl b/test/runtests.jl index 2f68f70..31fea0b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,10 @@ -include("SAD/test_sad.jl") +#include("SAD/test_sad.jl") include("flow/test_adapt.jl") include("dirac/test_fp_fa.jl") include("dirac/test_solver_plw.jl") include("dirac/test_solver_rand.jl") +include("dirac/test_flow_tl.jl") +include("dirac/test_backflow_tl.jl") +include("dirac/test_backflow.jl") + From fd58ab5aaeed76143c2463b4d621078f3a9151a2 Mon Sep 17 00:00:00 2001 From: "Fernando P.Panadero" Date: Fri, 22 Nov 2024 10:52:00 +0100 Subject: [PATCH 12/14] New test for fermion flow and some doc. --- docs/src/dirac.md | 10 ++++++++ test/dirac/test_adapt_ferm.jl | 48 +++++++++++++++++++++++++++++++++++ test/runtests.jl | 2 +- 3 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 test/dirac/test_adapt_ferm.jl diff --git a/docs/src/dirac.md b/docs/src/dirac.md index 2f7fade..d0ecc3a 100644 --- a/docs/src/dirac.md +++ b/docs/src/dirac.md @@ -20,6 +20,16 @@ The workspace stores four fermion fields, namely `.sr`, `.sp`, `.sAp` and `.st`, for different purposes. If the representation is either `SU2fund` of `SU3fund`, an extra field with values in `U2alg`/`U3alg` is created to store the clover, used for the improvement. +The functions using the fields allocated in [`DiracWorkspace`](@ref) are the following: + +- `dws.sr` : [`CG!`](@ref), [`flw_adapt`](@ref) (fermion case), [`bfl_error`](@ref) +- `dws.st` : [`DwdagDw!`](@ref), [`bflw_step_vec!`](@ref) +- `dws.sp` : [`CG!`](@ref), [`flw`](@ref) (fermion case), [`bflw_step!`](@ref), [`bflw_step_vec!`](@ref), [`propagator!`](@ref), [`bndpropagator!`](@ref), [`Tbndpropagator!`](@ref) +- `dws.sAp` : [`CG!`](@ref), [`flw`](@ref) (fermion case), [`bflw_step!`](@ref), [`bflw_step_vec!`](@ref) + +Note that other functions may call some of these functions, like [`flw_adapt`](@ref) depending on [`flw`](@ref), [`bflw!`](@ref) depending on [`bflw_step!`](@ref) or [`propagator!`](@ref) depending on [`CG!`](@ref). The fields used in the innermost function will also be modified by the outermost methods. + + ## Functions The functions [`Dw!`](@ref), [`g5Dw!`](@ref) and [`DwdagDw!`](@ref) are all related to the diff --git a/test/dirac/test_adapt_ferm.jl b/test/dirac/test_adapt_ferm.jl new file mode 100644 index 0000000..0368057 --- /dev/null +++ b/test/dirac/test_adapt_ferm.jl @@ -0,0 +1,48 @@ +using LatticeGPU, Test, CUDA + +T = Float64 +lp = SpaceParm{4}((16,16,16,16), (4,4,4,4), BC_PERIODIC, (0,0,0,0,0,0)) +gp = GaugeParm{T}(SU3{T}, 6.1, 1.0) +dpar = DiracParam{T}(SU3fund,1.3,0.9,(1.0,1.0,1.0,1.0),0.0,0.0) +ymws = YMworkspace(SU3, T, lp) +dws = DiracWorkspace(SU3fund,T,lp); + +randomize!(ymws.mom, lp, ymws) +U = exp.(ymws.mom) + +psi = scalar_field(Spinor{4,SU3fund{T}},lp); +pfrandomize!(psi,lp) + +Ucp = deepcopy(U) +psicp = deepcopy(psi) +# First Integrate very precisely up to t=2 (Wilson) +println(" # Very precise integration ") +wflw = wfl_rk3(Float64, 0.0004, 1.0E-7) +flw(U,psi, wflw, 5000, gp,dpar, lp, ymws, dws) +pl_exact = Eoft_plaq(U, gp, lp, ymws) +cl_exact = Eoft_clover(U, gp, lp, ymws) +println(" - Plaq: ", pl_exact) +println(" - Clover: ", cl_exact) +Ufin = deepcopy(U) +psifin = deepcopy(psi) + + +# Now use Adaptive step size integrator: +for tol in (1.0E-4, 1.0E-5, 1.0E-6, 1.0E-7, 1.0E-8) + local wflw = wfl_rk3(Float64, 0.0001, tol) + U .= Ucp + psi .= psicp + ns, eps = flw_adapt(U,psi, wflw, 2.0, gp,dpar,lp, ymws,dws) + pl = Eoft_plaq(U, gp, lp, ymws) + cl = Eoft_clover(U, gp, lp, ymws) + psierr = sum(norm2.((psi.-psifin)))./prod(lp.iL) + + println(" # Adaptive integrator (tol=$tol): ", ns, " steps") + U .= U ./ Ufin + maxd = CUDA.mapreduce(dev_one, max, U, init=0.0) + println(" - Plaq: ", pl," [diff: ", abs(pl-pl_exact), "; ", + maxd, "]") + println(" - Clover: ", cl, " [diff: ", abs(cl-cl_exact), "; ", + maxd, "]") + println(" - Fermion diff: ", psierr) +end diff --git a/test/runtests.jl b/test/runtests.jl index 31fea0b..6ffdddf 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,4 +7,4 @@ include("dirac/test_solver_rand.jl") include("dirac/test_flow_tl.jl") include("dirac/test_backflow_tl.jl") include("dirac/test_backflow.jl") - +include("dirac/test_adapt_ferm.jl") From 1c1a54348bbc000ca5fa1f004d57e0c134c5588a Mon Sep 17 00:00:00 2001 From: "Fernando P.Panadero" Date: Mon, 25 Nov 2024 11:44:34 +0100 Subject: [PATCH 13/14] Bug in plaq_pln --- src/YM/YMflow.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/YM/YMflow.jl b/src/YM/YMflow.jl index 645da0f..335c9d2 100644 --- a/src/YM/YMflow.jl +++ b/src/YM/YMflow.jl @@ -316,30 +316,30 @@ Eoft_plaq(U, gp::GaugeParm{T,G,NN}, lp::SpaceParm{N,M,B,D}, ymws::YMworkspace) w function krnl_plaq_pln!(plx, U::AbstractArray{T}, Ubnd, ztw, ipl, lp::SpaceParm{N,M,B,D}) where {T,N,M,B,D} - + @inbounds begin b = Int64(CUDA.threadIdx().x) r = Int64(CUDA.blockIdx().x) I = point_coord((b,r), lp) - + id1, id2 = lp.plidx[ipl] SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI)) && (id1 == N) TWP = ((I[id1]==1)&&(I[id2]==1)) - + bu1, ru1 = up((b, r), id1, lp) bu2, ru2 = up((b, r), id2, lp) - - if SFBC && (ru1 != r) + + if SFBC && (point_time((b,r),lp) == lp.iL[end]) gt = Ubnd[id2] else gt = U[bu1,id2,ru1] end - + if TWP plx[I] = ztw*tr(U[b,id1,r]*gt / (U[b,id2,r]*U[bu2,id1,ru2])) else plx[I] = tr(U[b,id1,r]*gt / (U[b,id2,r]*U[bu2,id1,ru2])) - end + end end return nothing end From c453637e88f8d825f162de05fdee5e26147d3ee3 Mon Sep 17 00:00:00 2001 From: "Fernando P. Panadero" Date: Mon, 5 May 2025 11:30:02 +0200 Subject: [PATCH 14/14] Flow for OBC similar as in oqcd. --- src/LatticeGPU.jl | 2 +- src/YM/YM.jl | 2 +- src/YM/YMact.jl | 48 +++++++++++++++++++++++++++++++++++++++++++++++ src/YM/YMflow.jl | 25 ++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 2 deletions(-) diff --git a/src/LatticeGPU.jl b/src/LatticeGPU.jl index 46f5ac6..bf3995b 100644 --- a/src/LatticeGPU.jl +++ b/src/LatticeGPU.jl @@ -40,7 +40,7 @@ include("YM/YM.jl") using .YM export ztwist export YMworkspace, GaugeParm, force0_wilson!, field, field_pln, randomize!, zero!, norm2 -export force_gauge, MD! +export force_gauge, force_gauge_flw, MD! export gauge_action, hamiltonian, plaquette, HMC!, OMF4! export Eoft_clover, Eoft_plaq, Qtop export FlowIntr, wfl_euler, zfl_euler, wfl_rk2, zfl_rk2, wfl_rk3, zfl_rk3 diff --git a/src/YM/YM.jl b/src/YM/YM.jl index 16165e4..343ca5c 100644 --- a/src/YM/YM.jl +++ b/src/YM/YM.jl @@ -165,7 +165,7 @@ include("YMfields.jl") export randomize!, zero!, norm2 include("YMact.jl") -export krnl_plaq!, force_gauge, force_wilson +export krnl_plaq!, force_gauge, force_gauge_flw, force_wilson include("YMhmc.jl") export gauge_action, hamiltonian, plaquette, HMC!, MD! diff --git a/src/YM/YMact.jl b/src/YM/YMact.jl index f9849bd..a3316e4 100644 --- a/src/YM/YMact.jl +++ b/src/YM/YMact.jl @@ -320,6 +320,22 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, return nothing end +function bnd_rescale_flw!(frc1, lp::SpaceParm{N,M,BC_OPEN,D}) where {N,M,D} + + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] + + for id in 1:N-1 + if (((it == 1) || (it == lp.iL[4]))) + frc1[b,id,r] = 2*frc1[b,id,r] + end + end + end + return nothing +end ## ## SF @@ -918,6 +934,38 @@ end force_gauge(ymws::YMworkspace, U, c0, gp, lp) = force_gauge(ymws, U, c0, gp.cG[1], gp, lp) force_gauge(ymws::YMworkspace, U, gp, lp) = force_gauge(ymws, U, gp.c0, gp.cG[1], gp, lp) +""" + function force_gauge_flw(ymws::YMworkspace, U, c0, cG, gp::GaugeParm, lp::SpaceParm{N,M,BC_OPEN,D}) + +Computes the force for the gauge flow with Open Boundaries. An aditional factor two in the boundaries +is included, see + +M. Luescher, S. Schaefer: "Lattice QCD with open boundary conditions and twisted-mass reweighting", Comput.Phys.Commun. 184 (2013) 519, + +for more details. + +""" +function force_gauge_flw(ymws::YMworkspace, U, c0, cG, gp::GaugeParm, lp::SpaceParm{N,M,BC_OPEN,D}) where {NI,N,M,D} + + ztw = ztwist(gp, lp) + if abs(c0-1) < 1.0E-10 + @timeit "Wilson gauge force" begin + force_pln!(ymws.frc1, ymws.frc2, U, gp.Ubnd, cG, ztw, lp::SpaceParm) + end + else + @timeit "Improved gauge force" begin + force_pln!(ymws.frc1, ymws.frc2, U, gp.Ubnd, cG, ztw, lp::SpaceParm, c0) + end + end + + CUDA.@sync begin + CUDA.@cuda threads=lp.bsz blocks=lp.rsz bnd_rescale_flw!(ymws.frc1,lp::SpaceParm) + end + + return nothing +end + + """ function force_wilson(ymws::YMworkspace, U, gp::GaugeParm, lp::SpaceParm) diff --git a/src/YM/YMflow.jl b/src/YM/YMflow.jl index 4f841c3..1c726b6 100644 --- a/src/YM/YMflow.jl +++ b/src/YM/YMflow.jl @@ -201,6 +201,31 @@ function flw(U, int::FlowIntr{NI,T}, ns::Int64, eps, gp::GaugeParm, lp::SpacePar end flw(U, int::FlowIntr{NI,T}, ns::Int64, gp::GaugeParm, lp::SpaceParm, ymws::YMworkspace) where {NI,T} = flw(U, int, ns, int.eps, gp, lp, ymws) +function flw(U, int::FlowIntr{NI,T}, ns::Int64, eps, gp::GaugeParm, lp::SpaceParm{N,M,BC_OPEN,D}, ymws::YMworkspace) where {NI,T,N,M,D} + @timeit "Integrating flow equations" begin + for i in 1:ns + force_gauge_flw(ymws, U, int.c0, 1, gp, lp) + if int.add_zth + add_zth_term(ymws::YMworkspace, U, lp) + end + ymws.mom .= ymws.frc1 + U .= expm.(U, ymws.mom, 2*eps*int.r) + + for k in 1:NI + force_gauge_flw(ymws, U, int.c0, 1, gp, lp) + if int.add_zth + add_zth_term(ymws::YMworkspace, U, lp) + end + ymws.mom .= int.e0[k].*ymws.mom .+ int.e1[k].*ymws.frc1 + U .= expm.(U, ymws.mom, 2*eps) + end + end + end + + return nothing +end +flw(U, int::FlowIntr{NI,T}, ns::Int64, gp::GaugeParm, lp::SpaceParm{N,M,BC_OPEN,D}, ymws::YMworkspace) where {NI,T,N,M,D} = flw(U, int, ns, int.eps, gp, lp, ymws) + ## # Adaptive step size integrators