diff --git a/src/Dirac/Dirac.jl b/src/Dirac/Dirac.jl index c2d1381..5ee8a6a 100644 --- a/src/Dirac/Dirac.jl +++ b/src/Dirac/Dirac.jl @@ -64,7 +64,7 @@ end function krnl_Dw!(so, U, si, m0, th, lp::SpaceParm{4,6,B,D}) where {B,D} - b, r = assign_thx() + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) # For SF: # - cttilde affects mass term at x0 = a, T-a @@ -85,7 +85,7 @@ end function krnl_g5Dw!(so, U, si, m0, th, lp::SpaceParm{4,6,B,D}) where {B,D} - b, r = assign_thx() + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) @inbounds begin so[b,r] = (4+m0)*si[b,r] diff --git a/src/LatticeGPU.jl b/src/LatticeGPU.jl index cc3725f..72886b6 100644 --- a/src/LatticeGPU.jl +++ b/src/LatticeGPU.jl @@ -23,7 +23,7 @@ include("Space/Space.jl") using .Space export SpaceParm -export up, dw, updw, point_coord, point_index, point_time, assign_thx +export up, dw, updw, point_coord, point_index, point_time export BC_PERIODIC, BC_OPEN, BC_SF_AFWB, BC_SF_ORBI include("Fields/Fields.jl") diff --git a/src/Space/Space.jl b/src/Space/Space.jl index 9061c6b..f610ec7 100644 --- a/src/Space/Space.jl +++ b/src/Space/Space.jl @@ -334,9 +334,7 @@ Returns the sum of the cartesian coordinates of the point p=(b,r). end -@inline assign_thx() = convert(Int64, CUDA.threadIdx().x), convert(Int64, CUDA.blockIdx().x) - -export up, dw, updw, point_index, point_coord, point_time, assign_thx +export up, dw, updw, point_index, point_coord, point_time export BC_PERIODIC, BC_OPEN, BC_SF_AFWB, BC_SF_ORBI end diff --git a/src/YM/YMact.jl b/src/YM/YMact.jl index 03b68a4..f06e7ce 100644 --- a/src/YM/YMact.jl +++ b/src/YM/YMact.jl @@ -11,183 +11,191 @@ function krnl_impr!(plx, U::AbstractArray{T}, c0, c1, Ubnd::NTuple{NB,T}, cG, ztw, lp::SpaceParm{N,M,B,D}) where {T,NB,N,M,B,D} - b, r = assign_thx() + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) it = point_time((b, r), lp) - + Ush = @cuStaticSharedMem(T, (D,2)) - + ipl = 0 S = zero(eltype(plx)) - for id1 in N:-1:1 - bu1, ru1 = up((b, r), id1, lp) - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1==N) - Ush[b,1] = U[b,id1,r] - - for id2 = 1:id1-1 - bu2, ru2 = up((b, r), id2, lp) - Ush[b,2] = U[b,id2,r] - sync_threads() - ipl = ipl + 1 + @inbounds begin + for id1 in N:-1:1 + bu1, ru1 = up((b, r), id1, lp) + SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1==N) + Ush[b,1] = U[b,id1,r] - # H2 staple - (b1, r1) = up((b,r), id1, lp) - if r1 == r - ga = Ush[b1,1] - else - ga = U[b1,id1,r1] - end - - (b2, r2) = up((b1,r1), id1, lp) - if r2 == r - gb = Ush[b2,2] - else - if SFBC && (it == lp.iL[end]-1) - gb = Ubnd[id2] + for id2 = 1:id1-1 + bu2, ru2 = up((b, r), id2, lp) + Ush[b,2] = U[b,id2,r] + sync_threads() + ipl = ipl + 1 + + # H2 staple + (b1, r1) = up((b,r), id1, lp) + if r1 == r + ga = Ush[b1,1] else - gb = U[b2,id2,r2] + ga = U[b1,id1,r1] end - end - - (b2, r2) = up((b1,r1), id2, lp) - if r2 == r - gc = Ush[b2,1] - else - gc = U[b2,id1,r2] - end - h2 = (ga*gb)/gc - - # H3 staple - (b1, r1) = up((b,r), id2, lp) - if r1 == r - ga = Ush[b1,2] - else - ga = U[b1,id2,r1] - end - - (b2, r2) = up((b1,r1), id2, lp) - if r2 == r - gb = Ush[b2,1] - else - gb = U[b2,id1,r2] - end - - (b2, r2) = up((b1,r1), id1, lp) - if r2 == r - gc = Ush[b2,2] - else - if SFBC && (it == lp.iL[end]) - gc = Ubnd[id2] + + (b2, r2) = up((b1,r1), id1, lp) + if r2 == r + gb = Ush[b2,2] else - gc = U[b2,id2,r2] + if SFBC && (it == lp.iL[end]-1) + gb = Ubnd[id2] + else + gb = U[b2,id2,r2] + end end - end - h3 = (ga*gb)/gc - # END staples - - if ru2 == r - gb = Ush[bu2,1] - else - gb = U[bu2,id1,ru2] - end - if ru1 == r - ga = Ush[bu1,2] - else - if SFBC && (it == lp.iL[end]) - ga = Ubnd[id2] + + (b2, r2) = up((b1,r1), id2, lp) + if r2 == r + gc = Ush[b2,1] else - ga = U[bu1,id2,ru1] + gc = U[b2,id1,r2] end + h2 = (ga*gb)/gc + + # H3 staple + (b1, r1) = up((b,r), id2, lp) + if r1 == r + ga = Ush[b1,2] + else + ga = U[b1,id2,r1] + end + + (b2, r2) = up((b1,r1), id2, lp) + if r2 == r + gb = Ush[b2,1] + else + gb = U[b2,id1,r2] + end + + (b2, r2) = up((b1,r1), id1, lp) + if r2 == r + gc = Ush[b2,2] + else + if SFBC && (it == lp.iL[end]) + gc = Ubnd[id2] + else + gc = U[b2,id2,r2] + end + end + h3 = (ga*gb)/gc + # END staples + + if ru2 == r + gb = Ush[bu2,1] + else + gb = U[bu2,id1,ru2] + end + if ru1 == r + ga = Ush[bu1,2] + else + if SFBC && (it == lp.iL[end]) + ga = Ubnd[id2] + else + ga = U[bu1,id2,ru1] + end + end + + g2 = Ush[b,2]\Ush[b,1] + + if (it == lp.iL[end]) && SFBC + S += cG*(c0*tr(g2*ga/gb) + (3*c1/2)*tr(g2*ga/h3)) + elseif (it == 1) && SFBC + S += cG*(c0*tr(g2*ga/gb) + (3*c1/2)*tr(g2*ga/h3)) + c1*tr(g2*h2/gb) + else + S += ztw[ipl]*c0*tr(g2*ga/gb) + + (ztw[ipl]^2*c1)*( tr(g2*h2/gb) + tr(g2*ga/h3)) + end + end - - g2 = Ush[b,2]\Ush[b,1] - - if (it == lp.iL[end]) && SFBC - S += cG*(c0*tr(g2*ga/gb) + (3*c1/2)*tr(g2*ga/h3)) - elseif (it == 1) && SFBC - S += cG*(c0*tr(g2*ga/gb) + (3*c1/2)*tr(g2*ga/h3)) + c1*tr(g2*h2/gb) - else - S += ztw[ipl]*c0*tr(g2*ga/gb) + - (ztw[ipl]^2*c1)*( tr(g2*h2/gb) + tr(g2*ga/h3)) - end - end + + I = point_coord((b,r), lp) + plx[I] = S end - - I = point_coord((b,r), lp) - plx[I] = S - + return nothing end function krnl_plaq!(plx, U::AbstractArray{T}, Ubnd, cG, ztw, lp::SpaceParm{N,M,B,D}) where {T,N,M,B,D} + - b, r = assign_thx() - it = point_time((b, r), lp) + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + it = point_time((b, r), lp) - Ush = @cuStaticSharedMem(T, (D,2)) - - S = zero(eltype(plx)) - ipl = 0 - for id1 in N:-1:1 - bu1, ru1 = up((b, r), id1, lp) - Ush[b,1] = U[b,id1,r] + Ush = @cuStaticSharedMem(T, (D,2)) + + S = zero(eltype(plx)) + ipl = 0 + for id1 in N:-1:1 + bu1, ru1 = up((b, r), id1, lp) + Ush[b,1] = U[b,id1,r] - SFBND = ( ( (B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && - ( (it == 1) || (it == lp.iL[end])) ) && (id1 == N) + SFBND = ( ( (B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && + ( (it == 1) || (it == lp.iL[end])) ) && (id1 == N) - for id2 = 1:id1-1 - bu2, ru2 = up((b, r), id2, lp) - Ush[b,2] = U[b,id2,r] - sync_threads() - ipl = ipl + 1 - - if ru1 == r - gt1 = Ush[bu1,2] - else - if SFBND && (it == lp.iL[end]) - gt1 = Ubnd[id2] + for id2 = 1:id1-1 + bu2, ru2 = up((b, r), id2, lp) + Ush[b,2] = U[b,id2,r] + sync_threads() + ipl = ipl + 1 + + if ru1 == r + gt1 = Ush[bu1,2] else - gt1 = U[bu1,id2,ru1] + if SFBND && (it == lp.iL[end]) + gt1 = Ubnd[id2] + else + gt1 = U[bu1,id2,ru1] + end + end + if ru2 == r + gt2 = Ush[bu2,1] + else + gt2 = U[bu2,id1,ru2] + end + + if SFBND + S += cG*tr(Ush[b,1]*gt1 / (Ush[b,2]*gt2)) + else + S += ztw[ipl]*tr(Ush[b,1]*gt1 / (Ush[b,2]*gt2)) end end - if ru2 == r - gt2 = Ush[bu2,1] - else - gt2 = U[bu2,id1,ru2] - end - - if SFBND - S += cG*tr(Ush[b,1]*gt1 / (Ush[b,2]*gt2)) - else - S += ztw[ipl]*tr(Ush[b,1]*gt1 / (Ush[b,2]*gt2)) - end end - end - I = point_coord((b,r), lp) - plx[I] = S - + I = point_coord((b,r), lp) + plx[I] = S + end + return nothing end function krnl_force_wilson_pln!(frc1, frc2, U::AbstractArray{T}, Ubnd, cG, ztw, ipl, lp::SpaceParm{N,M,B,D}) where {T,N,M,B,D} - b, r = assign_thx() + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) it = point_time((b,r), lp) - + Ush = @cuStaticSharedMem(T, (D,2)) @inbounds begin id1, id2 = lp.plidx[ipl] bu1, ru1 = up((b, r), id1, lp) bu2, ru2 = up((b, r), id2, lp) - + SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == N) - + Ush[b,1] = U[b,id1,r] Ush[b,2] = U[b,id2,r] sync_threads() - + if ru2 == r gt2 = Ush[bu2,1] else @@ -205,7 +213,7 @@ function krnl_force_wilson_pln!(frc1, frc2, U::AbstractArray{T}, Ubnd, cG, ztw, g1 = gt1/gt2 g2 = Ush[b,2]\Ush[b,1] - + if SFBC && (it == 1) X = cG*projalg(ztw,Ush[b,1]*g1/Ush[b,2]) @@ -227,28 +235,29 @@ function krnl_force_wilson_pln!(frc1, frc2, U::AbstractArray{T}, Ubnd, cG, ztw, frc2[bu2,id1,ru2] += projalg(ztw,g2*g1) end end - + return nothing end function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, ztw, ipl, lp::SpaceParm{N,M,B,D}) where {T,N,M,B,D} - b, r = assign_thx() + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) it = point_time((b, r), lp) - + Ush = @cuStaticSharedMem(T, (D,2)) @inbounds begin id1, id2 = lp.plidx[ipl] bu1, ru1 = up((b, r), id1, lp) bu2, ru2 = up((b, r), id2, lp) - + SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == N) - + Ush[b,1] = U[b,id1,r] Ush[b,2] = U[b,id2,r] sync_threads() - + # H1 staple (b1, r1) = dw((b,r), id2, lp) if r1 == r @@ -324,7 +333,7 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, end end h3 = (ga*gb)/gc - + # H4 staple (b1, r1) = dw((b,r), id1, lp) if r1 == r @@ -361,7 +370,7 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, g1 = ga/gb g2 = Ush[b,2]\Ush[b,1] - + if SFBC && (it == 1) X = (cG*c0)*projalg(Ush[b,1]*g1/Ush[b,2]) + c1*projalg(Ush[b,1]*h2/(Ush[b,2]*gb)) + (3*c1*cG/2)*projalg(Ush[b,1]*ga/(Ush[b,2]*h3)) @@ -396,7 +405,7 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG, frc2[bu2,id1,ru2] += projalg(c0*ztw[ipl],g2*g1) + projalg(zsq*c1,(Ush[b,2]\h1)*g1) + projalg(zsq*c1,g2*h2/gb) + projalg(zsq*c1,h4\Ush[b,1]*g1) end - + end return nothing diff --git a/src/YM/YMfields.jl b/src/YM/YMfields.jl index 63a11a8..c6e5433 100644 --- a/src/YM/YMfields.jl +++ b/src/YM/YMfields.jl @@ -36,45 +36,56 @@ end function krnl_assign_SU3!(frc::AbstractArray{T}, m, lp::SpaceParm{N,M,BC_PERIODIC,D}) where {T,N,M,D} - b, r = assign_thx() - for id in 1:lp.ndim - frc[b,id,r] = SU3alg(m[b,id,1,r], m[b,id,2,r], m[b,id,3,r], - m[b,id,4,r], m[b,id,5,r], m[b,id,6,r], - m[b,id,7,r], m[b,id,8,r]) + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + for id in 1:lp.ndim + frc[b,id,r] = SU3alg(m[b,id,1,r], m[b,id,2,r], m[b,id,3,r], + m[b,id,4,r], m[b,id,5,r], m[b,id,6,r], + m[b,id,7,r], m[b,id,8,r]) + end end + return nothing end function krnl_assign_SU3!(frc::AbstractArray{T}, m, lp::SpaceParm{N,M,B,D}) where {T,N,M,B,D} - b, r = assign_thx() - it = point_time((b,r), lp) + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + it = point_time((b,r), lp) - if ((B==BC_SF_AFWB)||(B==BC_SF_ORBI)) - if it == 1 - for id in 1:lp.ndim-1 - frc[b,id,r] = zero(T) - end - frc[b,N,r] = SU3alg(m[b,N,1,r], m[b,N,2,r], m[b,N,3,r], - m[b,N,4,r], m[b,N,5,r], m[b,N,6,r], - m[b,N,7,r], m[b,N,8,r]) - else - for id in 1:lp.ndim - frc[b,id,r] = SU3alg(m[b,id,1,r], m[b,id,2,r], m[b,id,3,r], - m[b,id,4,r], m[b,id,5,r], m[b,id,6,r], - m[b,id,7,r], m[b,id,8,r]) + if ((B==BC_SF_AFWB)||(B==BC_SF_ORBI)) + if it == 1 + for id in 1:lp.ndim-1 + frc[b,id,r] = zero(T) + end + frc[b,N,r] = SU3alg(m[b,N,1,r], m[b,N,2,r], m[b,N,3,r], + m[b,N,4,r], m[b,N,5,r], m[b,N,6,r], + m[b,N,7,r], m[b,N,8,r]) + else + for id in 1:lp.ndim + frc[b,id,r] = SU3alg(m[b,id,1,r], m[b,id,2,r], m[b,id,3,r], + m[b,id,4,r], m[b,id,5,r], m[b,id,6,r], + m[b,id,7,r], m[b,id,8,r]) + end end end end - + return nothing end function krnl_assign_SU2!(frc, m, lp::SpaceParm{N,M,BC_PERIODIC,D}) where {N,M,D} - b, r = assign_thx() - for id in 1:lp.ndim - frc[b,id,r] = SU2alg(m[b,id,1,r], m[b,id,2,r], m[b,id,3,r]) + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + for id in 1:lp.ndim + frc[b,id,r] = SU2alg(m[b,id,1,r], m[b,id,2,r], m[b,id,3,r]) + end end + return nothing end diff --git a/src/YM/YMflow.jl b/src/YM/YMflow.jl index 58a1043..3d5aa40 100644 --- a/src/YM/YMflow.jl +++ b/src/YM/YMflow.jl @@ -89,46 +89,49 @@ end function krnl_add_zth!(frc, frc2::AbstractArray{TA}, U::AbstractArray{TG}, lp::SpaceParm{N,M,B,D}) where {TA,TG,N,M,B,D} - b, r = assign_thx() - it = point_time((b, r), lp) + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + it = point_time((b, r), lp) - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) + SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) - Ush = @cuStaticSharedMem(TG, D) - Fsh = @cuStaticSharedMem(TA, D) - - @inbounds for id in 1:N - Ush[b] = U[b,id,r] - Fsh[b] = frc[b,id,r] - sync_threads() + Ush = @cuStaticSharedMem(TG, D) + Fsh = @cuStaticSharedMem(TA, D) - bu, ru = up((b,r), id, lp) - bd, rd = dw((b,r), id, lp) - - if ru == r - X = Fsh[bu] - else - X = frc[bu,id,ru] - end - if rd == r - Y = Fsh[bd] - Ud = Ush[bd] - else - Y = frc[bd,id,rd] - Ud = U[bd,id,rd] - end - - if SFBC - if (it > 1) && (it < lp.iL[end]) - frc2[b,id,r] = (5/6)*Fsh[b] + (1/6)*(projalg(Ud\Y*Ud) + - projalg(Ush[b]*X/Ush[b])) - elseif (it == lp.iL[end]) && (id < N) - frc2[b,id,r] = (5/6)*Fsh[b] + (1/6)*(projalg(Ud\Y*Ud) + - projalg(Ush[b]*X/Ush[b])) + @inbounds for id in 1:N + Ush[b] = U[b,id,r] + Fsh[b] = frc[b,id,r] + sync_threads() + + bu, ru = up((b,r), id, lp) + bd, rd = dw((b,r), id, lp) + + if ru == r + X = Fsh[bu] + else + X = frc[bu,id,ru] + end + if rd == r + Y = Fsh[bd] + Ud = Ush[bd] + else + Y = frc[bd,id,rd] + Ud = U[bd,id,rd] + end + + if SFBC + if (it > 1) && (it < lp.iL[end]) + frc2[b,id,r] = (5/6)*Fsh[b] + (1/6)*(projalg(Ud\Y*Ud) + + projalg(Ush[b]*X/Ush[b])) + elseif (it == lp.iL[end]) && (id < N) + frc2[b,id,r] = (5/6)*Fsh[b] + (1/6)*(projalg(Ud\Y*Ud) + + projalg(Ush[b]*X/Ush[b])) + end + else + frc2[b,id,r] = (5/6)*Fsh[b] + (1/6)*(projalg(Ud\Y*Ud) + + projalg(Ush[b]*X/Ush[b])) end - else - frc2[b,id,r] = (5/6)*Fsh[b] + (1/6)*(projalg(Ud\Y*Ud) + - projalg(Ush[b]*X/Ush[b])) end end @@ -261,23 +264,26 @@ Eoft_plaq(U, gp::GaugeParm{T,G,NN}, lp::SpaceParm{N,M,B,D}, ymws::YMworkspace) w function krnl_plaq_pln!(plx, U::AbstractArray{T}, Ubnd, ztw, ipl, lp::SpaceParm{N,M,B,D}) where {T,N,M,B,D} - b, r = assign_thx() + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) - id1, id2 = lp.plidx[ipl] - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI)) && (id1 == lp.iL[end]) + id1, id2 = lp.plidx[ipl] + SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI)) && (id1 == lp.iL[end]) - bu1, ru1 = up((b, r), id1, lp) - bu2, ru2 = up((b, r), id2, lp) - - if SFBC && (ru1 != r) - gt = Ubnd[id2] - else - gt = U[bu1,id2,ru1] + bu1, ru1 = up((b, r), id1, lp) + bu2, ru2 = up((b, r), id2, lp) + + if SFBC && (ru1 != r) + gt = Ubnd[id2] + else + gt = U[bu1,id2,ru1] + end + + I = point_coord((b,r), lp) + plx[I] = ztw*tr(U[b,id1,r]*gt / (U[b,id2,r]*U[bu2,id1,ru2])) end - - I = point_coord((b,r), lp) - plx[I] = ztw*tr(U[b,id1,r]*gt / (U[b,id2,r]*U[bu2,id1,ru2])) - + return nothing end @@ -387,116 +393,126 @@ Eoft_clover(U, gp::GaugeParm, lp::SpaceParm{N,M,B,D}, ymws::YMworkspace{T}) wher function krnl_add_et!(rm, op, frc1, U, lp::SpaceParm{4,M,B,D}) where {M,B,D} - b, r = assign_thx() - - X1 = (frc1[b,1,r]+frc1[b,2,r]+frc1[b,3,r]+frc1[b,4,r]) - - I = point_coord((b,r), lp) - rm[I] = dot(X1,X1) + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + X1 = (frc1[b,1,r]+frc1[b,2,r]+frc1[b,3,r]+frc1[b,4,r]) + + I = point_coord((b,r), lp) + rm[I] = dot(X1,X1) + end + return nothing end function krnl_add_qd!(rm, op, frc1, frc2, U, lp::SpaceParm{4,M,B,D}) where {M,B,D} - b, r = assign_thx() + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + + I = point_coord((b,r), lp) + rm[I] += op(dot( (frc1[b,1,r]+frc1[b,2,r]+frc1[b,3,r]+frc1[b,4,r]), + (frc2[b,1,r]+frc2[b,2,r]+frc2[b,3,r]+frc2[b,4,r]) ) ) + end - I = point_coord((b,r), lp) - rm[I] += op(dot( (frc1[b,1,r]+frc1[b,2,r]+frc1[b,3,r]+frc1[b,4,r]), - (frc2[b,1,r]+frc2[b,2,r]+frc2[b,3,r]+frc2[b,4,r]) ) ) return nothing end function krnl_field_tensor!(frc1::AbstractArray{TA}, frc2, U::AbstractArray{T}, Ubnd, ipl1, ipl2, ztw1, ztw2, lp::SpaceParm{4,M,B,D}) where {TA,T,M,B,D} - b, r = assign_thx() - it = point_time((b,r), lp) - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) + @inbounds begin + b = Int64(CUDA.threadIdx().x) + r = Int64(CUDA.blockIdx().x) + it = point_time((b,r), lp) + SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) - Ush = @cuStaticSharedMem(T, (D,2)) + Ush = @cuStaticSharedMem(T, (D,2)) - #First plane - id1, id2 = lp.plidx[ipl1] - Ush[b,1] = U[b,id1,r] - Ush[b,2] = U[b,id2,r] - sync_threads() + #First plane + id1, id2 = lp.plidx[ipl1] + Ush[b,1] = U[b,id1,r] + Ush[b,2] = U[b,id2,r] + sync_threads() - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == 4) + SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == 4) - bu1, ru1 = up((b, r), id1, lp) - bu2, ru2 = up((b, r), id2, lp) - bd, rd = up((bu1, ru1), id2, lp) - if ru1 == r - gt1 = Ush[bu1,2] - else - if SFBC && (it == lp.iL[end]) - gt1 = Ubnd[id2] + bu1, ru1 = up((b, r), id1, lp) + bu2, ru2 = up((b, r), id2, lp) + bd, rd = up((bu1, ru1), id2, lp) + if ru1 == r + gt1 = Ush[bu1,2] else - gt1 = U[bu1,id2,ru1] + if SFBC && (it == lp.iL[end]) + gt1 = Ubnd[id2] + else + gt1 = U[bu1,id2,ru1] + end + end + if ru2 == r + gt2 = Ush[bu2,1] + else + gt2 = U[bu2,id1,ru2] + end + + l1 = gt1/gt2 + l2 = Ush[b,2]\Ush[b,1] + + if SFBC && (it == lp.iL[end]) + frc1[b,1,r] = projalg(Ush[b,1]*l1/Ush[b,2]) + frc1[bu1,2,ru1] = zero(TA) + frc1[bd,3,rd] = zero(TA) + frc1[bu2,4,ru2] = projalg(l2*l1) + else + frc1[b,1,r] = projalg(ztw1, Ush[b,1]*l1/Ush[b,2]) + frc1[bu1,2,ru1] = projalg(ztw1, l1*l2) + frc1[bd,3,rd] = projalg(ztw1, gt2\(l2*gt1)) + frc1[bu2,4,ru2] = projalg(ztw1, l2*l1) + end + + # Second plane + id1, id2 = lp.plidx[ipl2] + Ush[b,1] = U[b,id1,r] + Ush[b,2] = U[b,id2,r] + sync_threads() + + SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == 4) + + bu1, ru1 = up((b, r), id1, lp) + bu2, ru2 = up((b, r), id2, lp) + bd, rd = up((bu1, ru1), id2, lp) + if ru1 == r + gt1 = Ush[bu1,2] + else + if SFBC && (it == lp.iL[end]) + gt1 = Ubnd[id2] + else + gt1 = U[bu1,id2,ru1] + end + end + if ru2 == r + gt2 = Ush[bu2,1] + else + gt2 = U[bu2,id1,ru2] + end + + l1 = gt1/gt2 + l2 = Ush[b,2]\Ush[b,1] + + if SFBC && (it == lp.iL[end]) + frc2[b,1,r] = projalg(Ush[b,1]*l1/Ush[b,2]) + frc2[bu1,2,ru1] = zero(TA) + frc2[bd,3,rd] = zero(TA) + frc2[bu2,4,ru2] = projalg(l2*l1) + else + frc2[b,1,r] = projalg(ztw2, Ush[b,1]*l1/Ush[b,2]) + frc2[bu1,2,ru1] = projalg(ztw2, l1*l2) + frc2[bd,3,rd] = projalg(ztw2, gt2\(l2*gt1)) + frc2[bu2,4,ru2] = projalg(ztw2, l2*l1) end end - if ru2 == r - gt2 = Ush[bu2,1] - else - gt2 = U[bu2,id1,ru2] - end - - l1 = gt1/gt2 - l2 = Ush[b,2]\Ush[b,1] - - if SFBC && (it == lp.iL[end]) - frc1[b,1,r] = projalg(Ush[b,1]*l1/Ush[b,2]) - frc1[bu1,2,ru1] = zero(TA) - frc1[bd,3,rd] = zero(TA) - frc1[bu2,4,ru2] = projalg(l2*l1) - else - frc1[b,1,r] = projalg(ztw1, Ush[b,1]*l1/Ush[b,2]) - frc1[bu1,2,ru1] = projalg(ztw1, l1*l2) - frc1[bd,3,rd] = projalg(ztw1, gt2\(l2*gt1)) - frc1[bu2,4,ru2] = projalg(ztw1, l2*l1) - end - - # Second plane - id1, id2 = lp.plidx[ipl2] - Ush[b,1] = U[b,id1,r] - Ush[b,2] = U[b,id2,r] - sync_threads() - - SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI) ) && (id1 == 4) - - bu1, ru1 = up((b, r), id1, lp) - bu2, ru2 = up((b, r), id2, lp) - bd, rd = up((bu1, ru1), id2, lp) - if ru1 == r - gt1 = Ush[bu1,2] - else - if SFBC && (it == lp.iL[end]) - gt1 = Ubnd[id2] - else - gt1 = U[bu1,id2,ru1] - end - end - if ru2 == r - gt2 = Ush[bu2,1] - else - gt2 = U[bu2,id1,ru2] - end - - l1 = gt1/gt2 - l2 = Ush[b,2]\Ush[b,1] - - if SFBC && (it == lp.iL[end]) - frc2[b,1,r] = projalg(Ush[b,1]*l1/Ush[b,2]) - frc2[bu1,2,ru1] = zero(TA) - frc2[bd,3,rd] = zero(TA) - frc2[bu2,4,ru2] = projalg(l2*l1) - else - frc2[b,1,r] = projalg(ztw2, Ush[b,1]*l1/Ush[b,2]) - frc2[bu1,2,ru1] = projalg(ztw2, l1*l2) - frc2[bd,3,rd] = projalg(ztw2, gt2\(l2*gt1)) - frc2[bu2,4,ru2] = projalg(ztw2, l2*l1) - end - + return nothing end diff --git a/src/YM/YMsf.jl b/src/YM/YMsf.jl index 2a3e3c5..f882536 100644 --- a/src/YM/YMsf.jl +++ b/src/YM/YMsf.jl @@ -44,35 +44,37 @@ end function krnl_sfcoupling!(rm, U::AbstractArray{T}, Ubnd, lp::SpaceParm{N,M,B,D}) where {T,N,M,B,D} - b, r = assign_thx() - I = point_coord((b,r), lp) - it = I[N] + @inbounds begin + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + I = point_coord((b,r), lp) + it = I[N] - SR3::eltype(rm) = 1.73205080756887729352744634151 - SR3x2::eltype(rm) = 3.46410161513775458705489268302 - - if (it == 1) - but, rut = up((b,r), N, lp) - IU = point_coord((but,rut), lp) - for id in 1:N-1 - bu, ru = up((b,r), id, lp) + SR3::eltype(rm) = 1.73205080756887729352744634151 + SR3x2::eltype(rm) = 3.46410161513775458705489268302 + + if (it == 1) + but, rut = up((b,r), N, lp) + IU = point_coord((but,rut), lp) + for id in 1:N-1 + bu, ru = up((b,r), id, lp) - X = projalg(U[b,id,r]*U[bu,N,ru]/(U[b,N,r]*U[but,id,rut])) - rm[I] += (3*X.t7 + SR3 * X.t8)/lp.iL[id] - rm[IU] += (2*X.t7 - SR3x2 * X.t8)/lp.iL[id] - end - elseif (it == lp.iL[end]) - bdt, rdt = dw((b,r), N, lp) - ID = point_coord((bdt,rdt), lp) - for id in 1:N-1 - bu, ru = up((b,r), id, lp) + X = projalg(U[b,id,r]*U[bu,N,ru]/(U[b,N,r]*U[but,id,rut])) + rm[I] += (3*X.t7 + SR3 * X.t8)/lp.iL[id] + rm[IU] += (2*X.t7 - SR3x2 * X.t8)/lp.iL[id] + end + elseif (it == lp.iL[end]) + bdt, rdt = dw((b,r), N, lp) + ID = point_coord((bdt,rdt), lp) + for id in 1:N-1 + bu, ru = up((b,r), id, lp) - X = projalg(Ubnd[id]/(U[b,id,r]*U[bu,N,ru])*U[b,N,r]) - rm[I] -= (3*X.t7 + SR3 * X.t8)/lp.iL[id] - rm[ID] += (2*X.t7 - SR3x2 * X.t8)/lp.iL[id] + X = projalg(Ubnd[id]/(U[b,id,r]*U[bu,N,ru])*U[b,N,r]) + rm[I] -= (3*X.t7 + SR3 * X.t8)/lp.iL[id] + rm[ID] += (2*X.t7 - SR3x2 * X.t8)/lp.iL[id] + end end end - + return nothing end @@ -99,16 +101,18 @@ end function krnl_setbnd_it0!(U, phi1, phi2, lp::SpaceParm{N,M,B,D}) where {N,M,B,D} - b, r = assign_thx() - it = point_time((b,r), lp) + @inbounds begin + b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x) + it = point_time((b,r), lp) - SFBC = (B == BC_SF_AFWB) || (B == BC_SF_ORBI) - - if (it == 0) && SFBC - for id in 1:N-1 - U[b,id,r] = bndfield(phi1,phi2,lp.iL[id]) + SFBC = (B == BC_SF_AFWB) || (B == BC_SF_ORBI) + + if (it == 0) && SFBC + for id in 1:N-1 + U[b,id,r] = bndfield(phi1,phi2,lp.iL[id]) + end end end - + return nothing end