diff --git a/src/Space/Space.jl b/src/Space/Space.jl index 1791eb5..71bc94b 100644 --- a/src/Space/Space.jl +++ b/src/Space/Space.jl @@ -14,7 +14,7 @@ module Space import Base.show -struct SpaceParm{N,M} +struct SpaceParm{N,M,D} ndim::Int64 iL::NTuple{N,Int64} npls::Int64 @@ -50,7 +50,8 @@ struct SpaceParm{N,M} end end - return new{N,M}(N, x, M, tuple(pls...), y, + D = prod(y) + return new{N,M,D}(N, x, M, tuple(pls...), y, tuple(yS...), tuple(r...), tuple(rS...), prod(y), prod(r)) end end diff --git a/src/YM/YM.jl b/src/YM/YM.jl index 72b4851..30749c4 100644 --- a/src/YM/YM.jl +++ b/src/YM/YM.jl @@ -29,6 +29,7 @@ struct YMworkspace{T} PRC frc1 frc2 + fpln mom U1 cm # complex of volume @@ -38,24 +39,26 @@ struct YMworkspace{T} if (G == SU2) GRP = SU2 ALG = SU2alg - f1 = field(SU2alg, T, lp) - f2 = field(SU2alg, T, lp) - mm = field(SU2alg, T, lp) - u1 = field(SU2, T, lp) + f1 = field(SU2alg{T}, lp) + f2 = field(SU2alg{T}, lp) + fp = field_pln(SU2alg{T}, lp) + mm = field(SU2alg{T}, lp) + u1 = field(SU2{T}, lp) end if (G == SU3) GRP = SU3 ALG = SU3alg - f1 = field(SU3alg, T, lp) - f2 = field(SU3alg, T, lp) - mm = field(SU3alg, T, lp) - u1 = field(SU3, T, lp) + f1 = field(SU3alg{T}, lp) + f2 = field(SU3alg{T}, lp) + fp = field_pln(SU3alg{T}, lp) + mm = field(SU3alg{T}, lp) + u1 = field(SU3{T}, lp) end cs = CuArray{Complex{T}, 2}(undef, lp.bsz,lp.rsz) rs = CuArray{T, 2}(undef, lp.bsz,lp.rsz) - return new{T}(GRP,ALG,T,f1, f2, mm, u1, cs, rs) + return new{T}(GRP,ALG,T,f1, f2, fp, mm, u1, cs, rs) end end export YMworkspace diff --git a/src/YM/YMact.jl b/src/YM/YMact.jl index 257e073..ad3e4b3 100644 --- a/src/YM/YMact.jl +++ b/src/YM/YMact.jl @@ -62,34 +62,65 @@ function krnl_force_wilson_pln!(frc1, frc2, U, ipl, lp::SpaceParm) return nothing end -function krnl_force_wilson_nw!(fpl, U, lp::SpaceParm) +function krnl_force_wilson_nw!(fpl, U::AbstractArray{T}, lp::SpaceParm{N,M,D}) where {T,N,M,D} - b = CUDA.threadIdx().x - ipl = mod1(CUDA.blockIdx().x, lp.npls) - r = div(CUDA.blockIdx().x-1, lp.npls)+1 + b, r = CUDA.threadIdx().x, CUDA.blockIdx().x - @inbounds begin #for ipl in 1:lp.npls + Ush = @cuStaticSharedMem(T, (D,N)) + + for id in 1:N + Ush[b,id] = U[b,id,r] + end + sync_threads() + + @inbounds for ipl in 1:lp.npls id1, id2 = lp.plidx[ipl] + bu1, ru1 = up((b, r), id1, lp) bu2, ru2 = up((b, r), id2, lp) + + if ru2 == r + gt2 = Ush[bu2,id1] + else + gt2 = U[bu2,id1,ru2] + end + if ru1 == r + gt1 = Ush[bu1,id2] + else + gt1 = U[bu1,id2,ru1] + end - g1 = U[bu1,id2,ru1]/U[bu2,id1,ru2] - g2 = U[b,id2,r]\U[b,id1,r] + g1 = gt1/gt2 + g2 = Ush[b,id2]\Ush[b,id1] - F1 = projalg(U[b,id1,r]*g1/U[b,id2,r]) - F2 = projalg(g1*g2) - F3 = projalg(g2*g1) + fpl[b ,ipl, r ,1] = projalg(Ush[b,id1]*g1/Ush[b,id2]) + fpl[bu1,ipl, ru1,2] = projalg(g1*g2) + fpl[bu2,ipl, ru2,3] = projalg(g2*g1) - fpl[b ,ipl, r ,1,1] = -F1 - fpl[b ,ipl, r ,2,1] = F1 - fpl[bu1,ipl, ru1,2,2] = -F2 - fpl[bu2,ipl, ru2,1,2] = F3 end return nothing end +function krnl_add_force_plns!(frc::AbstractArray{T}, fpl, lp::SpaceParm{N,M,D}) where {T,N,M,D} + b, r = CUDA.threadIdx().x, CUDA.blockIdx().x + + Fsh = @cuStaticSharedMem(T, (D,M)) + @inbounds for j in 1:M + Fsh[b,j] = fpl[b,j,r,1] + end + sync_threads() + + @inbounds for ipl in 1:lp.npls + id1, id2 = lp.plidx[ipl] + + frc[b,id1,r] += -Fsh[b,ipl] + fpl[b,ipl,r,3] + frc[b,id2,r] += Fsh[b,ipl] - fpl[b,ipl,r,2] + end + + return nothing +end function krnl_force_wilson!(frc, U, lp::SpaceParm) @@ -114,18 +145,6 @@ function krnl_force_wilson!(frc, U, lp::SpaceParm) return nothing end -function krnl_add_force_plns!(frc, fpl, lp::SpaceParm) - - b, r = CUDA.threadIdx().x, CUDA.blockIdx().x - @inbounds for ipl in 1:lp.npls - id1, id2 = lp.plidx[ipl] - frc[b,id1,r] += fpl[b,ipl,r,1,1] + fpl[b,ipl,r,1,2] - frc[b,id2,r] += fpl[b,ipl,r,2,1] + fpl[b,ipl,r,2,2] - end - - return nothing -end - function force0_wilson_pln!(frc1, ftmp, U, lp::SpaceParm) fill!(frc1, zero(eltype(frc1))) @@ -153,7 +172,7 @@ end function force0_wilson_nw!(frc1, fpln, U, lp::SpaceParm) CUDA.@sync begin - CUDA.@cuda threads=lp.bsz blocks=lp.rsz*lp.npls krnl_force_wilson_nw!(fpln,U,lp) + CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_force_wilson_nw!(fpln,U,lp) end fill!(frc1, zero(eltype(frc1))) CUDA.@sync begin diff --git a/src/YM/YMfields.jl b/src/YM/YMfields.jl index 221cd6a..6f5cbfa 100644 --- a/src/YM/YMfields.jl +++ b/src/YM/YMfields.jl @@ -11,27 +11,16 @@ un(t) = t <: Union{Group, Complex} -function field(::Type{G}, ::Type{T}, lp::SpaceParm) where {G <: Union{Group, Algebra}, T <: AbstractFloat} +function field(::Type{T}, lp::SpaceParm) where {T} sz = lp.bsz, lp.ndim, lp.rsz - if (G == SU2) -# As = StructArray{SU2}(undef, sz, unwrap=un) - return CuArray{SU2{T}, 3}(undef, sz) - elseif (G == SU2alg) -# As = StructArray{SU2alg}(undef, sz, unwrap=un) - return CuArray{SU2alg{T}, 3}(undef, sz) - elseif (G == SU3) -# As = StructArray{SU3}(undef, sz, unwrap=un) - return CuArray{SU3{T}, 3}(undef, sz) -# As = Array{SU3, lp.ndim+1}(undef, sz) - elseif (G == SU3alg) - # As = StructArray{SU3alg}(undef, sz, unwrap=un) - return CuArray{SU3alg{T}, 3}(undef, sz) -# As = Array{SU3alg, lp.ndim+1}(undef, sz) - end - - return replace_storage(CuArray, As) + return CuArray{T, 3}(undef, sz) +end +function field_pln(::Type{T}, lp::SpaceParm) where {T} + + sz = lp.bsz, lp.npls, lp.rsz, 3 + return CuArray{T, 4}(undef, sz) end function randomize!(f, lp::SpaceParm, ymws::YMworkspace) diff --git a/src/YM/YMhmc.jl b/src/YM/YMhmc.jl index 07f038a..1d0a77e 100644 --- a/src/YM/YMhmc.jl +++ b/src/YM/YMhmc.jl @@ -22,11 +22,9 @@ end function plaquette(U, lp::SpaceParm, gp::GaugeParm, ymws::YMworkspace) - println("Entro!") CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_plaq!(ymws.cm, U, lp) end - println("Salgo!") return CUDA.mapreduce(real, +, ymws.cm)/(prod(lp.iL)*lp.npls) end @@ -34,7 +32,7 @@ end function hamiltonian(mom, U, lp, gp, ymws) K = CUDA.mapreduce(norm2, +, mom)/2 V = gauge_action(U, lp, gp, ymws) - println("K: ", K, " V: ", V) + return K+V end @@ -66,13 +64,19 @@ function HMC!(U, eps, ns, lp::SpaceParm, gp::GaugeParm, ymws::YMworkspace; noacc return dh, acc end -function krnl_updt!(mom, frc1, frc2, eps1, U, eps2, lp::SpaceParm) +function krnl_updt!(mom::AbstractArray{TF}, frc, eps1, U::AbstractArray{TU}, eps2, lp::SpaceParm{N,M,D}) where {TU,TF, N,M,D} b, r = CUDA.threadIdx().x, CUDA.blockIdx().x + Ush = @cuStaticSharedMem(TU, D) + Fsh = @cuStaticSharedMem(TF, D) + @inbounds for id in 1:lp.ndim - mom[b,id,r] = mom[b,id,r] + eps1 * (frc1[b,id,r]+frc2[b,id,r]) - U[b,id,r] = expm(U[b,id,r], mom[b,id,r], eps2) + Ush[b] = U[b,id,r] + Fsh[b] = frc[b,id,r] + + mom[b,id,r] = mom[b,id,r] + eps1 * Fsh[b] + U[b,id,r] = expm(Ush[b], mom[b,id,r], eps2) end return nothing @@ -87,76 +91,50 @@ function OMF4!(mom, U, eps, ns, lp::SpaceParm, gp::GaugeParm{T}, ymws::YMworkspa r5::T = 0.5-r1-r3 r6::T = 1.0-2.0*(r2+r4) -# ee = eps*gp.beta/gp.ng -# @device_code_warntype force0_wilson!(ymws.frc1,ymws.frc2, U, lp, gp) -# force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp) -# zero!(ymws.frc2, lp) -# for i in 1:ns -# # STEP 1 -# CUDA.@sync begin -# CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r1*ee, U, eps*r2, lp) -# end -# -# # STEP 2 -# force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp) -# CUDA.@sync begin -# CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r3*ee, U, eps*r4, lp) -# end -# -# # STEP 3 -# force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp) -# CUDA.@sync begin -# CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r5*ee, U, eps*r6, lp) -# end -# -# # STEP 4 -# force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp) -# CUDA.@sync begin -# CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r5*ee, U, eps*r4, lp) -# end -# -# # STEP 5 -# force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp) -# CUDA.@sync begin -# CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r3*ee, U, eps*r2, lp) -# end -# -# # STEP 6 -# force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp) -# CUDA.@sync begin -# CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r1*ee, U, 0.0, lp) -# end -# end - ee = eps*gp.beta/gp.ng - force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp) + force0_wilson_nw!(ymws.frc1, ymws.fpln, U, lp) for i in 1:ns # STEP 1 +# CUDA.@sync begin +# CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1, r1*ee, U, eps*r2, lp) +# end mom .= mom .+ (r1*ee) .* ymws.frc1 U .= expm.(U, mom, eps*r2) # STEP 2 - force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp) + force0_wilson_nw!(ymws.frc1, ymws.fpln, U, lp) +# CUDA.@sync begin +# CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1, r3*ee, U, eps*r4, lp) +# end mom .= mom .+ (r3*ee) .* ymws.frc1 U .= expm.(U, mom, eps*r4) # STEP 3 - force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp) + force0_wilson_nw!(ymws.frc1, ymws.fpln, U, lp) +# CUDA.@sync begin +# CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1, r5*ee, U, eps*r6, lp) +# end mom .= mom .+ (r5*ee) .* ymws.frc1 U .= expm.(U, mom, eps*r6) # STEP 4 - force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp) + force0_wilson_nw!(ymws.frc1, ymws.fpln, U, lp) +# CUDA.@sync begin +# CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1, r5*ee, U, eps*r4, lp) +# end mom .= mom .+ (r5*ee) .* ymws.frc1 U .= expm.(U, mom, eps*r4) # STEP 5 - force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp) + force0_wilson_nw!(ymws.frc1, ymws.fpln, U, lp) +# CUDA.@sync begin +# CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1, r3*ee, U, eps*r2, lp) +# end mom .= mom .+ (r3*ee) .* ymws.frc1 U .= expm.(U, mom, eps*r2) # STEP 6 - force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp) + force0_wilson_nw!(ymws.frc1, ymws.fpln, U, lp) mom .= mom .+ (r1*ee) .* ymws.frc1 end