diff --git a/src/Groups/GroupSU2.jl b/src/Groups/GroupSU2.jl
index 7c678a2..9ace061 100644
--- a/src/Groups/GroupSU2.jl
+++ b/src/Groups/GroupSU2.jl
@@ -68,6 +68,14 @@
 Base.:*(a::SU2alg,b::Number) = SU2alg(a.t1*b,a.t2*b,a.t3*b)
 Base.:*(b::Number,a::SU2alg) = SU2alg(a.t1*b,a.t2*b,a.t3*b)
 Base.:/(a::SU2alg,b::Number) = SU2alg(a.t1/b,a.t2/b,a.t3/b)
+function isgroup(a::SU2)
+    tol = 1.0E-10
+    if (abs(abs2(a.t1) + abs2(a.t2) - 1.0) < tol)
+        return true
+    else
+        return false
+    end
+end
 
 """
     function Base.exp(a::SU2alg, t::Number=1)
@@ -155,3 +163,5 @@ function expm(g::SU2, a::SU2alg, t::Float64)
     return SU2(t1,t2)
 end
+
+export SU2, SU2alg, inverse, dag, tr, projalg, expm, exp, norm, norm2, isgroup
diff --git a/src/Groups/GroupSU3.jl b/src/Groups/GroupSU3.jl
index f90f34f..32aa631 100644
--- a/src/Groups/GroupSU3.jl
+++ b/src/Groups/GroupSU3.jl
@@ -79,8 +79,21 @@ function Base.:\(a::SU3,b::SU3)
 end
 
+function isgroup(a::SU3)
-
+    tol = 1.0E-10
+    g = a/a
+    if ( (abs(g.u11 - 1.0) < tol) &&
+         (abs(g.u12) < tol) &&
+         (abs(g.u13) < tol) &&
+         (abs(g.u21) < tol) &&
+         (abs(g.u22 - 1.0) < tol) &&
+         (abs(g.u23) < tol) )
+        return true
+    else
+        return false
+    end
+end
 
 struct SU3alg <: Algebra
     t1::Float64
@@ -122,7 +135,7 @@ Base.:/(a::SU3alg,b::Number) = SU3alg(a.t1/b,a.t2/b,a.t3/b,a.t4/b,a.t5/b,a.t6/b,
 
-export SU3, SU3alg, inverse, dag, tr, projalg, expm, exp, norm, norm2
+export SU3, SU3alg, inverse, dag, tr, projalg, expm, exp, norm, norm2, isgroup
 
 struct M3x3
     u11::ComplexF64
diff --git a/src/Groups/Groups.jl b/src/Groups/Groups.jl
index 8e4ed9a..3bcb330 100644
--- a/src/Groups/Groups.jl
+++ b/src/Groups/Groups.jl
@@ -23,7 +23,7 @@ export SU2, SU2alg
 include("GroupSU3.jl")
 export SU3, SU3alg
 
-export dot, expm, exp, dag, normalize, inverse, tr, projalg, norm, norm2
+export dot, expm, exp, dag, normalize, inverse, tr, projalg, norm, norm2, isgroup
 
 end # module
diff --git a/src/LatticeGPU.jl b/src/LatticeGPU.jl
index 0ea52ea..67553e9 100644
--- a/src/LatticeGPU.jl
+++ b/src/LatticeGPU.jl
@@ -17,19 +17,19 @@ include("Groups/Groups.jl")
 using .Groups
 export Group, Algebra
 export SU2, SU2alg, SU3, SU3alg
-export dot, expm, exp, dag, normalize, inverse, tr, projalg, norm, norm2
+export dot, expm, exp, dag, normalize, inverse, tr, projalg, norm, norm2, isgroup
 
 include("Space/Space.jl")
 using .Space
-export SpaceParm, KernelParm
-export map2latt, up, dw, shift
+export SpaceParm
+export up, dw, updw
 
 include("YM/YM.jl")
 using .YM
-export YMworkspace, GaugeParm, force0_wilson!, field, randomn!, zero!, norm2
+export YMworkspace, GaugeParm, force0_wilson!, field, field_pln, randomn!, zero!, norm2
 export gauge_action, hamiltonian, plaquette, HMC!, OMF4!
 
 end # module
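Editor's note on the new `isgroup` checks (a sketch, not part of the patch): the SU2 version tests that |t1|² + |t2|² is 1 within `tol`, while the SU3 version builds `g = a/a` and compares it entry by entry against the identity, so both act as cheap unitarity checks. A minimal usage sketch, assuming only the constructors already used elsewhere in this patch (e.g. `SU2(1.0, 0.0)`):

    # Hypothetical REPL usage; SU2(t1, t2) as defined in GroupSU2.jl.
    using LatticeGPU
    isgroup(SU2(complex(1.0), complex(0.0)))       # identity          -> true
    isgroup(SU2(complex(0.6), complex(0.0, 0.8)))  # |t1|^2+|t2|^2 = 1 -> true
    isgroup(SU2(complex(2.0), complex(0.0)))       # norm 4, not SU(2) -> false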
diff --git a/src/Space/Space.jl b/src/Space/Space.jl
index 7da0e1b..9efc560 100644
--- a/src/Space/Space.jl
+++ b/src/Space/Space.jl
@@ -16,11 +16,20 @@ struct SpaceParm{N,M}
     ndim::Int64
     iL::NTuple{N,Int64}
     npls::Int64
-    plidx::NTuple{M, Tuple{Int64, Int64}}
+    plidx::NTuple{M,Tuple{Int64, Int64}}
 
-    function SpaceParm{N}(x) where {N}
+    blk::NTuple{N,Int64}
+    blkS::NTuple{N,Int64}
+    rbk::NTuple{N,Int64}
+    rbkS::NTuple{N,Int64}
+
+    bsz::Int64
+    rsz::Int64
+
+    function SpaceParm{N}(x, y) where {N}
         M = convert(Int64, round(N*(N-1)/2))
-        N == length(x) || throw(ArgumentError("Tuple of incorrect length for dimension $N"))
+        N == length(x) || throw(ArgumentError("Lattice size incorrect length for dimension $N"))
+        N == length(y) || throw(ArgumentError("Block size incorrect length for dimension $N"))
 
         pls = Vector{Tuple{Int64, Int64}}()
         for i in 1:N
@@ -28,120 +37,100 @@ struct SpaceParm{N,M}
                 push!(pls, (i,j))
             end
         end
-        return new{N,M}(N, x, M, tuple(pls...))
+
+        r = div.(x, y)
+        rS = ones(N)
+        yS = ones(N)
+        for i in 2:N
+            for j in 1:i-1
+                rS[i] = rS[i]*r[j]
+                yS[i] = yS[i]*y[j]
+            end
+        end
+
+        return new{N,M}(N, x, M, tuple(pls...), y,
+                        tuple(yS...), tuple(r...), tuple(rS...), prod(y), prod(r))
     end
 end
 export SpaceParm
 
-struct KernelParm
-    threads::Tuple{Int64,Int64,Int64}
-    blocks::Tuple{Int64,Int64,Int64}
-end
-export KernelParm
+@inline function up(p::NTuple{2,Int64}, id::Int64, lp::SpaceParm)
 
-@inline shift(p::CartesianIndex{4}, sh::CartesianIndex{4}, lp::SpaceParm{4}) = CartesianIndex(mod1(p[1]+sh[1], lp.iL[1]),
-                                                                                              mod1(p[2]+sh[2], lp.iL[2]),
-                                                                                              mod1(p[3]+sh[3], lp.iL[3]),
-                                                                                              mod1(p[4]+sh[4], lp.iL[4]))
-@inline shift(p::CartesianIndex{3}, sh::CartesianIndex{3}, lp::SpaceParm{3}) = CartesianIndex(mod1(p[1]+sh[1], lp.iL[1]),
-                                                                                              mod1(p[2]+sh[2], lp.iL[2]),
-                                                                                              mod1(p[3]+sh[3], lp.iL[3]))
-@inline shift(p::CartesianIndex{2}, sh::CartesianIndex{2}, lp::SpaceParm{2}) = CartesianIndex(mod1(p[1]+sh[1], lp.iL[1]),
-                                                                                              mod1(p[2]+sh[2], lp.iL[2]))
-@inline function dw(p::CartesianIndex{4}, id, lp::SpaceParm{4})
+    ic = mod(div(p[1]-1,lp.blkS[id]),lp.blk[id])
+    if (ic == lp.blk[id]-1)
+        b = p[1] - (lp.blk[id]-1)*lp.blkS[id]
 
-    if (id == 1)
-        s = CartesianIndex(mod1(p[1]-1, lp.iL[1]), p[2], p[3], p[4])
-    elseif (id == 2)
-        s = CartesianIndex(p[1], mod1(p[2]-1, lp.iL[2]), p[3], p[4])
-    elseif (id == 3)
-        s = CartesianIndex(p[1], p[2], mod1(p[3]-1, lp.iL[3]), p[4])
-    elseif (id == 4)
-        s = CartesianIndex(p[1], p[2], p[3], mod1(p[4]-1, lp.iL[4]))
-    end
-
-    return s
-end
-
-@inline function dw(p::CartesianIndex{3}, id, lp::SpaceParm{3})
-
-    if (id == 1)
-        s = CartesianIndex(mod1(p[1]-1, lp.iL[1]), p[2], p[3])
-    elseif (id == 2)
-        s = CartesianIndex(p[1], mod1(p[2]-1, lp.iL[2]), p[3])
-    elseif (id == 3)
-        s = CartesianIndex(p[1], p[2], mod1(p[3]-1, lp.iL[3]))
-    end
-
-    return s
-end
-
-@inline function dw(p::CartesianIndex{2}, id, lp::SpaceParm{2})
-
-    if (id == 1)
-        s = CartesianIndex(mod1(p[1]-1, lp.iL[1]), p[2])
-    elseif (id == 2)
-        s = CartesianIndex(p[1], mod1(p[2]-1, lp.iL[2]))
-    end
-
-    return s
-end
-
-@inline function up(p::CartesianIndex{4}, id::Int64, lp::SpaceParm{4})
-
-    if (id == 1)
-        s = CartesianIndex(mod1(p[1]+1, lp.iL[1]), p[2], p[3], p[4])
-    elseif (id == 2)
-        s = CartesianIndex(p[1], mod1(p[2]+1, lp.iL[2]), p[3], p[4])
-    elseif (id == 3)
-        s = CartesianIndex(p[1], p[2], mod1(p[3]+1, lp.iL[3]), p[4])
-    elseif (id == 4)
-        s = CartesianIndex(p[1], p[2], p[3], mod1(p[4]+1, lp.iL[4]))
+        ic = mod(div(p[2]-1,lp.rbkS[id]),lp.rbk[id])
+        if (ic == lp.rbk[id]-1)
+            r = p[2] - (lp.rbk[id]-1)*lp.rbkS[id]
+        else
+            r = p[2] + lp.rbkS[id]
+        end
+    else
+        b = p[1] + lp.blkS[id]
+        r = p[2]
     end
-    return s
+
+    return b, r
 end
 
-@inline function up(p::CartesianIndex{3}, id::Int64, lp::SpaceParm{3})
+@inline function dw(p::NTuple{2,Int64}, id::Int64, lp::SpaceParm)
 
-    if (id == 1)
-        s = CartesianIndex(mod1(p[1]+1, lp.iL[1]), p[2], p[3])
-    elseif (id == 2)
-        s = CartesianIndex(p[1], mod1(p[2]+1, lp.iL[2]), p[3])
-    elseif (id == 3)
-        s = CartesianIndex(p[1], p[2], mod1(p[3]+1, lp.iL[3]))
+    ic = mod(div(p[1]-1,lp.blkS[id]),lp.blk[id])
+    if (ic == 0)
+        b = p[1] + (lp.blk[id]-1)*lp.blkS[id]
+
+        ic = mod(div(p[2]-1,lp.rbkS[id]),lp.rbk[id])
+        if (ic == 0)
+            r = p[2] + (lp.rbk[id]-1)*lp.rbkS[id]
+        else
+            r = p[2] - lp.rbkS[id]
+        end
+    else
+        b = p[1] - lp.blkS[id]
+        r = p[2]
     end
-    return s
+
+    return b, r
 end
 
-@inline function up(p::CartesianIndex{2}, id::Int64, lp::SpaceParm{2})
+@inline function updw(p::NTuple{2,Int64}, id::Int64, lp::SpaceParm)
 
-    if (id == 1)
-        s = CartesianIndex(mod1(p[1]+1, lp.iL[1]), p[2])
-    elseif (id == 2)
-        s = CartesianIndex(p[1], mod1(p[2]+1, lp.iL[2]))
+    ic = mod(div(p[1]-1,lp.blkS[id]),lp.blk[id])
+    if (ic == lp.blk[id]-1)
+        bu = p[1] - (lp.blk[id]-1)*lp.blkS[id]
+        bd = p[1] - lp.blkS[id]
+
+        ic = mod(div(p[2]-1,lp.rbkS[id]),lp.rbk[id])
+        if (ic == lp.rbk[id]-1)
+            ru = p[2] - (lp.rbk[id]-1)*lp.rbkS[id]
+        else
+            ru = p[2] + lp.rbkS[id]
+        end
+        rd = p[2]
+    elseif (ic == 0)
+        bu = p[1] + lp.blkS[id]
+        bd = p[1] + (lp.blk[id]-1)*lp.blkS[id]
+
+        ic = mod(div(p[2]-1,lp.rbkS[id]),lp.rbk[id])
+        if (ic == 0)
+            rd = p[2] + (lp.rbk[id]-1)*lp.rbkS[id]
+        else
+            rd = p[2] - lp.rbkS[id]
+        end
+        ru = p[2]
+    else
+        bu = p[1] + lp.blkS[id]
+        bd = p[1] - lp.blkS[id]
+        rd = p[2]
+        ru = p[2]
     end
-    return s
+
+    return bu, ru, bd, rd
 end
 
-@inline map2latt(th::NTuple{3,Int64},bl::NTuple{3,Int64}) = CartesianIndex(th[1],bl[3],bl[1],bl[2])
-
-
-
-@inline function map2latt(th::NTuple{3,Int64},bl::NTuple{3,Int64}, lp)
-
-    i1 = mod1(th[1], lp.lblock[1]) + mod(bl[1], div(lp.iL[1],lp.lblock[1]))*lp.lblock[1]
-    i2 = div(th[1]-1, lp.lblock[1]) + 1 + div(bl[1]-1, div(lp.iL[1],lp.lblock[1]))*lp.lblock[2]
-    i3 = (bl[2] - 1) * lp.lblock[3] + th[2]
-    i4 = (bl[3] - 1) * lp.lblock[4] + th[3]
-
-
-    return CartesianIndex(i1,i2,i3,i4)
-end
-
-
-export map2latt, up, dw, shift
+export up, dw, updw
 
 end
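The new `SpaceParm` stores a blocked layout: along each dimension `id` the `iL[id]` points are split into `rbk[id] = iL[id] ÷ blk[id]` blocks of `blk[id]` points, and a site is addressed by the pair `(b, r)`, with `b ∈ 1:bsz` labelling the point inside a block and `r ∈ 1:rsz` labelling the block (these are what `threadIdx`/`blockIdx` map to in the kernels below). An editor's sketch of the index arithmetic, not part of the patch:

    # Recover the global 0-based coordinates of the site labelled (b, r).
    function global_coord(b::Int64, r::Int64, lp::SpaceParm{N,M}) where {N,M}
        c = ntuple(id -> mod(div(b-1, lp.blkS[id]), lp.blk[id]), N)   # in-block coords
        C = ntuple(id -> mod(div(r-1, lp.rbkS[id]), lp.rbk[id]), N)   # block coords
        return ntuple(id -> c[id] + lp.blk[id]*C[id], N)
    end

    # up((b,r), id, lp) returns the labels of the site whose id-th coordinate is
    # larger by one, modulo lp.iL[id]; dw is the same with -1, and updw returns both.
    # Example: lp = SpaceParm{4}((8,8,8,8), (4,4,4,4)); bu, ru = up((1,1), 1, lp)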
diff --git a/src/YM/YM.jl b/src/YM/YM.jl
index 566f997..960dd7a 100644
--- a/src/YM/YM.jl
+++ b/src/YM/YM.jl
@@ -24,7 +24,7 @@ end
 export GaugeParm
 
 include("YMfields.jl")
-export field, randomn!, zero!, norm2
+export field, field_pln, randomn!, zero!, norm2
 
 struct YMworkspace
     frc1
@@ -32,6 +32,7 @@ struct YMworkspace
     mom
     U1
     cm # complex of volume
+    rm # float of volume
     function YMworkspace(::Type{T}, lp::SpaceParm) where {T <: Union{Group,Algebra}}
         if (T == SU2)
@@ -39,9 +40,6 @@
            f1 = field(SU2alg, lp)
            f2 = field(SU2alg, lp)
            mm = field(SU2alg, lp)
            u1 = field(SU2, lp)
-           cs = zeros(ComplexF64,lp.iL...)
-           rs = zeros(Float64, lp.iL...)
-           return new(f1, f2, mm, u1, replace_storage(CuArray, cs))
         end
         if (T == SU3)
@@ -49,11 +47,11 @@
            f1 = field(SU3alg, lp)
            f2 = field(SU3alg, lp)
            mm = field(SU3alg, lp)
            u1 = field(SU3, lp)
-           cs = zeros(ComplexF64,lp.iL...)
-           rs = zeros(Float64, lp.iL...)
-           return new(f1, f2, mm, u1, replace_storage(CuArray, cs))
         end
-        return nothing
+        cs = CuArray{ComplexF64, 2}(undef, lp.bsz,lp.rsz)
+        rs = CuArray{Float64, 2}(undef, lp.bsz,lp.rsz)
+
+        return new(f1, f2, mm, u1, cs, rs)
     end
 end
 export YMworkspace
diff --git a/src/YM/YMact.jl b/src/YM/YMact.jl
index 6aae768..7041427 100644
--- a/src/YM/YMact.jl
+++ b/src/YM/YMact.jl
@@ -12,68 +12,171 @@ function krnl_plaq!(plx, U, ipl, lp::SpaceParm)
 
     id1, id2 = lp.plidx[ipl]
-    X = map2latt((CUDA.threadIdx().x,CUDA.threadIdx().y,CUDA.threadIdx().z),
-                 (CUDA.blockIdx().x,CUDA.blockIdx().y,CUDA.blockIdx().z))
-    Xu1 = up(X, id1, lp)
-    Xu2 = up(X, id2, lp)
+    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
+    bu1, ru1 = up((b, r), id1, lp)
+    bu2, ru2 = up((b, r), id2, lp)
 
-    @inbounds plx[X] = tr(U[X, id1]*U[Xu1, id2] / (U[X, id2]*U[Xu2, id1]))
+    @inbounds plx[b, r] = tr(U[b,r,id1]*U[bu1,ru1,id2] / (U[b,r,id2]*U[bu2,ru2,id1]))
 
     return nothing
 end
 
 function krnl_plaq!(plx, U, lp::SpaceParm)
 
-    X = map2latt((CUDA.threadIdx().x,CUDA.threadIdx().y,CUDA.threadIdx().z),
-                 (CUDA.blockIdx().x,CUDA.blockIdx().y,CUDA.blockIdx().z))
+    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
+    if (b < 1) || (b > lp.bsz)
+        @cuprintln("Error b: $b, $r")
+    end
+    if (r < 1) || (r > lp.rsz)
+        @cuprintln("Error r: $b, $r")
+    end
 
-    @inbounds plx[X] = complex(0.0)
+    plx[b,r] = complex(0.0)
+    for ipl in 1:lp.npls
+        id1, id2 = lp.plidx[ipl]
+
+        bu1, ru1 = up((b, r), id1, lp)
+        bu2, ru2 = up((b, r), id2, lp)
+        if (bu1 < 1) || (bu1 > lp.bsz)
+            @cuprintln("Error bu1: $b, $r, $id1")
+        end
+        if (ru1 < 1) || (ru1 > lp.rsz)
+            @cuprintln("Error ru1: $b, $r, $id1")
+        end
+        if (bu2 < 1) || (bu2 > lp.bsz)
+            @cuprintln("Error bu2: $b, $r, $id2")
+        end
+        if (ru2 < 1) || (ru2 > lp.rsz)
+            @cuprintln("Error ru2: $b, $r, $id2")
+        end
+
+        plx[b,r] += tr(U[b,id1,r]*U[bu1,id2,ru1] / (U[b,id2,r]*U[bu2,id1,ru2]))
+    end
+
+    return nothing
+end
+
+function krnl_force_wilson_pln!(frc1, frc2, U, ipl, lp::SpaceParm)
+
+    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
+
+    @inbounds begin
+        id1, id2 = lp.plidx[ipl]
+        bu1, ru1 = up((b, r), id1, lp)
+        bu2, ru2 = up((b, r), id2, lp)
+
+        g1 = U[bu1,id2,ru1]/U[bu2,id1,ru2]
+        g2 = U[b,id2,r]\U[b,id1,r]
+
+        F1 = projalg(U[b,id1,r]*g1/U[b,id2,r])
+        F2 = projalg(g1*g2)
+        F3 = projalg(g2*g1)
+
+        frc1[b  ,r,id1  ] -= F1
+        frc1[b  ,r,id2  ] += F1
+        frc2[bu1,ru1,id2] -= F2
+        frc2[bu2,ru2,id1] += F3
+    end
+
+    return nothing
+end
+
+function krnl_force_wilson_nw!(fpl, U, lp::SpaceParm)
+
+    b   = CUDA.threadIdx().x
+    ipl = mod1(CUDA.blockIdx().x, lp.npls)
+    r   = div(CUDA.blockIdx().x-1, lp.npls)+1
+
+    @inbounds begin #for ipl in 1:lp.npls
+        id1, id2 = lp.plidx[ipl]
+        bu1, ru1 = up((b, r), id1, lp)
+        bu2, ru2 = up((b, r), id2, lp)
+
+        g1 = U[bu1,id2,ru1]/U[bu2,id1,ru2]
+        g2 = U[b,id2,r]\U[b,id1,r]
+
+        F1 = projalg(U[b,id1,r]*g1/U[b,id2,r])
+        F2 = projalg(g1*g2)
+        F3 = projalg(g2*g1)
+
+        fpl[b  ,ipl, r  ,1,1] = -F1
+        fpl[b  ,ipl, r  ,2,1] =  F1
+        fpl[bu1,ipl, ru1,2,2] = -F2
+        fpl[bu2,ipl, ru2,1,2] =  F3
+    end
+
+    return nothing
+end
+
+
+
+function krnl_force_wilson!(frc, U, lp::SpaceParm)
+
+    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
+
+    @inbounds for id1 in 1:lp.ndim
+        bu1, ru1, bd1, rd1 = updw((b, r), id1, lp)
+        @inbounds for id2 in id1+1:lp.ndim
+            bu2, ru2, bd2, rd2 = updw((b, r), id2, lp)
+            bud, rud = dw((bu1,ru1), id2, lp)
+            bdu, rdu = dw((bu2,ru2), id1, lp)
+
+            F1 = projalg(U[b,id1,r]*U[bu1,id2,ru1]/(U[b,id2,r]*U[bu2,id1,ru2]))
+            F2 = projalg((U[b,id2,r]/(U[bd1,id2,rd1]*U[bdu,id1,rdu]))*U[bd1,id1,rd1])
+            F3 = projalg((U[bd2,id2,rd2]\U[bd2,id1,rd2])*(U[bud,id2,rud]/U[b,id1,r]))
+
+            frc[b,r,id1] -= F1 - F3
+            frc[b,r,id2] += F1 - F2
+        end
+    end
+
+    return nothing
+end
+
+function krnl_add_force_plns!(frc, fpl, lp::SpaceParm)
+
+    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
 
     @inbounds for ipl in 1:lp.npls
         id1, id2 = lp.plidx[ipl]
-        Xu1 = up(X, id1, lp)
-        Xu2 = up(X, id2, lp)
-
-        plx[X] += tr(U[X, id1]*U[Xu1, id2] / (U[X, id2]*U[Xu2, id1]))
-    end
-
-    return nothing
-end
-
-function krnl_force_wilson_pln!(frc1, frc2, U, ipl, lp::SpaceParm, gp::GaugeParm)
-
-    X = map2latt((CUDA.threadIdx().x,CUDA.threadIdx().y,CUDA.threadIdx().z),
-                 (CUDA.blockIdx().x,CUDA.blockIdx().y,CUDA.blockIdx().z))
-
-    id1, id2 = lp.plidx[ipl]
-    Xu1 = up(X, id1, lp)
-    Xu2 = up(X, id2, lp)
-
-    a = U[Xu1,id2]/U[Xu2,id1]
-    b = U[X ,id2]\U[X ,id1]
-
-    F1 = projalg(U[X,id1]*a/U[X,id2])
-    F2 = projalg(a*b)
-    F3 = projalg(b*a)
-
-    @inbounds begin
-        frc1[X  ,id1] -= F1
-        frc1[X  ,id2] += F1
-        frc2[Xu1,id2] -= F2
-        frc2[Xu2,id1] += F3
+        frc[b,r,id1] += fpl[b,ipl,r,1,1] + fpl[b,ipl,r,1,2]
+        frc[b,r,id2] += fpl[b,ipl,r,2,1] + fpl[b,ipl,r,2,2]
     end
 
     return nothing
 end
 
-function force0_wilson!(frc1, frc2, U, lp::SpaceParm, gp::GaugeParm, kp::KernelParm)
+function force0_wilson_pln!(frc1, frc2, U, lp::SpaceParm)
 
-    zero!(frc1)
-    zero!(frc2)
-    for ipl in 1:lp.npls
+    zero!(frc1, lp)
+    zero!(frc2, lp)
+    for i in 1:lp.npls
         CUDA.@sync begin
-            CUDA.@cuda threads=kp.threads blocks=kp.blocks krnl_force_wilson_pln!(frc1,frc2,U,ipl,lp,gp)
+            CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_force_wilson_pln!(frc1,frc2,U,i,lp)
         end
     end
 
     return nothing
 end
+
+function force0_wilson!(frc1, U, lp::SpaceParm)
+
+    zero!(frc1, lp)
+    CUDA.@sync begin
+        CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_force_wilson!(frc1,U,lp)
+    end
+
+    return nothing
+end
+
+function force0_wilson_nw!(frc1, fpln, U, lp::SpaceParm)
+
+    CUDA.@sync begin
+        CUDA.@cuda threads=lp.bsz blocks=lp.rsz*lp.npls krnl_force_wilson_nw!(fpln,U,lp)
+    end
+    zero!(frc1, lp)
+    CUDA.@sync begin
+        CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_add_force_plns!(frc1, fpln,lp)
+    end
+
+    return nothing
+end
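All three force drivers use the same launch geometry, `threads = lp.bsz` (sites inside a block) and `blocks = lp.rsz` (blocks, times `npls` for the per-plane variant); they differ only in how the plaquette loop is split across kernels. A usage sketch for the single-kernel variant (editor's example, not part of the patch):

    # Wilson force on a cold (unit) SU(3) configuration.
    lp  = SpaceParm{4}((8,8,8,8), (4,4,4,4))
    U   = field(SU3, lp);    zero!(U, lp)    # zero! sets group-valued fields to the identity
    frc = field(SU3alg, lp); zero!(frc, lp)
    force0_wilson!(frc, U, lp)               # expected to stay (numerically) zero here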
diff --git a/src/YM/YMfields.jl b/src/YM/YMfields.jl
index 42a77bd..e36aeaf 100644
--- a/src/YM/YMfields.jl
+++ b/src/YM/YMfields.jl
@@ -9,154 +9,198 @@
 ### created: Thu Jul 15 15:16:47 2021
 ###
 
+un(t) = t <: Union{Group, Complex}
 
 function field(::Type{T}, lp::SpaceParm) where {T <: Union{Group, Algebra}}
 
-    sz = lp.iL..., lp.ndim
+    sz = lp.bsz, lp.ndim, lp.rsz
     if (T == SU2)
-        As = StructArray{SU2}((ones(ComplexF64, sz), zeros(ComplexF64, sz)))
+#        As = StructArray{SU2}(undef, sz, unwrap=un)
+        return CuArray{SU2, 3}(undef, sz)
     elseif (T == SU2alg)
-        As = StructArray{SU2alg}((zeros(Float64, sz),
-                                  zeros(Float64, sz),
-                                  zeros(Float64, sz)))
+#        As = StructArray{SU2alg}(undef, sz, unwrap=un)
+        return CuArray{SU2alg, 3}(undef, sz)
     elseif (T == SU3)
-        As = StructArray{SU3}((ones(ComplexF64, sz), zeros(ComplexF64, sz), zeros(ComplexF64, sz), zeros(ComplexF64, sz), ones(ComplexF64, sz), zeros(ComplexF64, sz)))
+#        As = StructArray{SU3}(undef, sz, unwrap=un)
+        return CuArray{SU3, 3}(undef, sz)
 #        As = Array{SU3, lp.ndim+1}(undef, sz)
-#        CUDA.@sync begin
-#            CUDA.@cuda threads=kp.threads blocks=kp.blocks krnl_SU3_zero!(As, lp)
-#        end
     elseif (T == SU3alg)
-        As = StructArray{SU3alg}((zeros(Float64, sz),
-                                  zeros(Float64, sz),
-                                  zeros(Float64, sz),
-                                  zeros(Float64, sz),
-                                  zeros(Float64, sz),
-                                  zeros(Float64, sz),
-                                  zeros(Float64, sz),
-                                  zeros(Float64, sz)))
-
+        # As = StructArray{SU3alg}(undef, sz, unwrap=un)
+        return CuArray{SU3alg, 3}(undef, sz)
#        As = Array{SU3alg, lp.ndim+1}(undef, sz)
-#        CUDA.@sync begin
-#            CUDA.@cuda threads=kp.threads blocks=kp.blocks krnl_SU3alg_zero!(As, lp)
-#        end
     end
 
     return replace_storage(CuArray, As)
+
 end
 
-function randomn!(X)
+function krnl_SU3_one!(G, lp::SpaceParm)
 
-    if (eltype(X) == SU2alg)
-        randn!(CURAND.default_rng(), LazyRows(X).t1)
-        randn!(CURAND.default_rng(), LazyRows(X).t2)
-        randn!(CURAND.default_rng(), LazyRows(X).t3)
-    elseif (eltype(X) == SU3alg)
-        randn!(CURAND.default_rng(), LazyRows(X).t1)
-        randn!(CURAND.default_rng(), LazyRows(X).t2)
-        randn!(CURAND.default_rng(), LazyRows(X).t3)
-        randn!(CURAND.default_rng(), LazyRows(X).t4)
-        randn!(CURAND.default_rng(), LazyRows(X).t5)
-        randn!(CURAND.default_rng(), LazyRows(X).t6)
-        randn!(CURAND.default_rng(), LazyRows(X).t7)
-        randn!(CURAND.default_rng(), LazyRows(X).t8)
-    end
-    return nothing
-end
-
-function zero!(X)
-
-    if (eltype(X) == SU2alg)
-        fill!(LazyRows(X).t1, 0.0)
-        fill!(LazyRows(X).t2, 0.0)
-        fill!(LazyRows(X).t3, 0.0)
-    end
-
-    if (eltype(X) == SU3alg)
-        fill!(LazyRows(X).t1, 0.0)
-        fill!(LazyRows(X).t2, 0.0)
-        fill!(LazyRows(X).t3, 0.0)
-        fill!(LazyRows(X).t4, 0.0)
-        fill!(LazyRows(X).t5, 0.0)
-        fill!(LazyRows(X).t6, 0.0)
-        fill!(LazyRows(X).t7, 0.0)
-        fill!(LazyRows(X).t8, 0.0)
-#        CUDA.@sync begin
-#            CUDA.@cuda threads=kp.threads blocks=kp.blocks krnl_SU3alg_zero!(X, lp)
-#        end
-    end
-
-    if (eltype(X) == SU2)
-        fill!(LazyRows(X).t1, complex(1.0))
-        fill!(LazyRows(X).t2, complex(0.0))
-    end
-
-    if (eltype(X) == SU3)
-        fill!(LazyRows(X).u11, complex(1.0))
-        fill!(LazyRows(X).u12, complex(0.0))
-        fill!(LazyRows(X).u13, complex(0.0))
-        fill!(LazyRows(X).u21, complex(0.0))
-        fill!(LazyRows(X).u22, complex(1.0))
-        fill!(LazyRows(X).u23, complex(0.0))
-#        CUDA.@sync begin
-#            CUDA.@cuda threads=kp.threads blocks=kp.blocks krnl_SU3_zero!(X, lp)
-#        end
-    end
-
-    return nothing
-end
-
-function norm2(X)
-
-    d = 0.0
-    if (eltype(X) == SU2alg)
-        d = CUDA.mapreduce(x->x^2, +, LazyRows(X).t1) +
-            CUDA.mapreduce(x->x^2, +, LazyRows(X).t2) +
-            CUDA.mapreduce(x->x^2, +, LazyRows(X).t3)
-    elseif (eltype(X) == SU3alg)
-        d = CUDA.mapreduce(x->x^2, +, LazyRows(X).t1) +
-            CUDA.mapreduce(x->x^2, +, LazyRows(X).t2) +
-            CUDA.mapreduce(x->x^2, +, LazyRows(X).t3) +
-            CUDA.mapreduce(x->x^2, +, LazyRows(X).t4) +
-            CUDA.mapreduce(x->x^2, +, LazyRows(X).t5) +
-            CUDA.mapreduce(x->x^2, +, LazyRows(X).t6) +
-            CUDA.mapreduce(x->x^2, +, LazyRows(X).t7) +
-            CUDA.mapreduce(x->x^2, +, LazyRows(X).t8)
-#        d = CUDA.mapreduce(norm2, +, X)
-    end
-
-    return d
-end
-
-function krnl_SU3_zero!(G, lp::SpaceParm)
-
-    X = map2latt((CUDA.threadIdx().x,CUDA.threadIdx().y,CUDA.threadIdx().z),
-                 (CUDA.blockIdx().x,CUDA.blockIdx().y,CUDA.blockIdx().z))
+    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
 
     for id in 1:lp.ndim
-        G[X,id].u11 = complex(1.0)
-        G[X,id].u12 = complex(0.0)
-        G[X,id].u13 = complex(0.0)
-        G[X,id].u21 = complex(0.0)
-        G[X,id].u22 = complex(1.0)
-        G[X,id].u23 = complex(0.0)
+        G[b,id,r] = SU3(1.0,0.0,0.0,0.0,1.0,0.0)
+    end
+    return nothing
+end
+
+function krnl_SU2_one!(G, lp::SpaceParm)
+
+    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
+
+    for id in 1:lp.ndim
+        G[b,id,r] = SU2(1.0,0.0)
     end
     return nothing
 end
 
 function krnl_SU3alg_zero!(G, lp::SpaceParm)
 
-    X = map2latt((CUDA.threadIdx().x,CUDA.threadIdx().y,CUDA.threadIdx().z),
-                 (CUDA.blockIdx().x,CUDA.blockIdx().y,CUDA.blockIdx().z))
+    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
 
     for id in 1:lp.ndim
-        G[X,id].t1 = 0.0
-        G[X,id].t2 = 0.0
-        G[X,id].t3 = 0.0
-        G[X,id].t4 = 0.0
-        G[X,id].t5 = 0.0
-        G[X,id].t6 = 0.0
-        G[X,id].t7 = 0.0
-        G[X,id].t8 = 0.0
+        G[b,id,r] = SU3alg(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0)
     end
     return nothing
 end
+
+function krnl_SU2alg_zero!(G, lp::SpaceParm)
+
+    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
+
+    for id in 1:lp.ndim
+        G[b,id,r] = SU2alg(0.0,0.0,0.0)
+    end
+    return nothing
+end
+
+function krnl_SU3alg_assign!(G, M, lp::SpaceParm)
+
+    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
+
+    for id in 1:lp.ndim
+        G[b,id,r] = SU3alg(M[b,id,r,1], M[b,id,r,2], M[b,id,r,3], M[b,id,r,4],
+                           M[b,id,r,5], M[b,id,r,6], M[b,id,r,7], M[b,id,r,8])
+    end
+    return nothing
+end
+
+function krnl_SU2alg_assign!(G, M, lp::SpaceParm)
+
+    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
+
+    for id in 1:lp.ndim
+        G[b,id,r] = SU2alg(M[b,id,r,1], M[b,id,r,2], M[b,id,r,3])
+    end
+    return nothing
+end
+
+function randomn!(X, lp)
+
+    if (eltype(X) == SU2alg)
+#        randn!(CURAND.default_rng(), LazyRows(X).t1)
+#        randn!(CURAND.default_rng(), LazyRows(X).t2)
+#        randn!(CURAND.default_rng(), LazyRows(X).t3)
+        M = CuArray{Float64}(undef, lp.bsz, lp.ndim, lp.rsz, 3)
+        randn!(CURAND.default_rng(), M)
+        CUDA.@sync begin
+            CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_SU2alg_assign!(X, M, lp)
+        end
+    elseif (eltype(X) == SU3alg)
+#        randn!(CURAND.default_rng(), LazyRows(X).t1)
+#        randn!(CURAND.default_rng(), LazyRows(X).t2)
+#        randn!(CURAND.default_rng(), LazyRows(X).t3)
+#        randn!(CURAND.default_rng(), LazyRows(X).t4)
+#        randn!(CURAND.default_rng(), LazyRows(X).t5)
+#        randn!(CURAND.default_rng(), LazyRows(X).t6)
+#        randn!(CURAND.default_rng(), LazyRows(X).t7)
+#        randn!(CURAND.default_rng(), LazyRows(X).t8)
+        M = CuArray{Float64}(undef, lp.bsz, lp.ndim, lp.rsz, 8)
+        randn!(CURAND.default_rng(), M)
+        CUDA.@sync begin
+            CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_SU3alg_assign!(X, M, lp)
+        end
+    end
+    return nothing
+end
+
+function zero!(X, lp)
+
+    if (eltype(X) == SU2alg)
+#        fill!(LazyRows(X).t1, 0.0)
+#        fill!(LazyRows(X).t2, 0.0)
+#        fill!(LazyRows(X).t3, 0.0)
+        CUDA.@sync begin
+            CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_SU2alg_zero!(X, lp)
+        end
+    end
+
+    if (eltype(X) == SU3alg)
+#        fill!(LazyRows(X).t1, 0.0)
+#        fill!(LazyRows(X).t2, 0.0)
+#        fill!(LazyRows(X).t3, 0.0)
+#        fill!(LazyRows(X).t4, 0.0)
+#        fill!(LazyRows(X).t5, 0.0)
+#        fill!(LazyRows(X).t6, 0.0)
+#        fill!(LazyRows(X).t7, 0.0)
+#        fill!(LazyRows(X).t8, 0.0)
+        CUDA.@sync begin
+            CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_SU3alg_zero!(X, lp)
+        end
+    end
+
+    if (eltype(X) == SU2)
+#        fill!(LazyRows(X).t1.re, 1.0)
+#        fill!(LazyRows(X).t1.im, 0.0)
+#        fill!(LazyRows(X).t2.re, 0.0)
+#        fill!(LazyRows(X).t2.im, 0.0)
+        CUDA.@sync begin
+            CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_SU2_one!(X, lp)
+        end
+    end
+
+    if (eltype(X) == SU3)
+#        fill!(LazyRows(X).u11.re, 1.0)
+#        fill!(LazyRows(X).u11.im, 0.0)
+#        fill!(LazyRows(X).u12.re, 0.0)
+#        fill!(LazyRows(X).u12.im, 0.0)
+#        fill!(LazyRows(X).u13.re, 0.0)
+#        fill!(LazyRows(X).u13.im, 0.0)
+#        fill!(LazyRows(X).u21.re, 0.0)
+#        fill!(LazyRows(X).u21.im, 0.0)
+#        fill!(LazyRows(X).u22.re, 1.0)
+#        fill!(LazyRows(X).u22.im, 0.0)
+#        fill!(LazyRows(X).u23.re, 0.0)
+#        fill!(LazyRows(X).u23.im, 0.0)
+        CUDA.@sync begin
+            CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_SU3_one!(X, lp)
+        end
+    end
+
+    return nothing
+end
+
+function norm_field(X)
+
+    return CUDA.mapreduce(norm2, +, X)
+#    d = 0.0
     if (eltype(X) == SU2alg)
+#        d = CUDA.mapreduce(x->x^2, +, LazyRows(X).t1) +
+#            CUDA.mapreduce(x->x^2, +, LazyRows(X).t2) +
+#            CUDA.mapreduce(x->x^2, +, LazyRows(X).t3)
     elseif (eltype(X) == SU3alg)
+#        d = CUDA.mapreduce(x->x^2, +, LazyRows(X).t1) +
+#            CUDA.mapreduce(x->x^2, +, LazyRows(X).t2) +
+#            CUDA.mapreduce(x->x^2, +, LazyRows(X).t3) +
+#            CUDA.mapreduce(x->x^2, +, LazyRows(X).t4) +
+#            CUDA.mapreduce(x->x^2, +, LazyRows(X).t5) +
+#            CUDA.mapreduce(x->x^2, +, LazyRows(X).t6) +
+#            CUDA.mapreduce(x->x^2, +, LazyRows(X).t7) +
+#            CUDA.mapreduce(x->x^2, +, LazyRows(X).t8)
+        d = CUDA.mapreduce(norm2, +, X)
+    end
+
+    return d
+end
+
+
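With this change, `randomn!` draws a flat `CuArray` of normal deviates and repacks it into the algebra-valued field with a small kernel, and `norm_field` reduces `norm2` over the whole field; together they provide the Gaussian momenta and the kinetic term used in YMhmc.jl below. A short sketch reusing the definitions above (editor's example, not part of the patch):

    lp  = SpaceParm{4}((8,8,8,8), (4,4,4,4))
    mom = field(SU3alg, lp)
    randomn!(mom, lp)           # N(0,1) components on every link
    K = norm_field(mom)/2.0     # kinetic part of the HMC Hamiltonian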
diff --git a/src/YM/YMhmc.jl b/src/YM/YMhmc.jl
index 7e54c94..d210378 100644
--- a/src/YM/YMhmc.jl
+++ b/src/YM/YMhmc.jl
@@ -9,10 +9,10 @@
 ### created: Thu Jul 15 11:27:28 2021
 ###
 
-function gauge_action(U, lp::SpaceParm, gp::GaugeParm, kp::KernelParm, ymws::YMworkspace)
+function gauge_action(U, lp::SpaceParm, gp::GaugeParm, ymws::YMworkspace)
 
     CUDA.@sync begin
-        CUDA.@cuda threads=kp.threads blocks=kp.blocks krnl_plaq!(ymws.cm, U, lp)
+        CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_plaq!(ymws.cm, U, lp)
     end
 
     S = gp.beta*( prod(lp.iL)*lp.npls - CUDA.mapreduce(real, +, ymws.cm)/gp.ng )
@@ -20,32 +20,34 @@
     return S
 end
 
-function plaquette(U, lp::SpaceParm, gp::GaugeParm, kp::KernelParm, ymws::YMworkspace)
+function plaquette(U, lp::SpaceParm, gp::GaugeParm, ymws::YMworkspace)
 
+    println("Entro!")
     CUDA.@sync begin
-        CUDA.@cuda threads=kp.threads blocks=kp.blocks krnl_plaq!(ymws.cm, U, lp)
+        CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_plaq!(ymws.cm, U, lp)
     end
+    println("Salgo!")
 
-    return CUDA.mapreduce(real, +, real.(ymws.cm))/(prod(lp.iL)*lp.npls)
+    return CUDA.mapreduce(real, +, ymws.cm)/(prod(lp.iL)*lp.npls)
 end
 
-function hamiltonian(mom, U, lp, gp, kp, ymws)
-    K = norm2(mom)/2.0
-    V = gauge_action(U, lp, gp, kp, ymws)
+function hamiltonian(mom, U, lp, gp, ymws)
+    K = norm_field(mom)/2.0
+    V = gauge_action(U, lp, gp, ymws)
     println("K: ", K, " V: ", V)
 
     return K+V
end
 
-function HMC!(U, eps, ns, lp::SpaceParm, gp::GaugeParm, kp::KernelParm, ymws::YMworkspace; noacc=false)
+function HMC!(U, eps, ns, lp::SpaceParm, gp::GaugeParm, ymws::YMworkspace; noacc=false)
 
     ymws.U1 .= U
-    randomn!(ymws.mom)
-    hini = hamiltonian(ymws.mom, U, lp, gp, kp, ymws)
+    randomn!(ymws.mom, lp)
+    hini = hamiltonian(ymws.mom, U, lp, gp, ymws)
 
-    OMF4!(ymws.mom, U, eps, ns, lp, gp, kp, ymws)
+    OMF4!(ymws.mom, U, eps, ns, lp, gp, ymws)
 
-    dh   = hamiltonian(ymws.mom, U, lp, gp, kp, ymws) - hini
+    dh   = hamiltonian(ymws.mom, U, lp, gp, ymws) - hini
     pacc = exp(-dh)
 
     acc = true
@@ -66,18 +68,17 @@ end
 
 function krnl_updt!(mom, frc1, frc2, eps1, U, eps2, lp::SpaceParm)
 
-    X = map2latt((CUDA.threadIdx().x,CUDA.threadIdx().y,CUDA.threadIdx().z),
-                 (CUDA.blockIdx().x,CUDA.blockIdx().y,CUDA.blockIdx().z))
+    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
 
     @inbounds for id in 1:lp.ndim
-        mom[X,id] = mom[X,id] + eps1 * (frc1[X,id]+frc2[X,id])
-        U[X,id]   = expm(U[X,id],mom[X,id], eps2)
+        mom[b,id,r] = mom[b,id,r] + eps1 * (frc1[b,id,r]+frc2[b,id,r])
+        U[b,id,r]   = expm(U[b,id,r], mom[b,id,r], eps2)
     end
 
     return nothing
 end
 
-function OMF4!(mom, U, eps, ns, lp::SpaceParm, gp::GaugeParm, kp::KernelParm, ymws::YMworkspace)
+function OMF4!(mom, U, eps, ns, lp::SpaceParm, gp::GaugeParm, ymws::YMworkspace)
 
     r1::Float64 =  0.08398315262876693
     r2::Float64 =  0.2539785108410595
@@ -86,43 +87,78 @@
     r5::Float64 =  0.5-r1-r3
     r6::Float64 =  1.0-2.0*(r2+r4)
 
+#    ee = eps*gp.beta/gp.ng
+#    @device_code_warntype force0_wilson!(ymws.frc1,ymws.frc2, U, lp, gp)
+#    force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp)
+#    zero!(ymws.frc2, lp)
+#    for i in 1:ns
+#        # STEP 1
+#        CUDA.@sync begin
+#            CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r1*ee, U, eps*r2, lp)
+#        end
+#
+#        # STEP 2
+#        force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp)
+#        CUDA.@sync begin
+#            CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r3*ee, U, eps*r4, lp)
+#        end
+#
+#        # STEP 3
+#        force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp)
+#        CUDA.@sync begin
+#            CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r5*ee, U, eps*r6, lp)
+#        end
+#
+#        # STEP 4
+#        force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp)
+#        CUDA.@sync begin
+#            CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r5*ee, U, eps*r4, lp)
+#        end
+#
+#        # STEP 5
+#        force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp)
+#        CUDA.@sync begin
+#            CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r3*ee, U, eps*r2, lp)
+#        end
+#
+#        # STEP 6
+#        force0_wilson_pln!(ymws.frc1, ymws.frc2, U, lp)
+#        CUDA.@sync begin
+#            CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r1*ee, U, 0.0, lp)
+#        end
+#    end
+
     ee = eps*gp.beta/gp.ng
-    force0_wilson!(ymws.frc1,ymws.frc2, U, lp, gp, kp)
+    force0_wilson!(ymws.frc1, U, lp)
+    zero!(ymws.frc2, lp)
     for i in 1:ns
         # STEP 1
-        CUDA.@sync begin
-            CUDA.@cuda threads=kp.threads blocks=kp.blocks krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r1*ee, U, eps*r2, lp)
-        end
-
+        mom .= mom .+ (r1*ee) .* (ymws.frc1 .+ ymws.frc2)
+        U   .= expm.(U, mom, eps*r2)
+
         # STEP 2
-        force0_wilson!(ymws.frc1,ymws.frc2, U, lp, gp, kp)
-        CUDA.@sync begin
-            CUDA.@cuda threads=kp.threads blocks=kp.blocks krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r3*ee, U, eps*r4, lp)
-        end
+        force0_wilson!(ymws.frc1, U, lp)
+        mom .= mom .+ (r3*ee) .* (ymws.frc1 .+ ymws.frc2)
+        U   .= expm.(U, mom, eps*r4)
 
         # STEP 3
-        force0_wilson!(ymws.frc1,ymws.frc2, U, lp, gp, kp)
-        CUDA.@sync begin
-            CUDA.@cuda threads=kp.threads blocks=kp.blocks krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r5*ee, U, eps*r6, lp)
-        end
+        force0_wilson!(ymws.frc1, U, lp)
+        mom .= mom .+ (r5*ee) .* (ymws.frc1 .+ ymws.frc2)
+        U   .= expm.(U, mom, eps*r6)
 
         # STEP 4
-        force0_wilson!(ymws.frc1,ymws.frc2, U, lp, gp, kp)
-        CUDA.@sync begin
-            CUDA.@cuda threads=kp.threads blocks=kp.blocks krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r5*ee, U, eps*r4, lp)
-        end
+        force0_wilson!(ymws.frc1, U, lp)
+        mom .= mom .+ (r5*ee) .* (ymws.frc1 .+ ymws.frc2)
+        U   .= expm.(U, mom, eps*r4)
 
         # STEP 5
-        force0_wilson!(ymws.frc1,ymws.frc2, U, lp, gp, kp)
-        CUDA.@sync begin
-            CUDA.@cuda threads=kp.threads blocks=kp.blocks krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r3*ee, U, eps*r2, lp)
-        end
+        force0_wilson!(ymws.frc1, U, lp)
+        mom .= mom .+ (r3*ee) .* (ymws.frc1 .+ ymws.frc2)
+        U   .= expm.(U, mom, eps*r2)
 
         # STEP 6
-        force0_wilson!(ymws.frc1,ymws.frc2, U, lp, gp, kp)
-        CUDA.@sync begin
-            CUDA.@cuda threads=kp.threads blocks=kp.blocks krnl_updt!(ymws.mom, ymws.frc1,ymws.frc2, r1*ee, U, 0.0, lp)
-        end
+        force0_wilson!(ymws.frc1, U, lp)
+        mom .= mom .+ (r1*ee) .* (ymws.frc1 .+ ymws.frc2)
     end
 
     return nothing
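For reference, the OMF4 coefficients are tuned so that the six momentum kicks (r1, r3, r5, r5, r3, r1) and the five link drifts (r2, r4, r6, r4, r2) of one loop iteration add up to exactly one unit of molecular-dynamics time `eps`; a quick check (editor's sketch, not part of the patch):

    r1 = 0.08398315262876693; r2 = 0.2539785108410595
    r3 = 0.6822365335719091;  r4 = -0.03230286765269967
    r5 = 0.5 - r1 - r3;       r6 = 1.0 - 2.0*(r2 + r4)
    @assert 2*(r1 + r3 + r5) ≈ 1.0   # momentum-update weights
    @assert 2*(r2 + r4) + r6 ≈ 1.0   # link-update weights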