diff --git a/src/Scalar/ScalarForce.jl b/src/Scalar/ScalarForce.jl
index be5c25a..e747660 100644
--- a/src/Scalar/ScalarForce.jl
+++ b/src/Scalar/ScalarForce.jl
@@ -106,11 +106,11 @@ function krnl_force_scalar!(fgauge, fscalar, U::AbstractArray{TG}, Phi::Abstract
     sdot2 = dot(Psh[b,2],Psh[b,2])
     sdot12 = dot(Psh[b,1],Psh[b,2])
 
-    fscalar[b,1,r] -= 2 * (1 + 2*sp.eta[1]*(sdot1-1) + sp.xi[1]*sdot2 + 2*sp.xi[3]*sdot12) * Psh[b,1]
-    fscalar[b,2,r] -= 2 * (1 + 2*sp.eta[2]*(sdot2-1) + sp.xi[1]*sdot1 + 2*sp.xi[4]*sdot12) * Psh[b,2]
+    fscalar[b,1,r] -= (2 * (1 + 2*sp.eta[1]*(sdot1-1) + sp.xi[1]*sdot2 + 2*sp.xi[3]*sdot12)) * Psh[b,1]
+    fscalar[b,2,r] -= (2 * (1 + 2*sp.eta[2]*(sdot2-1) + sp.xi[1]*sdot1 + 2*sp.xi[4]*sdot12)) * Psh[b,2]
 
-    fscalar[b,1,r] -= 2 * (sp.muh + sp.xi[2]*sdot12 + sp.xi[3]*sdot1 + sp.xi[4]*sdot2) * Psh[b,2]
-    fscalar[b,2,r] -= 2 * (sp.muh + sp.xi[2]*sdot12 + sp.xi[3]*sdot1 + sp.xi[4]*sdot2) * Psh[b,1]
+    fscalar[b,1,r] -= (2 * (sp.muh + sp.xi[2]*sdot12 + sp.xi[3]*sdot1 + sp.xi[4]*sdot2)) * Psh[b,2]
+    fscalar[b,2,r] -= (2 * (sp.muh + sp.xi[2]*sdot12 + sp.xi[3]*sdot1 + sp.xi[4]*sdot2)) * Psh[b,1]
 
     return nothing
 end
diff --git a/src/Scalar/ScalarObs.jl b/src/Scalar/ScalarObs.jl
index f5173fb..1b415c9 100644
--- a/src/Scalar/ScalarObs.jl
+++ b/src/Scalar/ScalarObs.jl
@@ -15,13 +15,30 @@ for all lattice points
 
 """
 
-function scalar_obs(U, Phi, sp::ScalarParm, lp::SpaceParm, ymws::YMworkspace)
+function scalar_obs(U, Phi, sp::ScalarParm{NP,T}, lp::SpaceParm, ymws::YMworkspace) where {NP,T}
 
     @timeit "Scalar observables" begin
         CUDA.@sync begin
             CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_obs!(ymws.rm, ymws.cm, U, Phi, sp, lp)
         end
 
+        V = prod(lp.iL)
+        #summation of global observables
+        rho2 = CUDA.mapreduce(norm2, +, Phi)/(V*NP)
+        lphi = CUDA.reduce(+, ymws.rm)/(lp.ndim*V*NP)
+        lalpha = CUDA.mapreduce(real, +, ymws.cm)/(lp.ndim*V*NP)
+    end
+
+    return rho2, lphi, lalpha
+end
+
+function scalar_obs(U, Phi, isc::Int64, sp::ScalarParm{NP,T}, lp::SpaceParm, ymws::YMworkspace) where {NP,T}
+
+    @timeit "Scalar observables" begin
+        CUDA.@sync begin
+            CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_obs!(ymws.rm, ymws.cm, U, Phi, isc, sp, lp)
+        end
+
         V = prod(lp.iL)
         #summation of global observables
         rho2 = CUDA.mapreduce(norm2, +, Phi)/V
@@ -61,7 +78,7 @@ function krnl_obs!(rm, cm, U::AbstractArray{TG}, Phi::AbstractArray{TS}, sp::Sca
     for i in 1:NP
         psq = norm( Psh[b,i] )
         for id in 1:N
-            bu, ru = up((b, r), id, lp)   #thread/block coordinate of up point
+            bu, ru = up((b, r), id, lp)
             if (ru == r)
                 phiup = Psh[bu,i]
             else
@@ -95,3 +112,51 @@ function krnl_obs!(rm, cm, U::AbstractArray{TG}, Phi::AbstractArray{TS}, sp::Sca
     return nothing
 end
 
+"""
+    krnl_obs!(rm, cm, U, Phi, isc::Int64, sp::ScalarParm, lp::SpaceParm)
+
+Single-scalar variant of the observables kernel, restricted to scalar field `isc`.
+For the lattice point of the calling thread it sums over the `N` directions the
+term `dot(Phi, U*Phi_up)` into `rm`, and that same term divided by
+`norm(Phi)*norm(Phi_up)` into `cm`.
+"""
+function krnl_obs!(rm, cm, U::AbstractArray{TG}, Phi::AbstractArray{TS}, isc::Int64, sp::ScalarParm{NP,T}, lp::SpaceParm{N,M,D}) where {TG,TS,NP,T,N,M,D}
+
+    #thread/block coordinate
+    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
+
+    Ush = @cuStaticSharedMem(TG, (D,N))
+    Psh = @cuStaticSharedMem(TS, D)
+
+    for id in 1:N
+        Ush[b,id] = U[b,id,r]
+    end
+    Psh[b] = Phi[b,isc,r]
+    sync_threads()
+
+    IX = point_coord((b,r), lp)
+
+    rm[IX] = zero(eltype(rm))
+    cm[IX] = zero(eltype(cm))
+    #compute obs
+    psq = norm( Psh[b] )
+    for id in 1:N
+        bu, ru = up((b, r), id, lp)
+        if (ru == r)
+            phiup = Psh[bu]
+        else
+            phiup = Phi[bu,isc,ru]
+        end
+
+        psqup = norm(phiup)
+
+        #per-direction hopping term; cm must use this term alone, not the
+        #running total already stored in rm[IX]
+        hop = dot( Psh[b], Ush[b,id]*phiup )
+        rm[IX] += hop
+        cm[IX] += complex(hop)/(psq*psqup)
+    end
+
+    return nothing
+end
+