latticegpu.jl/src/Scalar/ScalarObs.jl

###
### "THE BEER-WARE LICENSE":
### Alberto Ramos wrote this file. As long as you retain this
### notice you can do whatever you want with this stuff. If we meet some
### day, and you think this stuff is worth it, you can buy me a beer in
### return. <>
###
### file: ScalarObs.jl
### created: Mon Jul 12 18:31:19 2021
###
"""
computes global observables by calling krnl_obs! and summing
for all lattice points
"""
function scalar_obs(U, Phi, sp::ScalarParm, lp::SpaceParm, ymws::YMworkspace)

    @timeit "Scalar observables" begin
        # Fill the workspace arrays rm/cm with the per-point contributions
        CUDA.@sync begin
            CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_obs!(ymws.rm, ymws.cm, U, Phi, sp, lp)
        end

        V = prod(lp.iL)
        # Reduce over the whole lattice and normalise by the volume
        # (and by the number of directions for the link observables)
        rho2   = CUDA.mapreduce(norm2, +, Phi)/V
        lphi   = CUDA.reduce(+, ymws.rm)/(lp.ndim*V)
        lalpha = CUDA.mapreduce(real, +, ymws.cm)/(lp.ndim*V)
    end

    return rho2, lphi, lalpha
end
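
###
### Illustrative sketch (not part of the library API): the volume-averaged
### reduction pattern used in scalar_obs, in isolation. The function name,
### the lattice volume and the field array are hypothetical, and `abs2`
### stands in for the library's `norm2`; CUDA.jl is assumed to be loaded,
### as elsewhere in this file.
###
function demo_volume_average()
    V   = 8^4                              # hypothetical lattice volume
    Phi = CUDA.rand(Float64, V)            # hypothetical field, one value per point
    return CUDA.mapreduce(abs2, +, Phi)/V  # GPU sum of |phi|^2, volume average on the host
end
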
"""
CUDA function to compute the observables defined in the Obs struct
for each lattice point
"""
function krnl_obs!(rm, cm, U::AbstractArray{TG}, Phi::AbstractArray{TS}, sp::ScalarParm{NP,T}, lp::SpaceParm{N,M,D}) where {TG,TS,NP,T,N,M,D}

    # Thread/block coordinates of this lattice point
    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x

    # Stage the gauge links and scalar fields of this block in shared memory
    Ush = @cuStaticSharedMem(TG, (D,N))
    Psh = @cuStaticSharedMem(TS, (D,NP))
    for id in 1:N
        Ush[b,id] = U[b,id,r]
    end
    for i in 1:NP
        Psh[b,i] = Phi[b,i,r]
    end
    sync_threads()

    IX = point_coord((b,r), lp)
    rm[IX] = zero(eltype(rm))
    cm[IX] = zero(eltype(cm))

    # Accumulate the observables over scalar components and directions
    for i in 1:NP
        psq = norm( Psh[b,i] )
        for id in 1:N
            bu, ru = up((b, r), id, lp)   # thread/block coordinate of the up neighbour
            if (ru == r)
                phiup = Psh[bu,i]         # neighbour in the same block: read shared memory
            else
                phiup = Phi[bu,i,ru]      # neighbour in another block: read global memory
            end
            psqup = norm(phiup)

            # Hopping term of this link; the alpha observable gets the
            # per-link term normalised by the two field magnitudes
            hop = dot( Psh[b,i], Ush[b,id]*phiup )
            rm[IX] += hop
            cm[IX] += complex(hop)/(psq*psqup)
        end
    end

    return nothing
end
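
###
### Illustrative sketch (not part of the library API): the shared-memory
### staging pattern used by krnl_obs!, in isolation. Each thread copies its
### element into block-shared memory, the block synchronises, and only then
### is another thread's entry read. The names, the fixed block size of 256
### and the use of @cuStaticSharedMem (as in the kernel above) are
### assumptions; CUDA.jl is assumed to be loaded, as elsewhere in this file.
###
function demo_krnl_shift!(out, inp)
    b  = CUDA.threadIdx().x
    n  = CUDA.blockDim().x
    sh = @cuStaticSharedMem(Float32, 256)   # static shared buffer, must cover the block size
    sh[b] = inp[b]
    sync_threads()                          # make every sh[...] entry visible to the block
    out[b] = sh[mod1(b+1, n)]               # now safe to read a neighbouring thread's entry
    return nothing
end

function demo_shift()
    inp = CUDA.rand(Float32, 256)
    out = CUDA.zeros(Float32, 256)
    CUDA.@sync begin
        CUDA.@cuda threads=256 blocks=1 demo_krnl_shift!(out, inp)
    end
    return out
end
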