Help improving particle simulation on GPU

Hi,

I am trying to simulate particles on the GPU with CUDA. I have hit a bottleneck and I am wondering if any of you have advice.

This is an MWE of a much more sophisticated example. Basically, I have a field of probability distributions: for each spatial index k1,k2,k3, I have an (un-normalised) probability distribution field[k1,k2,k3,:].

I want to draw samples from field[k1,k2,k3,:] under some condition cond.
I thus first construct the probabilities probas[i] = field[k1,k2,k3,i] * cond[i], then sample from probas. Finally, I advance through the field by changing k1,k2,k3: this is the time step. A minimal sketch of the sampling step is below.
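To make the sampling step concrete, here is a minimal CPU sketch of it; the helper name sample_index is mine, and cond is taken here as a plain 0/1 vector (in the actual code below it is a column of a matrix):

# Hypothetical helper: draw i with probability ∝ field[k₁, k₂, k₃, i] * cond[i]
function sample_index(field, cond, k₁, k₂, k₃)
    w = max.(0, @view(field[k₁, k₂, k₃, :])) .* cond   # conditioned, un-normalised weights
    t = rand() * sum(w)             # uniform draw on [0, total conditioned weight)
    cw = zero(eltype(w))
    for i in eachindex(w)
        cw += w[i]
        cw > t && return i          # first index where the running sum exceeds t
    end
    return lastindex(w)             # guard against floating-point round-off
end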

I want to do the above for many particles (Nmc = 100_000) on the GPU, so I allocate a temporary probas = CUDA.zeros(1000, Nmc) and fill it in a CUDA kernel. Finally, I walk the cumulative distribution of probas to draw the sample.

All this seems to work fine, but the performance is a bit disappointing: the GPU is barely faster than the CPU:

  • CPU = 700ms
  • GPU = 480ms

Filling probas seems to be the bottleneck: every thread writes its own column of probas to global memory.
Can one do better (shared memory or something else)?

I could try to avoid the temporary probas altogether, but that would mean computing the probabilities twice; a sketch of that variant follows.
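For reference, this is roughly what the no-temporary variant would look like inside the kernel body (a sketch reusing the names from the kernel below; the conditioned probabilities are computed twice, once for the total and once while walking the CDF, but nothing is written to global memory):

# Pass 1: total conditioned weight, nothing stored.
conditioned_proba = zero(𝒯)
@inbounds for ii in axes(field, 4)
    conditioned_proba += max(0, field[k₁, k₂, k₃, ii]) * cond[ii, ind_u]
end
# Pass 2: recompute the weights while walking the cumulative sum.
t = rand(𝒯) * conditioned_proba
cw = zero(𝒯)
ind_new = size(field, 4)            # fallback in case of round-off
@inbounds for ii in axes(field, 4)
    cw += max(0, field[k₁, k₂, k₃, ii]) * cond[ii, ind_u]
    if cw > t
        ind_new = ii
        break
    end
end
ind_u = ind_new                     # the freshly sampled index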


CUDA.@profile _sample_gpu!(result_g,
                        all_od_g,
                        cond_g,
                        probas_g,
                        )

Profiler ran for 491.04 ms, capturing 40 events.

Host-side activity: calling CUDA APIs took 52.21 µs (0.01% of the trace)
┌──────────┬────────────┬───────┬────────────────┐
│ Time (%) │ Total time │ Calls │ Name           │
├──────────┼────────────┼───────┼────────────────┤
│    0.01% │   46.01 µs │     1 │ cuLaunchKernel │
└──────────┴────────────┴───────┴────────────────┘

Device-side activity: GPU was busy for 479.51 ms (97.65% of the trace)
┌──────────┬────────────┬───────┬──────────────────────────────────────────
│ Time (%) │ Total time │ Calls │ Name                                    ⋯
├──────────┼────────────┼───────┼──────────────────────────────────────────
│   97.65% │  479.51 ms │     1 │ gpu__sample_knl_(CompilerMetadata<Dynam ⋯
└──────────┴────────────┴───────┴──────────────────────────────────────────
        
using Revise, LinearAlgebra
using CUDA
using KernelAbstractions

function _sample_gpu!(result,
                      field,
                      cond,
                      probas,
                      )
    Nnmc = size(probas, 2)   # number of particles
    npb = size(field, 4)     # size of each probability distribution
    # launch the kernel on whichever backend holds `result`
    backend = get_backend(result)
    nth = backend isa KernelAbstractions.GPU ? 1024 : 8
    kernel! = _sample_knl!(backend, nth)
    kernel!(result,
            field,
            cond,
            probas,
            npb,
            ndrange = Nnmc,
            )
    result
end

@kernel function _sample_knl!(result::AbstractArray{𝒯, 3},
                              @Const(field),
                              @Const(cond),
                              probas,
                              nd,
                              ) where 𝒯
    nₙₘ = @index(Global)      # particle index
    k₁ = k₂ = k₃ = 1
    x₁ = x₂ = x₃ = zero(𝒯)

    # number of time steps and size of each distribution
    nₜ = size(result, 3)
    nₚ = size(probas, 1)

    # initial state of the particle
    result[1, nₙₘ, 1] = x₁
    result[2, nₙₘ, 1] = x₂
    result[3, nₙₘ, 1] = x₃
    result[4, nₙₘ, 1] = 0

    ind_u = 1                 # first column of cond (0 would index out of bounds)
    @inbounds for iₜ = 2:10   # shortened for the MWE (full range: nₜ + 1)
        # advance through the field: this is the time step
        k₁ = k₂ = k₃ = Int(round(iₜ / nₜ) * 100 + 1)

        # build the conditioned (un-normalised) probabilities for this particle
        conditioned_proba = zero(𝒯)
        @inbounds for ii in axes(probas, 1)
            val_field = field[k₁, k₂, k₃, ii]
            proba = max(0, val_field) * cond[ii, ind_u]
            conditioned_proba += proba
            probas[ii, nₙₘ] = proba
        end

        # inverse-CDF sampling: walk the cumulative sum until it exceeds t
        ind_u = 1
        t = rand(𝒯) * conditioned_proba
        cw = probas[1, nₙₘ]
        while cw <= t && ind_u < nₚ
            ind_u += 1
            cw += probas[ind_u, nₙₘ]
        end

        result[1, nₙₘ, iₜ] = nₙₘ
        # save the sampled index
        result[2, nₙₘ, iₜ] = ind_u
    end
end


Nmc = 100_000
all_od_g = CUDA.rand(Float32, 101, 108, 101, 1000);   # the field of distributions
cond_g = cu(rand(0:1, 1000, 1000))                    # 0/1 conditioning masks

all_od = Array(all_od_g);
result_g = CUDA.zeros(Float32, 4, Nmc, 1000)
probas_g = CUDA.zeros(Float32, size(all_od_g, 4), Nmc)

result = Array(result_g)
probas = Array(probas_g)
cond = Array(cond_g)
res_a = _sample_gpu!(result,
                    all_od,
                    cond,
                    probas,
                )

res_g =  _sample_gpu!(result_g,
            all_od_g,
            cond_g,
            probas_g,
            )


CUDA.@profile _sample_gpu!(result_g,
                        all_od_g,
                        cond_g,
                        probas_g,
                        )
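As an aside, when timing this from the host one needs to synchronize so the kernel has actually finished, e.g.

@time CUDA.@sync _sample_gpu!(result_g, all_od_g, cond_g, probas_g)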