Hi,
I am trying to simulate particles on CUDA. I have hit a bottleneck and I am wondering if some of you have any advice.
This is a MWE of a much more sophisticated example. Basically, I have a field of probability distributions, field. For each space index k1, k2, k3, I have an (un-normalised) probability distribution field[k1,k2,k3,:]. I want to draw samples from field[k1,k2,k3,:] under some condition cond. I thus first construct the probabilities probas[i] = field[k1,k2,k3,i] * cond[i] and then sample from probas. Finally, I advance in the field by changing k1, k2, k3: this is the time step.
I want to do the above for many particles (Nmc = 100_000) on the GPU, so I allocate a temporary probas = CUDA.zeros(1000, Nmc) and fill it using a CUDA kernel. Finally, I sample from probas using its cumulative distribution.
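To make the sampling step concrete, here is a minimal single-particle CPU sketch of it (untested; draw_index is just an illustrative name, and cond is treated as a plain vector as in the description above):

# Minimal CPU sketch of one conditioned inverse-CDF draw (illustrative only).
function draw_index(field, cond, k1, k2, k3)
    # un-normalised, conditioned weights
    probas = max.(field[k1, k2, k3, :], 0) .* cond
    # draw a uniform point on the total mass and walk the cumulative sum
    t = rand() * sum(probas)
    cw = probas[1]
    i = 1
    while cw <= t && i < length(probas)
        i += 1
        cw += probas[i]
    end
    return i
end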
All this seems to work fine, but the performance is a bit disappointing; the GPU is barely faster than the CPU:
- CPU = 700ms
- GPU = 480ms
Filling probas seems to be the bottleneck: every thread writes its own column of probas.
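Back-of-the-envelope (with the sizes below), the temporary alone is 1000 × Nmc × 4 bytes ≈ 400 MB, written in full at every time step and partly read back during the cumulative walk, so the kernel is presumably memory-bound.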
Can one do better (shared memory or something else)? I could try to avoid the temporary probas, but that would mean computing the probabilities twice.
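Spelled out (untested sketch, reusing the names of the kernel further down, with nd the number of bins), the inner loop of that temporary-free variant would trade the probas writes for a second pass over field:

# Untested sketch of the temporary-free variant: the conditioned weight
# is recomputed in a second pass instead of being stored in probas.
total = zero(T)
@inbounds for ii in 1:nd
    total += max(0, field[k1, k2, k3, ii]) * cond[ii, ind_u]
end
t = rand(T) * total
ind_new = 1
cw = max(0, field[k1, k2, k3, 1]) * cond[1, ind_u]
@inbounds while cw <= t && ind_new < nd
    ind_new += 1
    cw += max(0, field[k1, k2, k3, ind_new]) * cond[ind_new, ind_u]
end
ind_u = ind_new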
CUDA.@profile _sample_gpu!(result_g,
                           all_od_g,
                           cond_g,
                           probas_g,
                           )
Profiler ran for 491.04 ms, capturing 40 events.
Host-side activity: calling CUDA APIs took 52.21 µs (0.01% of the trace)
┌──────────┬────────────┬───────┬────────────────┐
│ Time (%) │ Total time │ Calls │ Name           │
├──────────┼────────────┼───────┼────────────────┤
│    0.01% │   46.01 µs │     1 │ cuLaunchKernel │
└──────────┴────────────┴───────┴────────────────┘
Device-side activity: GPU was busy for 479.51 ms (97.65% of the trace)
┌──────────┬────────────┬───────┬──────────────────────────────────────────┐
│ Time (%) │ Total time │ Calls │ Name                                     ⋯
├──────────┼────────────┼───────┼──────────────────────────────────────────┤
│   97.65% │  479.51 ms │     1 │ gpu__sample_knl_(CompilerMetadata<Dynam  ⋯
└──────────┴────────────┴───────┴──────────────────────────────────────────┘
using Revise, LinearAlgebra
using CUDA
using KernelAbstractions
function _sample_gpu!(result,
                      field,
                      cond,
                      probas,
                      )
    Nnmc = size(probas, 2)  # number of Monte-Carlo particles
    npb = size(field, 4)    # number of probability bins
    # launch gpu kernel, one work-item per particle
    backend = get_backend(result)
    nth = backend isa KernelAbstractions.GPU ? 1024 : 8
    kernel! = _sample_knl!(backend, nth)
    kernel!(result,
            field,
            cond,
            probas,
            npb;
            ndrange = Nnmc,
            )
    result
end
@kernel function _sample_knl!(result::AbstractArray{T, 3},
                              @Const(field),
                              @Const(cond),
                              probas,
                              nd,
                              ) where T
    n_mc = @index(Global)  # particle index
    k1 = k2 = k3 = 1       # position in the field
    x1 = x2 = x3 = zero(T)
    # number of time steps
    n_t = size(result, 3)
    n_p = size(probas, 1)  # number of probability bins
    result[1, n_mc, 1] = x1
    result[2, n_mc, 1] = x2
    result[3, n_mc, 1] = x3
    result[4, n_mc, 1] = 0
    # leftover from the argmax of field[k1, k2, k3, :] in the full code (unused here)
    _val_max::Float32 = 0f0
    # previously sampled index; start at 1 so that cond[:, ind_u] is in bounds
    ind_u = 1
    @inbounds for i_t = 2:10 # n_t + 1
        k1 = k2 = k3 = Int(round(i_t / n_t) * 100 + 1)
        # sampling
        total_proba = proba = zero(T)
        conditioned_proba = proba_max = zero(T)
        ind_max = 0  # proba_max / ind_max are leftovers from the full code
        @inbounds for ii in axes(probas, 1)
            val_field = field[k1, k2, k3, ii]
            proba0 = max(0, val_field)
            proba = proba0 * cond[ii, ind_u]
            # keep track of conditional probabilities
            conditioned_proba += proba
            total_proba += proba0
            probas[ii, n_mc] = proba
        end
        # inverse-CDF draw on the un-normalised conditioned weights
        ind_u = 1
        t = rand(T) * conditioned_proba
        cw = probas[1, n_mc]
        while cw <= t && ind_u < n_p
            ind_u += 1
            cw += probas[ind_u, n_mc]
        end
        result[1, n_mc, i_t] = n_mc
        # save rv
        result[2, n_mc, i_t] = ind_u
    end
end
Nmc = 100_000
all_od_g = CUDA.rand(Float32, 101, 108, 101, 1000);     # the field of distributions
cond_g = cu(rand(0:1, 1000, 1000))                      # the condition
all_od = Array(all_od_g);
result_g = CUDA.zeros(Float32, 4, Nmc, 1000)
probas_g = CUDA.zeros(Float32, size(all_od_g, 4), Nmc)  # temporary probabilities
result = Array(result_g)
probas = Array(probas_g)
cond = Array(cond_g)
# CPU run (KernelAbstractions CPU backend)
res_a = _sample_gpu!(result,
                     all_od,
                     cond,
                     probas,
                     )
# GPU run
res_g = _sample_gpu!(result_g,
                     all_od_g,
                     cond_g,
                     probas_g,
                     )
CUDA.@profile _sample_gpu!(result_g,
                           all_od_g,
                           cond_g,
                           probas_g,
                           )
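One caveat on the CPU/GPU numbers above: CUDA.jl kernel launches are asynchronous, so wall-clock comparisons must synchronize. A minimal sketch of the measurement, assuming BenchmarkTools is installed:

using BenchmarkTools
# CPU timing
@btime _sample_gpu!($result, $all_od, $cond, $probas);
# GPU timing: CUDA.@sync waits for the kernel; otherwise only the launch is timed
@btime CUDA.@sync _sample_gpu!($result_g, $all_od_g, $cond_g, $probas_g);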