Hello, I am trying to write an ODE solver on the GPU. I am solving the same simple ODE, but for many different sets of initial data. I tried the EnsembleGPUArray
option from DiffEqGPU.jl, but it was quite slow.
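For context, that attempt followed the usual DiffEqGPU ensemble pattern, roughly like the sketch below (a toy RHS stands in for my actual one, and depending on the DiffEqGPU version EnsembleGPUArray may or may not take a backend argument):

using DiffEqGPU, OrdinaryDiffEq, CUDA

# Toy in-place RHS standing in for the real one.
function toy_rhs!(du, u, p, t)
    du[1] = p[1] * (u[2] - u[1])
    du[2] = u[1] * (p[2] - u[3]) - u[2]
    du[3] = u[1] * u[2] - p[3] * u[3]
    return nothing
end

u0 = Float32[1.0, 0.0, 0.0]
p = (10.0f0, 28.0f0, 8/3f0)
prob = ODEProblem(toy_rhs!, u0, (0.0f0, 10.0f0), p)
# every trajectory gets its own initial data
prob_func = (prob, i, repeat) -> remake(prob, u0 = rand(Float32, 3))
monteprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)
sol = solve(monteprob, Tsit5(), EnsembleGPUArray(CUDA.CUDABackend()),
            trajectories = 10_000, saveat = 1.0f0)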
The kernel I wrote reads three CellArray objects: the first holds the state vectors, so its cells are of type SVector{5,Float32}; the second is just a scratch register for the Euler step (a copy of the first); and the third holds the parameters, with cells of type SVector{3,Float32}. Using these objects, the kernel performs a single Euler step. On the CPU, evaluating the RHS of my ODE for one state vector takes about 70 ns.
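My actual RHS (kerr_geod) is not shown; what matters for the kernel is only its signature. A hypothetical stand-in with the same shape would be:

using StaticArrays

# Hypothetical stand-in for kerr_geod: takes the state (SVector{5,Float32}) and the
# parameters (SVector{3,Float32}) and returns a new SVector{5,Float32}, no mutation.
dummy_rhs(u::SVector{5,Float32}, p::SVector{3,Float32}) =
    SVector{5,Float32}(u[2], u[3], u[4], u[5], p[1] * u[1])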
The effective bandwidth on my Tesla K40c scales as shown in the attached Teff plot (produced by the plot_diagnostic function at the end of this post). For comparison, the saxpy kernel
function _saxpy_kernel!(a, x, y)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(y)
        @inbounds y[i] = a * x[i] + y[i]
    end
    return nothing
end
gives Teff ≈ 178 GB/s, so the ODE kernel is much slower.
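Teff here is the usual effective bandwidth; for saxpy I count two Float32 reads (x and y) and one write (y) per element. A minimal sketch of that accounting (helper names are mine, analogous to Aeff in the diagnostic script further down):

saxpy_gb(n)      = 3 * n * sizeof(Float32) * 1e-9   # GB moved: read x, read y, write y
saxpy_teff(n, t) = saxpy_gb(n) / t                  # effective bandwidth [GB/s], t in seconds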
Could you please help me understand what I’m doing, or even thinking, wrong?
using CUDA, CellArrays, StaticArrays   # CellArrays.jl provides CuCellArray

function evolve_gpu_kernel!(photons, photon_params, RHS, time_stepper, params)
    N, M, tmps, dt = params
    # 2D global thread index
    ix = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    iy = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    if ix <= N && iy <= M
        # advance one cell (one photon state) by a single Euler step
        photons[ix, iy] = time_stepper(RHS, tmps[ix, iy], photons[ix, iy], photon_params[ix, iy], dt)
    end
    return
end
function Euler(rhs, du, u, p, dt)
    du = rhs(u, p)          # rebinds du to the new SVector returned by rhs
    return u .+ du .* dt
end
function run(p, nthreads = nothing)
    metric, N, M = p
    θ0 = deg2rad(87.5)
    alims = (-9, 9)    ### pθ/pt impact param
    blims = (-10, 10)  ### -pφ/pt impact param
    pos = SVector{D,T}([0, init_r, θ0, π/2])   # D, T, init_r are defined elsewhere (not shown)
    inputs, ps = generate_rays_gpu(pos, metric, N, M, alims, blims)
    photons_tmp = Matrix(reduce(hcat, vec(inputs))') |> cu
    photon_params_tmp = Matrix(reduce(hcat, vec(ps))') |> cu
    photons = CuCellArray{SVector{5,Float32}}(undef, N, M)
    tmp = CuCellArray{SVector{5,Float32}}(undef, N, M)
    photon_params = CuCellArray{SVector{3,Float32}}(undef, N, M)
    photons.data .= photons_tmp
    photon_params.data .= photon_params_tmp
    CUDA.unsafe_free!(photons_tmp)
    photons_tmp = nothing
    CUDA.unsafe_free!(photon_params_tmp)
    photon_params_tmp = nothing
    RHS = kerr_geod
    time_stepper = Euler
    dt = 0.01f0
    if isnothing(nthreads)
        MAX_THREADS_PER_BLOCK = CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)
        nthreads = (isqrt(MAX_THREADS_PER_BLOCK), isqrt(MAX_THREADS_PER_BLOCK))
    end
    nblocks = cld.((N, M), nthreads)
    params = (N, M, tmp, dt)
    t = CUDA.@elapsed CUDA.@sync @cuda(threads = nthreads, blocks = nblocks,
        evolve_gpu_kernel!(photons, photon_params, RHS, time_stepper, params))
    return t
end
using Plots, LaTeXStrings

function plot_diagnostic()
    idxs = 4:10
    x = 2 .^ idxs
    labels = [latexstring("2^{$idx}") for idx in idxs]
    time = n -> run((kerr_metric_bl, n, n), (16, 16))
    t = time.(x)
    # GB moved per step: photons read + written (2*5 Float32) and params read (3 Float32) per cell
    Aeff(N) = (2*5 + 3) * 1e-9 * N^2 * sizeof(Float32)
    Teff = @. Aeff(x) / t
    plot(x, t .* 1e3,
         xlabel = "N", ylabel = "Wall Time [ms]",
         label = nothing, tickfontsize = 14, dpi = 144,
         xticks = (x, labels), xaxis = :log2)
    scatter!(x, t .* 1e3, label = nothing)
    savefig("wall_times.png")
    plot(x, Teff,
         xlabel = "N", ylabel = L"$T_{\mathrm{eff}}$ [GB/s]",
         label = nothing,
         xticks = (x, labels), xaxis = :log2)
    scatter!(x, Teff, label = nothing)
    savefig("Teff.png")
end