Okay, I got it a bit faster:
Initial version: 2.363006s
1.0s → inserting the forgotten broadcast dots (without them the kernels were only partially fused, causing temporary allocations); see the sketch below
0.7s → using sum(x) instead of sum(x, 1); this possibly no longer generalizes, but the current version is 1D anyway
0.4s → moving everything into one GPU kernel (final version below)
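For illustration, here is a minimal sketch of the fusion point (the array x and the operations are made up, not the simulation code): every dotted call fuses into a single kernel, while an undotted operation launches its own kernel and allocates an intermediate. The sum(x, 1) form also returns a 1-element array instead of a scalar and apparently hit a slower path here.

using CLArrays   # assumed setup, just for this sketch
x = CLArray(rand(Float32, 10^6))

y = 2f0 .* x .+ sqrt.(x)    # fully fused: one GPU kernel, no temporaries
z = 2f0 * x + sqrt.(x)      # each undotted op runs separately and allocates a temporary

s  = sum(x)                 # scalar reduction
s1 = sum(x, 1)              # 1-element array; the slower variant in my timings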
final version:
function clstep!(state, randstate, ratesum, kappa, y_current, rates, dt)
    i = @linearidx(y_current, state)
    yi = y_current[i]        # temp variable to avoid repeated global-memory reads
    ytmp = yi + dt * (F(yi) + kappa)
    rate = (rates[i] = f(ytmp))
    rnd = GPUArrays.gpu_rand(Float32, state, randstate)
    y_current[i] = ytmp * (rate < -log(rnd) / dt)   # branchless: the Bool multiplies as 0/1
    return
end
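As an aside, that last assignment is a branchless conditional: the comparison yields a Bool, which promotes to 0 or 1 in the multiplication, so no divergent branch is needed. A scalar sketch of the same idea (variable names made up):

accept = rate < -log(rnd) / dt   # condition for keeping the Euler update
ynew = ytmp * accept             # keeps ytmp when accept is true, 0 otherwise
# equivalent to: ynew = accept ? ytmp : zero(ytmp), but without a branch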
function euler_cl(T::TY, dt::TY, n::Int32, y0, plot_ = false)
    println("\n--> Euler algo, up to T = ", n * dt)
    n = min(n, Int(ceil(T / dt)))
    println("OK")
    y_current = copy(y0)
    rndcl = copy(y0)   # to hold randomness
    rates = f.(y_current)
    println("OK")
    kappa = sqrt.(2 * H0 * sum(rates) / N) .* w
    nb_jumps = 0
    y_current .= y_current .+ dt .* (F.(y_current) .+ kappa)
    println("OK for Euler step")
    # variables for saving data; a stays empty now that push!(a, kappa) was removed (see note below)
    tout = TY[]
    a = TY[]
    randstate = GPUArrays.cached_state(y_current)
    for ii = 1:n
        ratesum = sum(rates)
        kappa = sqrt(2f0 * H0 * ratesum / N) .* w
        gpu_call(clstep!, y_current, (randstate, ratesum, kappa, y_current, rates, dt))
        # save data
        push!(tout, ii * dt)
    end
    return tout, a
end
#######################################################################################
xc0 = rand(TY, N) |> CLArray;
parms = TY.([0., 0.])
dt = TY(1. / (5.4N))
t, a_tot = euler_cl(TY(5.5), dt*2, Int32(10), xc0);         # warm-up run to compile the kernels
t, a_tot = @time euler_cl(TY(5.5), dt, Int32(1000), xc0);   # timed run
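For timings less sensitive to run-to-run noise than a single @time, BenchmarkTools (assuming it is installed) reports statistics over many samples:

using BenchmarkTools
@btime euler_cl(TY(5.5), $dt, Int32(1000), $xc0);   # interpolate globals with $ for accurate timing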
I removed the push!(a, kappa) since there was a type mismatch; I guess that shouldn't change anything for the benchmarks. A possible fix is sketched below.
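If the kappa trace is wanted back, a hedged guess at the fix, assuming kappa is (or reduces to) a scalar convertible to TY:

push!(a, TY(kappa))   # convert to the eltype of a::Vector{TY} before pushing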
Best,
Simon