Saving values from GPU during Euler stepping in ODE

Okay I got it a bit faster:

Initial version: 2.363006s

1.0s → inserted forgotten broadcast dots (the missing dots meant only partially fused kernels and temporary allocations; see the sketch right after this list)
0.7s → used sum(x) instead of sum(x, 1); this possibly doesn’t generalize anymore, but the current version is 1D anyway
0.4s → moved everything into one GPU kernel
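
To illustrate the broadcast-dot point for anyone following along: in Julia, every operator in an elementwise expression needs its dot (or an @. in front) for the whole thing to fuse into a single kernel; one undotted + or * splits the computation and allocates temporaries. This is a made-up example, not the real F/kappa from this model:

# made-up example of the fusion issue, not the real F/kappa from the model
y = rand(Float32, 10^6)
F(x) = x - x^3
kappa = 0.1f0
dt = 1f-3

y1 = y + dt * (F.(y) .+ kappa)    # `+` and `*` not dotted: extra temporaries (and, on a GPU array, several kernel launches)
y2 = y .+ dt .* (F.(y) .+ kappa)  # fully dotted: fuses into a single broadcast kernel
y3 = @. y + dt * (F(y) + kappa)   # same as y2, written with @.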

final version:


function clstep!(state, randstate, ratesum, kappa, y_current, rates, dt)
    i = @linearidx(y_current, state)
    yi = y_current[i] # tmp variable to avoid needlessly reading the array multiple times
    ytmp = yi + dt * (F(yi) + kappa)               # explicit Euler step
    rate = (rates[i] = f(ytmp))                    # store the new rate and keep a local copy
    rnd = GPUArrays.gpu_rand(Float32, state, randstate)
    y_current[i] = ytmp * (rate < -log(rnd) / dt)  # keep ytmp only if the rate stays below -log(rnd)/dt, otherwise reset to 0 (jump)
    return
end
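
Reading the kernel off line by line, each index i performs

$$\tilde y_i = y_i + \mathrm{d}t\,\bigl(F(y_i) + \kappa\bigr), \qquad y_i \leftarrow \tilde y_i \,\mathbf{1}\{\, f(\tilde y_i) < -\log(u)/\mathrm{d}t \,\}, \qquad u \sim \mathcal{U}(0,1),$$

i.e. the Euler update is kept as long as the new rate f(ỹᵢ) stays below the exponential threshold, and the value is reset to zero otherwise; the new rate is also written into rates[i] so that the next step's sum(rates) sees it.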

function euler_cl(T::TY, dt::TY, n::Int32, y0, plot_ = false)
    println("\n--> Euler algo, up to T = ", n * dt)
    n = min(n, Int(ceil(T / dt)))
    println("OK")
    y_current = copy(y0)
    rndcl     = copy(y0) # to hold randomness
    rates     = f.(y_current)
    println("OK")
    kappa     = sqrt.(2 * H0 * sum(rates) / N) .* w
    nb_jumps  = 0
    y_current .= y_current .+ dt .* (F.(y_current) .+ kappa)
    println("OK for Euler step")

    # variables for saving data
    tout = TY[]
    a = TY[]   # stays empty for now, see the note about push!(a, kappa) below
    randstate = GPUArrays.cached_state(y_current)
    for ii = 1:n
        ratesum = sum(rates)
        kappa = sqrt(2f0 * H0 * ratesum ./ N) .* w
        gpu_call(clstep!, y_current, (randstate, ratesum, kappa, y_current, rates, dt))
        # save data
        push!(tout, ii * dt)
    end
    return tout, a
end
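
Since the original question was about saving values from the GPU during the stepping, here is one hedged sketch of how the loop in euler_cl could store results without hurting the timings too much: push a cheap scalar every iteration and copy the full state back to the host only every save_every iterations (save_every, ys and rs are names I made up; device-to-host copies are comparatively expensive, so one per step is usually too many). This is not the code used for the benchmark above.

# hypothetical saving scheme for the loop in euler_cl, not the benchmarked code
save_every = 100
ys = Vector{Vector{TY}}()   # occasional full-state snapshots, host side
rs = TY[]                   # cheap per-step scalar summary
for ii = 1:n
    ratesum = sum(rates)              # a plain number after the reduction
    push!(rs, TY(ratesum))
    kappa = sqrt(2f0 * H0 * ratesum ./ N) .* w
    gpu_call(clstep!, y_current, (randstate, ratesum, kappa, y_current, rates, dt))
    push!(tout, ii * dt)
    if ii % save_every == 0
        push!(ys, Array(y_current))   # device → host copy, only occasionally
    end
end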

#######################################################################################
xc0 = rand(TY, N) |> CLArray;

parms = TY.([0., 0.])
dt = TY(1. / (5.4N))
t, a_tot = euler_cl(TY(5.5), dt * 2, Int32(10), xc0);

t, a_tot = @time euler_cl(TY(5.5), dt, Int32(1000), xc0);
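
For anyone who wants to run the snippet above: it relies on globals defined earlier in the thread (TY, N, H0, w, f, F and the CLArrays/GPUArrays setup). Purely as hypothetical stand-ins with made-up values, something along these lines would make it self-contained:

# made-up stand-ins for the globals the script assumes; the real definitions are earlier in the thread
using CLArrays, GPUArrays
const TY = Float32
const N  = 10^5
const H0 = 1f0
const w  = 1f0
f(x) = abs(x)     # placeholder rate function
F(x) = x - x^3    # placeholder drift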

I removed the push!(a, kappa) since there was a type mismatch… I guess that shouldn’t change anything for the benchmarks…
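
If you do want to keep saving kappa every step, a hedged guess (I don't know the exact types in play here): either convert it explicitly to TY before pushing, or, if it actually lives on the GPU, copy it back to the host into a container with a matching element type.

# hypothetical fixes for the push!(a, kappa) type mismatch, depending on what kappa is
push!(a, TY(kappa))              # if kappa is a plain scalar
# a_vec = Vector{Vector{TY}}()   # alternative container if kappa is a GPU array ...
# push!(a_vec, Array(kappa))     # ... copy it back to the host before storing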

Best,
Simon
