Parallelization of estimation loop or using GPU for bayesian neural ODE

I would like to improve the speed process of this following loop using wether gpu or CPU. in case using CPU, What is the best way nowadays to parallelize it ? Could you please help to include it in the code ?
Regards

for i in 1:58
    t=info[i,10]
    q0=info[i,9]
    N=info[i,7]
    TAU=info[i,6]
    truth=prod[1:convert(Int64,t),i+1]
    tspan=(1,t) 
    tsteps = 1:1:t
    u0 = [q0]
    p=[N,TAU]
    prob1  = ODEProblem(SEM!,u0,tspan,p)
    ode_data= solve(prob1,Tsit5(),saveat = tsteps)
    pl=scatter(tsteps,ode_data[1,:],color = :red, label = "Proxy Fitting", xlabel = "t", title = " Neural ODE",dpi=300)
    #plot!(tsteps,ode_data_T[1,:],color = :blue, label = "Turth Arps Fitting")
    plot!(tsteps,truth,dpi=300)
    ode_data =0.001*ode_data
    dudt= FastChain(FastDense(1, 40, tanh),
                      FastDense(40, 1))

    prob_neuralode = NeuralODE(dudt, tspan, Tsit5(), saveat = tsteps)

    function predict_neuralode(p)
        Array(prob_neuralode(u0*0.001, p))
    end

    function loss_neuralode(p)
        pred = predict_neuralode(p)
        loss = sum(abs2, ode_data .- pred)
        return loss, pred
    end

    l(θ) = -sum(abs2, ode_data .- predict_neuralode(θ)) - sum(θ .* θ)


    function dldθ(θ)
        x,lambda = Flux.Zygote.pullback(l,θ)
        grad = first(lambda(1))
        return x, grad
    end

    metric  = DiagEuclideanMetric(length(prob_neuralode.p))

    h = Hamiltonian(metric, l, dldθ)


    integrator = Leapfrog(find_good_stepsize(h, Float64.(prob_neuralode.p)))


    prop = AdvancedHMC.NUTS{MultinomialTS, GeneralisedNoUTurn}(integrator)

    adaptor = StanHMCAdaptor(MassMatrixAdaptor(metric), StepSizeAdaptor(0.8, prop.integrator))

    samples, stats = sample(h, prop, Float64.(prob_neuralode.p), 500, adaptor, 200; progress=true)


    losses = map(x-> x[1],[loss_neuralode(samples[i]) for i in 1:length(samples)])

    ##################### PLOTS: LOSSES ###############
    #scatter(losses, ylabel = "Loss",  yscale= :log, label = "Architecture1: 500 warmup, 500 sample")

    ################### RETRODICTED PLOTS: TIME SERIES #################
    #pl = scatter(tsteps, ode_data, color = :red, label = "Data: Var1", xlabel = "t", title = "Spiral Neural ODE")
    pl=scatter(tsteps,truth,color = :blue, label = "Actual", xlabel = "Time (months)",
        ylabel = "Oil Rate (STB/month)",title = "SEDM Neural ODE")
    scatter!(tsteps,ode_data[1,:]/0.001,color = :red, label = "SEDM Fitting",dpi=300)
    for k in 1:500
        resol = predict_neuralode(samples[100:end][rand(1:400)])
        plot!(tsteps,resol[1,:]/0.001, alpha=0.04, color = :red, label = "")
    end

    idx = findmin(losses)[2]
    prediction = predict_neuralode(samples[idx])
    plot!(tsteps,prediction[1,:]/0.001, color = :black, w = 2, label = "Best fit prediction",dpi=300)
    savefig("C:/Julia_data/ID"*string(i)*".pdf")
end

info is not defined. But I can almost certainly guarantee that GPUs will not be faster in this case.