Hellos,
I noticed that with one line definition for ArrayFire.jl
I can make it run together with DifferentialEquations.jl
, but it is not working as expected.
DifferentialEquations is calling a generic multiplication function generic_matvecmul!
instead of using ArrayFire *(matmul)
and I would like to know how to fix it (if possible) to get all the performance of the GPU.
Here a minimal example of my problem:
# Don't worry about this part
# It is necessary to define the matrix "M" below
function _sph2cart(θ, ϕ, r)
N = length(θ)
x = Array{Float64}(undef, N)
y = Array{Float64}(undef, N)
z = Array{Float64}(undef, N)
x = r*cos(θ)*cos(ϕ)
y = r*sin(θ)*cos(ϕ)
z = r*sin(ϕ)
return x,y,z
end
function _RandPoints_Sphere(N::Int64, Radius, MinDist )
X = Array{Float64}(undef, N)
Y = Array{Float64}(undef, N)
Z = Array{Float64}(undef, N)
dist_3 = MinDist^3
iLoop = 1
nValid = 0
while nValid < N && iLoop < 10^6
rvals = 2*rand() - 1
elevation = asin(rvals)
azimuth = 2π*rand()
radii = Radius*(rand()^(1 ./3.))
newX, newY, newZ = _sph2cart(azimuth, elevation, radii)
if all(( (X[1:nValid].- newX).^2 + (Y[1:nValid].- newY).^2 + (Z[1:nValid].- newZ).^2 ) .>= dist_3)
nValid = nValid + 1
X[nValid] = newX
Y[nValid] = newY
Z[nValid] = newZ
end
iLoop = iLoop + 1
end
if nValid < N
error("Could not generate all data after 10M interactions")
end
X_jk = X*ones(1,N) - ones(N,1)*X';
Y_jk = Y*ones(1,N) - ones(N,1)*Y';
Z_jk = Z*ones(1,N) - ones(N,1)*Z';
R_jk = sqrt.(X_jk.^2 + Y_jk.^2 + Z_jk.^2);
X_jk = 1;Y_jk=1; Z_jk=1; GC.gc()
return [X Y Z],R_jk
end
# --------------------------------------------
using ArrayFire, BenchmarkTools, DifferentialEquations, LinearAlgebra
# We need to define any() with AFArray
Base.any(f::Function, a::AFArray) = Bool(sum(f.(a)))
# Minimal code for what I want to do
function Foo!(du,u,p,t)
M = p[1]
Ω = p[2]
du[:] = M*u + Ω
end
# just another attempt
Goo(u, p,t) = M_gpu*u + Ω_gpu
N = 10 # matrix size
r, R_jk = _RandPoints_Sphere(N, 10, 0 )
M = Array{Complex{Float64}}(undef, N,N)
@. M = -(1/2)*exp(im*R_jk)/(im*R_jk)
M[diagind(M)] .= 1im*1 - 1/2
Ω = -(im/2)*exp.(1im.*r[:,3])
M_gpu = AFArray(M)
Ω_gpu = AFArray(Ω)
p = []
push!(p, M)
push!(p, Ω)
p_gpu = []
push!(p_gpu, M_gpu)
push!(p_gpu, Ω_gpu)
println("----")
u0 = zeros(ComplexF64,length(Ω))
tspan = (0.0,100.0)
prob = ODEProblem(Foo!,u0,tspan,p)
@btime sol = DifferentialEquations.solve(prob,Tsit5())
u0_gpu = AFArray(zeros(ComplexF64,length(Ω)))
tspan = (0.0,100.0)
prob_gpu1 = ODEProblem(Foo!,u0,tspan, p_gpu)
@btime sol_gpu1 = DifferentialEquations.solve(prob_gpu1,Tsit5())
u0_gpu = AFArray(zeros(ComplexF64,length(Ω)))
tspan = (0.0,100.0)
prob_gpu2 = ODEProblem(Goo,u0,tspan)
@btime sol_gpu2 = DifferentialEquations.solve(prob_gpu2,Tsit5())
Results:
ArrayFire v3.6.4 (OpenCL, 64-bit Linux, build 1b8030c)
[0] AMD: gfx900, 8176 MB
----
158.905 μs (1412 allocations: 293.16 KiB)
830.935 ms (91112 allocations: 5.08 MiB)
825.654 ms (99245 allocations: 5.99 MiB)