I am writing my first GPU kernel, which essentially boils down to the simplified functions below. I only get roughly a 1.7× speedup over the multi-threaded version (1.005 ms vs. 579 μs, timings at the end), so I am wondering whether, and how, the performance of the GPU version can be improved.
using CUDA
using BenchmarkTools  # for @btime below
N = Int(2^20)
F = fill(false, N)
Q = fill(true, N)
d = rand(N)
Fd, Qd = CuArray(F), CuArray(Q)
dd = CuArray(d)
Δ = rand()
function update!(F, Q, dist, Δ)
    @inbounds for i in eachindex(F)
        F[i] = false
        if (Q[i] == true) && (dist[i] ≤ Δ)
            F[i] = true
        end
    end
end
function update_thread!(F, Q, dist, Δ)
    Threads.@threads for i in eachindex(F)
        @inbounds F[i] = false
        @inbounds if (Q[i] == true) && (dist[i] ≤ Δ)
            F[i] = true
        end
    end
end
function _gpu_update!(F, Q, dist, Δ)
    # Each thread starts at its index within the block and strides by the
    # block size, so a single-block launch covers the whole array.
    index = threadIdx().x
    stride = blockDim().x
    @inbounds for i = index:stride:length(F)
        F[i] = false
        if (Q[i] == true) && (dist[i] ≤ Δ)
            F[i] = true
        end
    end
    return nothing
end
function gpu_update!(F::CuArray{Bool}, Q::CuArray{Bool}, dist::CuArray{T}, Δ) where T
    CUDA.@sync begin
        @cuda threads = 1024 _gpu_update!(F, Q, dist, Δ)
    end
end
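
For reference, the launcher above starts a single block of 1024 threads, so the strided loop in _gpu_update! walks all 2^20 elements with only 1024 threads in flight. A minimal sketch of a multi-block variant is shown below, assuming CUDA.jl's launch_configuration occupancy helper and a hypothetical kernel name _gpu_update_grid!; it is not benchmarked here.

# Hypothetical grid-stride variant: the start index and the stride span the
# whole grid rather than a single block.
function _gpu_update_grid!(F, Q, dist, Δ)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = gridDim().x * blockDim().x
    @inbounds for i = index:stride:length(F)
        F[i] = Q[i] && (dist[i] ≤ Δ)
    end
    return nothing
end

function gpu_update_grid!(F::CuArray{Bool}, Q::CuArray{Bool}, dist::CuArray{T}, Δ) where T
    kernel = @cuda launch = false _gpu_update_grid!(F, Q, dist, Δ)
    config = launch_configuration(kernel.fun)   # occupancy-based suggestion
    threads = min(length(F), config.threads)
    blocks = cld(length(F), threads)
    CUDA.@sync kernel(F, Q, dist, Δ; threads = threads, blocks = blocks)
end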
@btime update!($F, $Q, $d, $Δ); # 4.062 ms (0 allocations: 0 bytes)
@btime update_thread!($F, $Q, $d, $Δ); # 1.005 ms (20 allocations: 1.94 KiB) on 4 threads
@btime gpu_update!($Fd, $Qd, $dd, $Δ); # 579.145 μs (179 allocations: 5.77 KiB)
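
As a cross-check, the same update can also be written as a single fused broadcast, which CUDA.jl should compile to one kernel launch. A minimal sketch using the arrays defined above and a hypothetical name gpu_update_bcast!:

gpu_update_bcast!(F, Q, dist, Δ) = CUDA.@sync F .= Q .& (dist .≤ Δ)  # one fused GPU kernel

gpu_update_bcast!(Fd, Qd, dd, Δ)
Array(Fd) == (Q .& (d .≤ Δ))  # sanity check against the host arrays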