Hello,
As part of learning to use CUDA in Julia, I wrote the MWE below to compare the performance of a threaded CPU loop, CUDA array (broadcast) programming, and a hand-written CUDA kernel for a simple element-wise multiplication of two 3-D arrays.
As expected, the CUDA array-programming version is significantly faster than the triple-nested threaded CPU loop.
However, the CUDA kernel that uses the same triple-nested loop structure is far slower than even the CPU version.
I could not find any examples of nested loops inside CUDA kernels in the CUDA.jl documentation or on this Discourse, so I would like to ask what I am doing wrong and what the recommended approach is.
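For reference, I have also sketched two variants I was unsure about as comments in the code below: a flat 1-D grid-stride kernel over the linear index, and a 3-D launch for the nested-loops kernel. Neither is benchmarked in the output shown here.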
julia> include("test_CUDA_v1.jl")
run_CPU!
26.030 ms (62 allocations: 7.84 KiB)
z[1:5, 1, 1] = ComplexF64[-1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im]
run_GPU!
1.366 ms (119 allocations: 3.72 KiB)
z[1:5, 1, 1] = ComplexF64[-1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im]
Run GPU kernel: test_nested_loops_GPU!
2.198 s (33 allocations: 592 bytes)
z[1:5, 1, 1] = ComplexF64[-1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im]
# test_CUDA_v1.jl
using BenchmarkTools
using CUDA
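# Threaded CPU reference: element-wise complex multiplication over a 3-D array,
# with the outermost (slice) loop split across Julia threads.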
function run_CPU!(
    z::Array{Complex{Float64}, 3},
    y::Array{Complex{Float64}, 3},
    x::Array{Complex{Float64}, 3},
    n_rows::Int,
    n_cols::Int,
    n_slcs::Int
)
    Threads.@threads for m in 1:n_slcs
        for l in 1:n_cols
            for k in 1:n_rows
                # @inbounds sits on the assignment itself: placed in front of
                # Threads.@threads it does not reach into the closure @threads creates
                @inbounds z[k, l, m] = y[k, l, m] * x[k, l, m]
            end
        end
    end
    return nothing
end
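# GPU array programming: the same element-wise multiplication expressed as a single
# fused broadcast on the device (Complex{Float32} arrays).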
function run_GPU!(
    z_cu::CuArray{Complex{Float32}, 3},
    y_cu::CuArray{Complex{Float32}, 3},
    x_cu::CuArray{Complex{Float32}, 3}
)
    CUDA.@sync @inbounds z_cu .= y_cu .* x_cu
    return nothing
end
# GPU kernel: triple-nested grid-stride loops over rows, columns, and slices
function nested_loops_GPU!(
    z_cu,
    y_cu,
    x_cu
)
    # X axis
    index_x = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
    stride_x = gridDim().x * blockDim().x
    # Y axis
    index_y = (blockIdx().y - Int32(1)) * blockDim().y + threadIdx().y
    stride_y = gridDim().y * blockDim().y
    # Z axis
    index_z = (blockIdx().z - Int32(1)) * blockDim().z + threadIdx().z
    stride_z = gridDim().z * blockDim().z
    k = index_z
    while k <= size(z_cu, 3)
        j = index_y
        while j <= size(z_cu, 2)
            i = index_x
            while i <= size(z_cu, 1)
                @inbounds z_cu[i, j, k] = y_cu[i, j, k] * x_cu[i, j, k]
                i += stride_x
            end
            j += stride_y
        end
        k += stride_z
    end
    return nothing
end
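# (For reference: the only kernel pattern I have found in the introductory CUDA.jl
#  examples is a flat 1-D grid-stride loop over the linear index, roughly as sketched
#  below. I have not benchmarked this variant -- is this the recommended pattern for
#  3-D arrays?)
function flat_loop_GPU!(
    z_cu,
    y_cu,
    x_cu
)
    index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
    stride = gridDim().x * blockDim().x
    i = index
    while i <= length(z_cu)
        @inbounds z_cu[i] = y_cu[i] * x_cu[i]
        i += stride
    end
    return nothing
end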
function test_nested_loops_GPU!(
    z_cu::CuArray{Complex{Float32}, 3},
    y_cu::CuArray{Complex{Float32}, 3},
    x_cu::CuArray{Complex{Float32}, 3}
)
    kernel =
        @cuda launch=false nested_loops_GPU!(
            z_cu,
            y_cu,
            x_cu
        )
    config = launch_configuration(kernel.fun)
    # 1-D launch: all threads and blocks are along the x dimension
    threads = min(length(z_cu), config.threads)
    blocks = cld(length(z_cu), threads)
    # println("threads: ", threads)
    # println("blocks: ", blocks)
    CUDA.@sync begin
        kernel(
            z_cu,
            y_cu,
            x_cu;
            threads,
            blocks
        )
    end
end
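# (Another guess: should nested_loops_GPU! be launched with a 3-D grid instead of the
#  1-D launch above, where blockDim and gridDim are 1 in y and z? The thread/block
#  shapes below are arbitrary choices of mine and are not benchmarked above.)
function test_nested_loops_GPU_3d!(
    z_cu::CuArray{Complex{Float32}, 3},
    y_cu::CuArray{Complex{Float32}, 3},
    x_cu::CuArray{Complex{Float32}, 3}
)
    kernel = @cuda launch=false nested_loops_GPU!(z_cu, y_cu, x_cu)
    threads = (8, 8, 4)                 # 256 threads per block, spread over x, y, z
    blocks = cld.(size(z_cu), threads)  # enough blocks to cover the whole array
    CUDA.@sync kernel(z_cu, y_cu, x_cu; threads, blocks)
end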
function main()
    n_rows = 256
    n_cols = 256
    n_slcs = 192
    x = fill(1.0 + 1.0im, n_rows, n_cols, n_slcs)
    y = fill(1.0 + 2.0im, n_rows, n_cols, n_slcs)
    z = zeros(Complex{Float64}, n_rows, n_cols, n_slcs)
    println("run_CPU!")
    @btime run_CPU!(
        $z,
        $y,
        $x,
        $n_rows,
        $n_cols,
        $n_slcs
    )
    println("")
    @show z[1:5, 1, 1]
    println("")
    fill!(z, zero(eltype(z)))
    # CuArrays
    x_cu = CuArray{Complex{Float32}, 3}(x)
    y_cu = CuArray{Complex{Float32}, 3}(y)
    z_cu = CuArray{Complex{Float32}, 3}(z)
    println("run_GPU!")
    @btime run_GPU!(
        $z_cu,
        $y_cu,
        $x_cu
    )
    println("")
    z = Array{Complex{Float64}, 3}(z_cu)
    @show z[1:5, 1, 1]
    println("")
    fill!(z, zero(eltype(z)))
    println("Run GPU kernel: test_nested_loops_GPU!")
    @btime test_nested_loops_GPU!(
        $z_cu,
        $y_cu,
        $x_cu
    )
    println("")
    z = Array{Complex{Float64}, 3}(z_cu)
    @show z[1:5, 1, 1]
    println("")
end
main()