CUDA | nested loops kernel

Hello,

As part of learning to use CUDA in Julia, I wrote the following MWE
to compare the performance of threaded CPU, CUDA array programming, and CUDA kernel programming for a simple element-by-element array multiplication.

As expected, the CUDA array programming code is significantly faster than the threaded CPU code with its three nested loops.

However, the CUDA kernel with three nested loops is, well, significantly slower than the CPU version.

I did not find any examples of nested loops in CUDA kernels while searching the CUDA documentation or the Julia Discourse, so I would like to ask what I’m doing wrong and what the recommended method is.

julia> include("test_CUDA_v1.jl")
run_CPU!
  26.030 ms (62 allocations: 7.84 KiB)

z[1:5, 1, 1] = ComplexF64[-1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im]

run_GPU!
  1.366 ms (119 allocations: 3.72 KiB)

z[1:5, 1, 1] = ComplexF64[-1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im]

Run GPU kernel: test_nested_loops_GPU!
  2.198 s (33 allocations: 592 bytes)

z[1:5, 1, 1] = ComplexF64[-1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im, -1.0 + 3.0im]

# test_CUDA_v1.jl

using BenchmarkTools
using CUDA

function run_CPU!(
            z::Array{Complex{Float64}, 3},
            y::Array{Complex{Float64}, 3},
            x::Array{Complex{Float64}, 3},
            n_rows::Int,
            n_cols::Int,
            n_slcs::Int
        )

    @inbounds Threads.@threads for m in 1:n_slcs 
        for l in 1:n_cols
            for k in 1:n_rows
                z[k, l, m] = y[k, l, m] * x[k, l, m]
            end
        end
    end

    return nothing
end

function run_GPU!(            
            z_cu::CuArray{Complex{Float32}, 3},
            y_cu::CuArray{Complex{Float32}, 3},
            x_cu::CuArray{Complex{Float32}, 3}
        )

    CUDA.@sync @inbounds z_cu .= y_cu .* x_cu

    return nothing
end

# GPU kernel
function nested_loops_GPU!(
            z_cu,
            y_cu,
            x_cu
        )
    # X axis
    index_x = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
    stride_x = gridDim().x * blockDim().x

    # Y axis
    index_y = (blockIdx().y - Int32(1)) * blockDim().y + threadIdx().y
    stride_y = gridDim().y * blockDim().y

    # Z axis
    index_z = (blockIdx().z - Int32(1)) * blockDim().z + threadIdx().z
    stride_z = gridDim().z * blockDim().z

    k = index_z
    while k <= size(z_cu, 3)

        j = index_y
        while j <= size(z_cu, 2)

            i = index_x
            while i <= size(z_cu, 1)
                
                @inbounds z_cu[i, j, k] = y_cu[i, j, k] * x_cu[i, j, k]
                i += stride_x

            end
            j += stride_y

        end
        k += stride_z

    end

    return nothing
end

function test_nested_loops_GPU!(
            z_cu::CuArray{Complex{Float32}, 3},
            y_cu::CuArray{Complex{Float32}, 3},
            x_cu::CuArray{Complex{Float32}, 3}
        )

    kernel =
        @cuda launch=false nested_loops_GPU!(
                                z_cu,
                                y_cu,
                                x_cu
                            )
    config = launch_configuration(kernel.fun)
    threads = min(length(z_cu), config.threads)
    blocks = cld(length(z_cu), threads)

    # println("threads: ", threads)
    # println("blocks: ", blocks)

    CUDA.@sync begin
        kernel(
            z_cu,
            y_cu,
            x_cu;
            threads,
            blocks
        )
    end
end

function main()
    n_rows = 256
    n_cols = 256
    n_slcs = 192

    x = fill(1.0 + 1.0im, n_rows, n_cols, n_slcs)
    y = fill(1.0 + 2.0im, n_rows, n_cols, n_slcs)
    z = zeros(Complex{Float64}, n_rows, n_cols, n_slcs)

    println("run_CPU!")
    @btime run_CPU!(
        $z,
        $y,
        $x,
        $n_rows,
        $n_cols,
        $n_slcs
    )
    println("")

    @show z[1:5, 1, 1]
    println("")

    fill!(z, zero(eltype(z)))

    # CuArrays
    x_cu = CuArray{Complex{Float32}, 3}(x)
    y_cu = CuArray{Complex{Float32}, 3}(y)
    z_cu = CuArray{Complex{Float32}, 3}(z)

    println("run_GPU!")
    @btime run_GPU!(
        $z_cu,
        $y_cu,
        $x_cu
    )
    println("")

    z = Array{Complex{Float64}, 3}(z_cu)

    @show z[1:5, 1, 1]
    println("")

    fill!(z, zero(eltype(z)))

    println("Run GPU kernel: test_nested_loops_GPU!")
    @btime test_nested_loops_GPU!(        
        $z_cu,
        $y_cu,
        $x_cu
    )
    println("")

    z = Array{Complex{Float64}, 3}(z_cu)

    @show z[1:5, 1, 1]
    println("")

end

main()

You’re launching a 1D grid (scalar threads and blocks), so the y and z indices and dimensions will always be 1. Doesn’t that imply that each thread will fully iterate the entire range of k and j values, massively overlapping the work?
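
One way to see (and fix) that would be to launch the same kernel over a 3D grid, so that the y and z indices actually vary across threads. A rough sketch, with an arbitrary (8, 8, 4) block shape:

# Sketch: launch nested_loops_GPU! over a 3D grid so each thread only
# covers a strided subset of every dimension. The (8, 8, 4) block shape
# is an arbitrary example, not a tuned value.
function test_nested_loops_GPU_3d!(z_cu, y_cu, x_cu)
    threads = (8, 8, 4)                    # 8*8*4 = 256 threads per block
    blocks  = cld.(size(z_cu), threads)    # enough blocks to cover the array
    CUDA.@sync @cuda threads=threads blocks=blocks nested_loops_GPU!(z_cu, y_cu, x_cu)
end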

I’d recommend using linear indices (both for the launch and for the index/stride calculation in the kernel). If you really want an ND index, you can convert the linear index to a Cartesian one within the kernel (using CartesianIndices(ndarray)[linear_index]).
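
A rough sketch of that approach (the function names are just placeholders, and the launch code mirrors your test_nested_loops_GPU!):

# Sketch of a linear-index kernel: each thread walks a strided range of
# linear indices; CartesianIndices converts a linear index back to (i, j, k).
function linear_index_GPU!(z_cu, y_cu, x_cu)
    index  = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
    stride = gridDim().x * blockDim().x

    cart = CartesianIndices(z_cu)    # linear -> Cartesian index conversion

    i = index
    while i <= length(z_cu)
        I = cart[i]
        @inbounds z_cu[I] = y_cu[I] * x_cu[I]
        i += stride
    end

    return nothing
end

function test_linear_index_GPU!(z_cu, y_cu, x_cu)
    kernel = @cuda launch=false linear_index_GPU!(z_cu, y_cu, x_cu)
    config = launch_configuration(kernel.fun)
    threads = min(length(z_cu), config.threads)
    blocks = cld(length(z_cu), threads)

    CUDA.@sync kernel(z_cu, y_cu, x_cu; threads, blocks)
end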
