Indexing 2D CuArray

I’m learning to use CUDA.jl based on the tutorial, and trying to write a kernel for 2D array.

using CUDA

function divide0!(y, a, b)
    for j = 1:size(y, 2)
        for i = 1:size(y, 1)
            @inbounds y[i, j] = a[i, j] / b[i]
        end
    end

    return nothing
end

function divide1!(y, a, b)
    idx = threadIdx().x
    idy = threadIdx().y
    strx = blockDim().x
    stry = blockDim().y
    for j = idy:stry:size(y, 2)
        for i = idx:strx:size(y, 1)
            @inbounds y[i, j] = a[i, j] / b[i]
        end
    end

    return nothing
end

function divide2!(y, a, b)
    idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    idy = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    for j = idy
        for i = idx
            @inbounds y[i, j] = a[i, j] / b[i]
        end
    end

    return nothing
end

a = rand(32, 16) |> CuArray
b = rand(32) |> CuArray

y0 = zero(a)
y1 = zero(a)
y2 = zero(a)

@cuda divide0!(y0, a, b)
@cuda threads=32 divide1!(y1, a, b)
@cuda threads=32 blocks=16 divide2!(y2, a, b)

However, I found divide2 doesn’t give a same result as 0 and 1.

julia> y0 == y1
true

julia> y1 == y2
false

Do I make any silly mistakes here?

neither is a range. you could’ve figured it out by printing idx and idy.

With CartesianIndices you can iterate using the linear index and at the same time have access to the indices along each dimension:

function divide2!(y, a, b)
    id = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x

    Nx, Ny = size(y)

    cind = CartesianIndices((Nx, Ny))

    for k=id:stride:Nx*Ny
        i = cind[k][1]
        j = cind[k][2]
        @inbounds y[i, j] = a[i, j] / b[i]
    end

    return nothing
end
1 Like

Following the above codes, this way also works.

function divide2!(y, a, b)
    idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    idy = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    strx = blockDim().x * gridDim().x
    stry = blockDim().y * gridDim().y
    Nx, Ny = size(y)

    for j = idy:stry:Ny
        for i = idx:strx:Nx
            @inbounds y[i, j] = a[i, j] / b[i]
        end
    end

    return nothing
end

Note it here for reference.