I’m learning to use CUDA.jl based on the tutorial, and trying to write a kernel for 2D array.
using CUDA
function divide0!(y, a, b)
for j = 1:size(y, 2)
for i = 1:size(y, 1)
@inbounds y[i, j] = a[i, j] / b[i]
end
end
return nothing
end
function divide1!(y, a, b)
idx = threadIdx().x
idy = threadIdx().y
strx = blockDim().x
stry = blockDim().y
for j = idy:stry:size(y, 2)
for i = idx:strx:size(y, 1)
@inbounds y[i, j] = a[i, j] / b[i]
end
end
return nothing
end
function divide2!(y, a, b)
idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
idy = (blockIdx().y - 1) * blockDim().y + threadIdx().y
for j = idy
for i = idx
@inbounds y[i, j] = a[i, j] / b[i]
end
end
return nothing
end
a = rand(32, 16) |> CuArray
b = rand(32) |> CuArray
y0 = zero(a)
y1 = zero(a)
y2 = zero(a)
@cuda divide0!(y0, a, b)
@cuda threads=32 divide1!(y1, a, b)
@cuda threads=32 blocks=16 divide2!(y2, a, b)
However, I found divide2
doesn’t give a same result as 0
and 1
.
julia> y0 == y1
true
julia> y1 == y2
false
Do I make any silly mistakes here?