"""
    inv!(scale, nthreads)

Launch `inv_kernel!` on `scale` through `@cuda`, using `nthreads` threads per
block and `cld(length(scale), nthreads)` blocks so that the launch provides at
least one thread per element of `scale`.
"""
function inv!(scale, nthreads)
    # Round up so a trailing, partially filled block still gets launched.
    blockcount = cld(length(scale), nthreads)
    @cuda threads=nthreads blocks=blockcount inv_kernel!(scale)
end
"""
    inv_kernel!(scale)

Device kernel that overwrites every element of `scale` with `1 / sqrt(scale[i])`.

Each thread starts at its global 1-based index and advances by the total number
of launched threads (a grid-stride loop), so every element is processed even
when the launch has fewer threads than `length(scale)`.
"""
function inv_kernel!(scale)
    # Global 1-based index of this thread across all blocks.
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # Total number of threads in the launch.
    stride = blockDim().x * gridDim().x
    # The range never exceeds length(scale), so the bounds check can be skipped.
    @inbounds for i = index:stride:length(scale)
        scale[i] = 1 / sqrt(scale[i])
    end
    return nothing
end
Thanks, but these examples do not seem to cover the `groups` keyword (the equivalent of `blocks` in CUDA?). That is, how should I modify your linked example to allow for parallelism?
Here is what I have written. Is that correct / efficient?
"""
    inv!(scale::MtlVector, nthreads::Integer)

Launch `inv_kernel!` on `scale` through `@metal`, using `nthreads` threads per
group and `cld(length(scale), nthreads)` groups so that the launch provides at
least one thread per element of `scale`.
"""
function inv!(scale::MtlVector, nthreads::Integer)
    # Round up so a trailing, partially filled group still gets launched.
    groupcount = cld(length(scale), nthreads)
    @metal threads=nthreads groups=groupcount inv_kernel!(scale)
end
"""
    inv_kernel!(scale)

Device kernel that replaces the element of `scale` at this thread's
`thread_position_in_grid_1d()` with `1 / sqrt` of its current value.

Threads whose position is greater than `length(scale)` do nothing.
"""
function inv_kernel!(scale)
    pos = thread_position_in_grid_1d()
    # The launch may contain more threads than elements; surplus threads exit.
    pos <= length(scale) || return nothing
    root = sqrt(scale[pos])
    scale[pos] = 1 / root
    return nothing
end