I am reading the performance tips (Performance Tips · CUDA.jl)
and can't understand the point of this example, because it seems that the `while` loop
is executed only once per thread.
"""
    gpu_add5!(y, x)

CUDA kernel that adds `x` into `y` element-wise (`y[i] += x[i]`).

Each thread starts at its global 1-based index and then advances by the total
number of launched threads (`gridDim().x * blockDim().x`), i.e. a grid-stride
loop, so every index up to `length(y)` is visited regardless of how many
threads and blocks were launched.

Bounds checks are disabled with `@inbounds`; the caller must make sure `x` has
at least `length(y)` elements.
"""
function gpu_add5!(y, x)
    # Total number of threads in the launch: the distance between two
    # consecutive elements handled by the same thread.
    nthreads_total = gridDim().x * blockDim().x
    # First global index owned by this thread's block (0-based offset).
    block_offset = (blockIdx().x - Int32(1)) * blockDim().x
    pos = block_offset + threadIdx().x
    len = length(y)
    while pos <= len
        @inbounds y[pos] += x[pos]
        pos += nthreads_total
    end
    return nothing
end
"""
    bench_gpu5!(y, x)

Compile `gpu_add5!` for `(y, x)`, choose a launch configuration, launch the
kernel and block until the GPU has finished.

The block size is the thread count suggested by `launch_configuration`, capped
at `length(y)`; the number of blocks is `cld(length(y), threads)`, so the grid
has at least one thread per element of `y`.

An empty `y` returns `nothing` immediately: there is nothing to add, and the
grid-size computation would otherwise evaluate `cld(0, 0)`, which throws a
`DivideError`.
"""
function bench_gpu5!(y, x)
    # Nothing to do for empty input; this also keeps `threads` from becoming 0.
    isempty(y) && return nothing

    # Compile without launching so the occupancy API can be queried first.
    kernel = @cuda launch=false gpu_add5!(y, x)
    config = launch_configuration(kernel.fun)
    threads = min(length(y), config.threads)
    blocks = cld(length(y), threads)

    return CUDA.@sync kernel(y, x; threads, blocks)
end
For example:
using CUDA
"""
    test!(x)

Launch the diagnostic kernel `kernel_test!` over `x` and wait for it to finish.

The kernel is compiled first and `launch_configuration` is queried, but the
suggested thread count is then overridden with a block size of at most 3
threads (presumably so that even a short array is spread over several blocks
and the printed indices are easy to follow). The number of blocks is
`cld(length(x), threads)`, so there is at least one thread per element.

An empty `x` returns `nothing` immediately; otherwise the block count would be
computed as `cld(0, 0)`, which throws a `DivideError`.
"""
function test!(x)
    # Nothing to print for empty input; this also keeps `Tx` from becoming 0.
    isempty(x) && return nothing

    gpukernel = @cuda launch=false kernel_test!(x)
    config = launch_configuration(gpukernel.fun)
    Nx = length(x)
    maxThreads = config.threads
    # Deliberate override of the occupancy-based suggestion for this experiment.
    maxThreads = 3
    Tx = min(maxThreads, Nx)
    Bx = cld(Nx, Tx)
    return CUDA.@sync gpukernel(x; threads = Tx, blocks = Bx)
end
"""
    kernel_test!(x)

Diagnostic CUDA kernel: visits the indices of `x` the way a grid-stride loop
would, but prints the loop state instead of touching the array.

For every visited position it prints the current loop index `i`, the thread's
starting index `index`, and the thread/block coordinates. Only `length(x)` is
read from `x`.
"""
function kernel_test!(x)
    # Total number of launched threads: how far each thread jumps per iteration.
    nthreads_total = gridDim().x * blockDim().x
    # Global 1-based index of this thread within the whole grid.
    index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
    n = length(x)
    i = index
    while i <= n
        @cuprintln "i = $i, index = $index, threadIdx: $(threadIdx().x), blockIdx $(blockIdx().x), blockDim $(blockDim().x)"
        i += nthreads_total
    end
    return
end
# Run the experiment on a 10-element device array (`CUDA.zeros` defaults to Float32).
test!(CUDA.zeros(Float32, 10))
gives:
i = 10, index = 10, threadIdx: 1, blockIdx 4, blockDim 3
i = 7, index = 7, threadIdx: 1, blockIdx 3, blockDim 3
i = 8, index = 8, threadIdx: 2, blockIdx 3, blockDim 3
i = 9, index = 9, threadIdx: 3, blockIdx 3, blockDim 3
i = 1, index = 1, threadIdx: 1, blockIdx 1, blockDim 3
i = 2, index = 2, threadIdx: 2, blockIdx 1, blockDim 3
i = 3, index = 3, threadIdx: 3, blockIdx 1, blockDim 3
i = 4, index = 4, threadIdx: 1, blockIdx 2, blockDim 3
i = 5, index = 5, threadIdx: 2, blockIdx 2, blockDim 3
i = 6, index = 6, threadIdx: 3, blockIdx 2, blockDim 3
`i` is always equal to `index`.
Maybe this was the idea of the example with gpu_add5!:
"""
    test2!(x)

Launch the diagnostic kernel `kernel_test2!` over `x` in a single block and wait
for it to finish.

The kernel is compiled first and `launch_configuration` is queried, but the
suggested thread count is then overridden with a block size of at most 3
threads. Exactly one block is launched, so for arrays longer than the block
size each thread has to loop over several elements.

An empty `x` returns `nothing` immediately; otherwise the launch would be
attempted with `threads = 0`, which is not a valid launch configuration.
"""
function test2!(x)
    # Nothing to print for empty input; this also keeps `Tx` from becoming 0.
    isempty(x) && return nothing

    gpukernel = @cuda launch=false kernel_test2!(x)
    config = launch_configuration(gpukernel.fun)
    Nx = length(x)
    maxThreads = config.threads
    # Deliberate override of the occupancy-based suggestion for this experiment.
    maxThreads = 3
    Tx = min(maxThreads, Nx)
    return CUDA.@sync gpukernel(x; threads = Tx, blocks = 1)
end
"""
    kernel_test2!(x)

Diagnostic CUDA kernel for a single-block launch: each thread starts at its
own `threadIdx().x` and advances by `blockDim().x` until it passes `length(x)`.

For every visited position it prints the current loop index `i`, the thread's
starting index `index`, and the thread/block coordinates. Only `length(x)` is
read from `x`; block indices are not used to compute positions.
"""
function kernel_test2!(x)
    # Jump by the block size: with one block this is the total thread count.
    block_width = blockDim().x
    index = threadIdx().x
    n = length(x)
    i = index
    while i <= n
        @cuprintln "i = $i, index = $index, threadIdx: $(threadIdx().x), blockIdx $(blockIdx().x), blockDim $(blockDim().x)"
        i += block_width
    end
    return
end
# Run the single-block experiment on a 10-element device array (`CUDA.zeros` defaults to Float32).
test2!(CUDA.zeros(Float32, 10))
which gives:
i = 1, index = 1, threadIdx: 1, blockIdx 1, blockDim 3
i = 2, index = 2, threadIdx: 2, blockIdx 1, blockDim 3
i = 3, index = 3, threadIdx: 3, blockIdx 1, blockDim 3
i = 4, index = 1, threadIdx: 1, blockIdx 1, blockDim 3
i = 5, index = 2, threadIdx: 2, blockIdx 1, blockDim 3
i = 6, index = 3, threadIdx: 3, blockIdx 1, blockDim 3
i = 7, index = 1, threadIdx: 1, blockIdx 1, blockDim 3
i = 8, index = 2, threadIdx: 2, blockIdx 1, blockDim 3
i = 9, index = 3, threadIdx: 3, blockIdx 1, blockDim 3
i = 10, index = 1, threadIdx: 1, blockIdx 1, blockDim 3