I have the following code, which has very nice performance on the CPU:
using LoopVectorization: @turbo

function dilation_loop_vec!(out_cpu, inp_cpu, ker_cpu)
    @turbo thread=true for i in eachindex(out_cpu)
        # Sliding max-plus window: out[i] = max_k (inp[i-1+k] + ker[k]).
        val = typemin(eltype(out_cpu))
        for k in eachindex(ker_cpu)
            val = max(val, inp_cpu[i-1+k] + ker_cpu[k])
        end
        out_cpu[i] = val
    end
    return out_cpu
end
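To pin down the semantics: each output element is a max-plus reduction of the kernel over a sliding window of the input, i.e. a 1-D grayscale dilation. An unoptimized plain-Julia reference computing the same thing (dilation_ref is just a helper name for this post):

function dilation_ref(inp, ker)
    n = length(inp) - length(ker) + 1
    return [maximum(inp[i-1+k] + ker[k] for k in eachindex(ker)) for i in 1:n]
end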
How can I implement this efficiently in CUDA.jl? I tried the following:
using CUDA

function dilation_cuda_kernel_naive!(out::AbstractVector,
                                     inp::AbstractVector,
                                     ker::AbstractVector)
    # Standard grid-stride loop: each thread covers output indices
    # index, index + stride, index + 2*stride, ...
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = gridDim().x * blockDim().x
    for iout in index:stride:length(out)
        val = typemin(eltype(out))
        for iker in eachindex(ker)
            # Each thread scans the whole kernel vector in global memory.
            @inbounds val = max(val, inp[iout+iker-1] + ker[iker])
        end
        @inbounds out[iout] = val
    end
    return nothing
end
function dilation_cuda_naive!(out_gpu, inp_gpu, ker_gpu)
    # Let CUDA.jl's occupancy API pick the launch configuration.
    kernel = CUDA.@cuda launch=false dilation_cuda_kernel_naive!(out_gpu, inp_gpu, ker_gpu)
    config = CUDA.launch_configuration(kernel.fun)
    threads = min(length(out_gpu), config.threads)
    blocks = cld(length(out_gpu), threads)
    CUDA.@sync begin
        kernel(out_gpu, inp_gpu, ker_gpu; threads, blocks)
    end
    return out_gpu
end
Here are the results; LoopVectorization blows my naive CUDA version out of the water (roughly 2.7× faster):
T = Float32
# T = UInt16
out_cpu = rand(T, 2^20)
ker_cpu = rand(T, 2^14)
inp_cpu = rand(T, length(out_cpu) + length(ker_cpu) - 1)
out_gpu = CuArray(out_cpu)
ker_gpu = CuArray(ker_cpu)
inp_gpu = CuArray(inp_cpu)
res_gpu = dilation_cuda_naive!(out_gpu, inp_gpu, ker_gpu)
res_cpu = dilation_loop_vec!(out_cpu, inp_cpu, ker_cpu)
@assert res_cpu ≈ Array(res_gpu)
@showtime dilation_cuda_naive!(out_gpu, inp_gpu, ker_gpu)
@showtime dilation_loop_vec!(out_cpu, inp_cpu, ker_cpu)
nothing
dilation_cuda_naive!(out_gpu, inp_gpu, ker_gpu): 0.134039 seconds (60 allocations: 3.516 KiB)
dilation_loop_vec!(out_cpu, inp_cpu, ker_cpu): 0.050131 seconds (3 allocations: 96 bytes)
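For scale: the loop nest performs length(out) * length(ker) = 2^20 * 2^14 = 2^34 ≈ 1.7e10 add/max pairs, so these timings correspond to roughly 340e9 pairs/s for the CPU version but only about 130e9 pairs/s for the naive GPU kernel.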