Can CUDA beat LoopVectorization? / How can I make this CUDA code faster?

I have the following code, which has very nice performance on the CPU:

using LoopVectorization: @turbo

function dilation_loop_vec!(out_cpu, inp_cpu, ker_cpu)
    # 1-D grayscale dilation: out[i] = max over k of inp[i-1+k] + ker[k]
    @turbo thread=true for i in eachindex(out_cpu)
        val = typemin(eltype(out_cpu))
        for k in eachindex(ker_cpu)
            val = max(val, inp_cpu[i-1+k] + ker_cpu[k])
        end
        out_cpu[i] = val
    end
    return out_cpu
end
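
For clarity, the loop computes a 1-D grayscale dilation (a "max-plus" convolution): each output element is the maximum of a kernel-length window of the input, offset by the kernel values. A plain, unoptimized reference sketch (`dilation_reference` is just an illustrative name, not part of my benchmark):

function dilation_reference(inp, ker)
    out = similar(inp, length(inp) - length(ker) + 1)
    for i in eachindex(out)
        # maximum of inp over a length(ker) window, shifted by ker
        out[i] = maximum(inp[i + k - 1] + ker[k] for k in eachindex(ker))
    end
    return out
end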

How can I implement this efficiently in CUDA.jl? I tried the following:

using CUDA

function dilation_cuda_kernel_naive!(
        out::AbstractVector,
        inp::AbstractVector,
        ker::AbstractVector)
    # grid-stride loop: each thread computes every `stride`-th output element
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = gridDim().x * blockDim().x
    for iout = index:stride:length(out)
        val = typemin(eltype(out))
        for iker in eachindex(ker)
            @inbounds val = max(val, inp[iout + iker - 1] + ker[iker])
        end
        @inbounds out[iout] = val
    end
    return nothing
end

function dilation_cuda_naive!(out_gpu, inp_gpu, ker_gpu)
    # let the occupancy API pick a launch configuration
    kernel = CUDA.@cuda launch=false dilation_cuda_kernel_naive!(out_gpu, inp_gpu, ker_gpu)
    config = CUDA.launch_configuration(kernel.fun)
    threads = min(length(out_gpu), config.threads)
    blocks = cld(length(out_gpu), threads)
    CUDA.@sync kernel(out_gpu, inp_gpu, ker_gpu; threads, blocks)
    return out_gpu
end
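
For context, one standard CUDA trick would be to stage chunks of `ker` in shared memory so all threads of a block reuse them instead of each hitting global memory. A minimal, unbenchmarked sketch of that idea (the kernel name, the 256-element chunk size, and the fixed launch shape are my assumptions, not tuned values):

function dilation_cuda_kernel_tiled!(out, inp, ker)
    T = eltype(out)
    cache = CuStaticSharedArray(T, 256)  # chunk of ker, shared by the block
    tid = threadIdx().x
    i = (blockIdx().x - 1) * blockDim().x + tid  # one output element per thread
    val = typemin(T)
    active = i <= length(out)
    for base in 0:256:length(ker)-1
        n = min(256, length(ker) - base)
        # cooperatively load ker[base+1:base+n] into shared memory
        for j in tid:blockDim().x:n
            @inbounds cache[j] = ker[base + j]
        end
        sync_threads()
        if active
            for k in 1:n
                @inbounds val = max(val, inp[i + base + k - 1] + cache[k])
            end
        end
        sync_threads()  # don't refill the cache while others still read it
    end
    if active
        @inbounds out[i] = val
    end
    return nothing
end

function dilation_cuda_tiled!(out_gpu, inp_gpu, ker_gpu)
    threads = 256
    blocks = cld(length(out_gpu), threads)
    CUDA.@sync CUDA.@cuda threads=threads blocks=blocks dilation_cuda_kernel_tiled!(out_gpu, inp_gpu, ker_gpu)
    return out_gpu
end

I have not verified that this helps here; since every thread in a warp reads the same `ker` element, the constant/L1 cache may already serve those reads well.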

Here are the results; LoopVectorization blows my naive CUDA version out of the water.

T = Float32
# T = UInt16
out_cpu = rand(T, 2^20)
ker_cpu = rand(T, 2^14)
inp_cpu = rand(T, length(out_cpu) + length(ker_cpu) - 1)  # padded so every window stays in bounds
out_gpu = CuArray(out_cpu)
ker_gpu = CuArray(ker_cpu)
inp_gpu = CuArray(inp_cpu)

res_gpu = dilation_cuda_naive!(out_gpu, inp_gpu, ker_gpu)
res_cpu = dilation_loop_vec!(out_cpu, inp_cpu, ker_cpu)
@assert res_cpu ≈ Array(res_gpu)
@showtime dilation_cuda_naive!(out_gpu, inp_gpu, ker_gpu)
@showtime dilation_loop_vec!(out_cpu, inp_cpu, ker_cpu)
nothing
dilation_cuda_naive!(out_gpu, inp_gpu, ker_gpu): 0.134039 seconds (60 allocations: 3.516 KiB)
dilation_loop_vec!(out_cpu, inp_cpu, ker_cpu): 0.050131 seconds (3 allocations: 96 bytes)
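
Timing note: `@showtime` reports a single run, so the GPU number can include one-off overhead. Averaging over many runs with BenchmarkTools would look like this; a sketch, assuming BenchmarkTools is installed (no extra `CUDA.@sync` is needed since `dilation_cuda_naive!` synchronizes internally):

using BenchmarkTools
@btime dilation_cuda_naive!($out_gpu, $inp_gpu, $ker_gpu)
@btime dilation_loop_vec!($out_cpu, $inp_cpu, $ker_cpu)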