Hello,

For a simple element-wise multiplication, as in the example below, I see that the kernel written with KernelAbstractions is almost twice as slow as the one written with pure CUDA. Am I doing something wrong, or does KA indeed introduce such a big overhead?

```
using BenchmarkTools
using CUDA
using CUDAKernels
using KernelAbstractions
# Kernel Abstractions ----------------------------------------------------------
# KernelAbstractions kernel: one work-item per element, C = A .* B.
@kernel function mulcab_ka_kernel(C, A, B)
    idx = @index(Global)
    C[idx] = A[idx] * B[idx]
end
# Instantiate the KA kernel for `device`, launch it over all of C,
# and block until the returned event has completed.
function mulcab_ka(device, C, A, B)
    k = mulcab_ka_kernel(device)
    ev = k(C, A, B, ndrange=size(C))
    wait(ev)
    return nothing
end
# CUDA -------------------------------------------------------------------------
# Hand-written CUDA device kernel: each thread walks the array with a
# stride of (total launched threads), computing C[i] = A[i] * B[i].
function mulcab_cuda_kernel(C, A, B)
    tid = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    nthreads = blockDim().x * gridDim().x
    i = tid
    while i <= length(C)
        C[i] = A[i] * B[i]
        i += nthreads
    end
    return nothing
end
# Compile the CUDA kernel without launching, query an occupancy-based
# launch configuration, then run it and synchronize before returning.
function mulcab_cuda(C, A, B)
    n = length(C)
    k = @cuda launch=false mulcab_cuda_kernel(C, A, B)
    cfg = launch_configuration(k.fun)
    nthreads = min(n, cfg.threads)
    nblocks = cld(n, nthreads)
    CUDA.@sync k(C, A, B; threads=nthreads, blocks=nblocks)
    return nothing
end
# Test -------------------------------------------------------------------------
# Device arrays of Float32 (CUDA.ones/zeros default): A filled with 2, B with 3.
A = CUDA.ones(1024, 1024) * 2
B = CUDA.ones(1024, 1024) * 3
C = CUDA.zeros(1024, 1024)
# NOTE(review): mulcab_ka re-instantiates the kernel (mulcab_ka_kernel(device))
# on every call inside @btime, so part of the measured gap is presumably launch
# bookkeeping rather than device time — worth hoisting to confirm.
@btime mulcab_ka(CUDADevice(), $C, $A, $B)
@assert all(C .== 6)
@btime mulcab_cuda($C, $A, $B)
@assert all(C .== 6)
# Observed timings (first: KA version, second: pure CUDA version):
# 99.187 μs (96 allocations: 4.56 KiB)
# 52.817 μs (5 allocations: 304 bytes)
```