I have a kernel I wrote using CUDA.jl. It gets the correct answer, but is half as fast as the equivalent c++/cuda kernel I coded up. I was hoping someone could look at it and see if there are any optimizations to be done. Is it even resonable to assume similar performance to a c++ cuda kernel?

```
@views function mykernel(
v0_range::CUDA.CuDeviceArray{ComplexF32,3,1}, # (1024,500,500)
tr_array::CUDA.CuDeviceArray{Float32,3,1},# (3, 500, 500)
voxel_list::CUDA.CuDeviceArray{Float32,2,1}, # (3, 125000)
v1::CUDA.CuDeviceArray{ComplexF32,1}, # (125000)
kc::Float32,
range_res::Float32,
range_max::Float32,
bwinx::Bool,
winx::CUDA.CuDeviceArray{Float32,1}, #(500)
bwiny::Bool,
winy::CUDA.CuDeviceArray{Float32,1}) # (500)
@inbounds begin
index = (blockIdx().x - 1CUDA.i32) * blockDim().x + threadIdx().x
stride = gridDim().x * blockDim().x
for i = index:stride:size(voxel_list, 2)
tempsum = ComplexF32(0)
# integrate along trajectories in the range domain
for tr_yi = 1:size(tr_array, 3)
for tr_xi = 1:size(tr_array, 2)
weight = 1.0f0
if bwinx
weight *= winx[tr_xi]
end
if bwiny
weight *= winy[tr_yi]
end
p1 = voxel_list[:, i]
p2 = tr_array[:, tr_xi, tr_yi]
r_theory = sqrt((p1[1] - p2[1])^2 + (p1[2] - p2[2])^2 + (p1[3] - p2[3])^2)
if r_theory < range_max
range_bin_float = r_theory / range_res + 1
range_bin_int = trunc(Int, range_bin_float)
delta = range_bin_float - range_bin_int
pp0 = v0_range[range_bin_int, tr_xi, tr_yi] * weight
pp1 = v0_range[range_bin_int+1, tr_xi, tr_yi] * weight
interp_range_respose = pp0 * (1.0f0 - delta) + pp1 * delta
temp = interp_range_respose * exp(+2.0f0im * kc * r_theory)
tempsum += temp
end
end
end
v1[i] = tempsum
end
end
return
end
```