CUDA.jl kernel is half as fast as C++ kernel

Thanks for your help.

I got it down to about 30% slower than the C++ kernel, which is much better. Perhaps with some more work I could get it about equal.

I think there is a bug with cis and sincos. Looking at the LLVM IR, there are some conversions to and from double that were slowing things down. Calculating cos and sin independently and then making the result complex fixed the issue.
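The change itself is tiny. As a sketch (the helper names here are mine, just for illustration), it is the difference between these two ways of building the phase factor:

# Both compute exp(im * phase), but in device code the cis/sincos path
# pulled in double <-> float conversions, while the explicit Float32
# cos/sin pair stays in single precision.
phase_factor_cis(phase::Float32)    = cis(phase)                       # slow path on the device
phase_factor_sincos(phase::Float32) = complex(cos(phase), sin(phase))  # stays in Float32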

Here is the IR from cis in my kernel:

__internal_trig_reduction_slowpath.exit.i.i.i:    ; preds = %164, %153
     %hi.i.i.i.1.i = phi i32 [ %169, %164 ], [ %159, %153 ]
     %lo.i.i.i.0.i = phi i32 [ %175, %164 ], [ %163, %153 ]
     %176 = lshr i32 %hi.i.i.i.1.i, 30
     %177 = call i32 @llvm.fshl.i32(i32 %hi.i.i.i.1.i, i32 %lo.i.i.i.0.i, i32 2) #4
     %178 = shl i32 %lo.i.i.i.0.i, 2
     %179 = lshr i32 %177, 31
     %180 = add nuw nsw i32 %179, %176
     %.not22.i = icmp eq i32 %154, 0
     %181 = sub nsw i32 0, %180
     %spec.select.i = select i1 %.not22.i, i32 %180, i32 %181
     %.not23.i = icmp sgt i32 %177, -1
     %182 = xor i32 %154, -2147483648
     %s.i.i.i.0.i = select i1 %.not23.i, i32 %154, i32 %182
     %not..not23.i = xor i1 %.not23.i, true
     %183 = sext i1 %not..not23.i to i32
     %hi.i.i.i.2.i = xor i32 %177, %183
     %lo.i.i.i.1.i = xor i32 %178, %183
     %184 = zext i32 %hi.i.i.i.2.i to i64
     %185 = shl nuw i64 %184, 32
     %186 = zext i32 %lo.i.i.i.1.i to i64
     %187 = or i64 %185, %186
     %188 = sitofp i64 %187 to double
     %189 = fmul double %188, 0x3BF921FB54442D19
     %190 = fptrunc double %189 to float
     %.not25.i = icmp eq i32 %s.i.i.i.0.i, 0
     %191 = fneg float %190
     %spec.select1.i = select i1 %.not25.i, float %190, float %191
     br label %__internal_trig_reduction_kernel.exit.i.i
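For anyone who wants to reproduce a dump like this, one way is to wrap the launch in @device_code_llvm; here is a sketch with a toy kernel (not my actual setup):

using CUDA

# Toy kernel whose only interesting call is cis; with launch=false the kernel
# is compiled (and its IR printed) without actually running on the GPU.
function cis_demo!(out, x)
    i = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
    if i <= length(out)
        @inbounds out[i] = cis(x[i])   # this call drags in the double-precision trig reduction
    end
    return
end

x   = CUDA.rand(Float32, 1024)
out = CUDA.zeros(ComplexF32, 1024)
@device_code_llvm @cuda launch=false cis_demo!(out, x)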

My final kernel is:

@fastmath @views @muladd function mykernel(
    v0_range::CUDA.CuDeviceArray{ComplexF32,3,1},
    tr_array::CUDA.CuDeviceArray{Float32,3,1},
    voxel_list::CUDA.CuDeviceArray{Float32,2,1},
    v1::CUDA.CuDeviceArray{ComplexF32,1},
    kc::Float32,
    range_res::Float32,
    range_max::Float32,
    bwinx::Bool,
    winx::CUDA.CuDeviceArray{Float32,1},
    bwiny::Bool,
    winy::CUDA.CuDeviceArray{Float32,1})
    @inbounds begin
        # Grid-stride loop over voxels; Int32 indices avoid promotion to Int64 on the device.
        index = (blockIdx().x - 1CUDA.i32) * blockDim().x + threadIdx().x
        stride = gridDim().x * blockDim().x
        for i = map(Int32, index:stride:size(voxel_list, 2))
            tempsum = ComplexF32(0)
            for tr_yi = map(Int32, 1:size(tr_array, 3))
                for tr_xi = map(Int32, 1:size(tr_array, 2))
                    # Optional window weights along x and y.
                    weight = 1.0f0
                    if bwinx
                        weight *= winx[tr_xi]
                    end
                    if bwiny
                        weight *= winy[tr_yi]
                    end
                    p1 = voxel_list[:, i]
                    p2 = tr_array[:, tr_xi, tr_yi]
                    # Euclidean distance between the voxel and this transceiver element.
                    r_theory = sqrt((p1[1] - p2[1])^2 + (p1[2] - p2[2])^2 + (p1[3] - p2[3])^2)
                    if r_theory < range_max
                        # Linear interpolation between the two neighbouring range bins.
                        range_bin_float = r_theory / range_res + 1CUDA.i32
                        range_bin_int = unsafe_trunc(Int32, range_bin_float)
                        delta = range_bin_float - range_bin_int
                        pp0 = v0_range[range_bin_int, tr_xi, tr_yi] * weight
                        pp1 = v0_range[range_bin_int+1CUDA.i32, tr_xi, tr_yi] * weight
                        interp_range_response = pp0 * (1.0f0 - delta) + pp1 * delta
                        # Phase factor exp(im * 2 * kc * r_theory): cos and sin computed
                        # separately instead of cis to avoid the double-precision slow path.
                        a = cos(2.0f0 * kc * r_theory)
                        b = sin(2.0f0 * kc * r_theory)
                        temp = interp_range_response * complex(a, b)
                        tempsum = tempsum + temp
                    end
                end
            end
            v1[i] = tempsum
        end
    end
    return
end
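For reference, here is a sketch of a host-side launch. The array sizes, kc, and range settings below are made-up placeholders, not my real data; only the launch pattern matters.

using CUDA

nbins, nx, ny, nvox = 512, 64, 64, 100_000               # illustrative sizes only

v0_range   = CuArray(rand(ComplexF32, nbins, nx, ny))    # per-channel range profiles
tr_array   = CUDA.rand(Float32, 3, nx, ny)               # transceiver element positions (x, y, z)
voxel_list = CUDA.rand(Float32, 3, nvox)                 # voxel positions (x, y, z)
v1         = CUDA.zeros(ComplexF32, nvox)                # output: one complex value per voxel
winx       = CUDA.ones(Float32, nx)                      # window weights along x
winy       = CUDA.ones(Float32, ny)                      # window weights along y

kc        = 2.0f0 * Float32(π) / 0.0039f0                # placeholder wavenumber
range_res = 0.05f0                                       # placeholder range resolution
range_max = Float32(nbins - 1) * range_res               # keeps range_bin_int + 1 in bounds

threads = 256
blocks  = cld(nvox, threads)
@cuda threads=threads blocks=blocks mykernel(
    v0_range, tr_array, voxel_list, v1,
    kc, range_res, range_max,
    true, winx, true, winy)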