Thanks for your help.
I got it down to about 30% slower than the C++ kernel, which is much better. Perhaps with some more work I could get it roughly equal.
I think there is a bug with cis and sincos: looking at the LLVM IR, there were some conversions to and from double that were slowing things down. Calculating cos and sin independently and then making the result complex fixed the issue.
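The workaround is essentially this (a minimal sketch; cis32 is just a name I made up for the helper):

# Hypothetical helper: build exp(im*phase) from separate cos/sin calls,
# so the computation stays entirely in Float32 instead of round-tripping
# through double like the cis code path did.
cis32(phase::Float32) = complex(cos(phase), sin(phase))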
Here is the IR from cis in my kernel:
__internal_trig_reduction_slowpath.exit.i.i.i: ; preds = %164, %153
%hi.i.i.i.1.i = phi i32 [ %169, %164 ], [ %159, %153 ]
%lo.i.i.i.0.i = phi i32 [ %175, %164 ], [ %163, %153 ]
%176 = lshr i32 %hi.i.i.i.1.i, 30
%177 = call i32 @llvm.fshl.i32(i32 %hi.i.i.i.1.i, i32 %lo.i.i.i.0.i, i32 2) #4
%178 = shl i32 %lo.i.i.i.0.i, 2
%179 = lshr i32 %177, 31
%180 = add nuw nsw i32 %179, %176
%.not22.i = icmp eq i32 %154, 0
%181 = sub nsw i32 0, %180
%spec.select.i = select i1 %.not22.i, i32 %180, i32 %181
%.not23.i = icmp sgt i32 %177, -1
%182 = xor i32 %154, -2147483648
%s.i.i.i.0.i = select i1 %.not23.i, i32 %154, i32 %182
%not..not23.i = xor i1 %.not23.i, true
%183 = sext i1 %not..not23.i to i32
%hi.i.i.i.2.i = xor i32 %177, %183
%lo.i.i.i.1.i = xor i32 %178, %183
%184 = zext i32 %hi.i.i.i.2.i to i64
%185 = shl nuw i64 %184, 32
%186 = zext i32 %lo.i.i.i.1.i to i64
%187 = or i64 %185, %186
%188 = sitofp i64 %187 to double
%189 = fmul double %188, 0x3BF921FB54442D19
%190 = fptrunc double %189 to float
%.not25.i = icmp eq i32 %s.i.i.i.0.i, 0
%191 = fneg float %190
%spec.select1.i = select i1 %.not25.i, float %190, float %191
br label %__internal_trig_reduction_kernel.exit.i.i
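(In case anyone wants to reproduce this: the IR came from CUDA.jl's @device_code_llvm reflection macro. A sketch, assuming the same argument variables as in the launch example at the end of this post:)

using CUDA

# Compile the kernel and print its optimized LLVM IR; launch=false
# compiles without actually running anything on the device.
@device_code_llvm @cuda launch=false mykernel(
    v0_range, tr_array, voxel_list, v1,
    kc, range_res, range_max,
    bwinx, winx, bwiny, winy)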
My final kernel is:
@fastmath @views @muladd function mykernel(
    v0_range::CUDA.CuDeviceArray{ComplexF32,3,1},
    tr_array::CUDA.CuDeviceArray{Float32,3,1},
    voxel_list::CUDA.CuDeviceArray{Float32,2,1},
    v1::CUDA.CuDeviceArray{ComplexF32,1},
    kc::Float32,
    range_res::Float32,
    range_max::Float32,
    bwinx::Bool,
    winx::CUDA.CuDeviceArray{Float32,1},
    bwiny::Bool,
    winy::CUDA.CuDeviceArray{Float32,1})
    @inbounds begin
        # Grid-stride loop over the voxel list, keeping all indices in Int32.
        index = (blockIdx().x - 1CUDA.i32) * blockDim().x + threadIdx().x
        stride = gridDim().x * blockDim().x
        for i = map(Int32, index:stride:size(voxel_list, 2))
            tempsum = ComplexF32(0)
            for tr_yi = map(Int32, 1:size(tr_array, 3))
                for tr_xi = map(Int32, 1:size(tr_array, 2))
                    # Optional separable window weights in x and y.
                    weight = 1.0f0
                    if bwinx
                        weight *= winx[tr_xi]
                    end
                    if bwiny
                        weight *= winy[tr_yi]
                    end
                    # Distance from this voxel to the current array element.
                    p1 = voxel_list[:, i]
                    p2 = tr_array[:, tr_xi, tr_yi]
                    r_theory = sqrt((p1[1] - p2[1])^2 + (p1[2] - p2[2])^2 + (p1[3] - p2[3])^2)
                    if r_theory < range_max
                        # Linear interpolation between the two neighboring range bins.
                        range_bin_float = r_theory / range_res + 1CUDA.i32
                        range_bin_int = unsafe_trunc(Int32, range_bin_float)
                        delta = range_bin_float - range_bin_int
                        pp0 = v0_range[range_bin_int, tr_xi, tr_yi] * weight
                        pp1 = v0_range[range_bin_int+1CUDA.i32, tr_xi, tr_yi] * weight
                        interp_range_response = pp0 * (1.0f0 - delta) + pp1 * delta
                        # Phase factor via separate cos/sin to avoid the slow cis path.
                        a = cos(2.0f0 * kc * r_theory)
                        b = sin(2.0f0 * kc * r_theory)
                        temp = interp_range_response * complex(a, b)
                        tempsum = tempsum + temp
                    end
                end
            end
            v1[i] = tempsum
        end
    end
    return
end
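And for completeness, roughly how it gets called. This is a sketch with made-up sizes and parameters (nbins, nx, ny, nvox, kc, range_res, range_max are all placeholders), just to show the launch:

using CUDA, MuladdMacro  # MuladdMacro provides the @muladd used in the kernel

# Placeholder problem sizes and parameters.
nbins, nx, ny, nvox = 512, 32, 32, 100_000
kc, range_res, range_max = 1.0f0, 0.1f0, 10.0f0
bwinx = bwiny = true

v0_range   = CuArray(rand(ComplexF32, nbins, nx, ny))  # per-bin range responses
tr_array   = CUDA.rand(Float32, 3, nx, ny)             # element positions
voxel_list = CUDA.rand(Float32, 3, nvox)               # voxel positions
v1         = CUDA.zeros(ComplexF32, nvox)              # output
winx       = CUDA.ones(Float32, nx)
winy       = CUDA.ones(Float32, ny)

threads = 256
blocks  = cld(nvox, threads)
# CUDA.@elapsed wraps the launch in CUDA events and returns seconds.
t = CUDA.@elapsed @cuda threads=threads blocks=blocks mykernel(
    v0_range, tr_array, voxel_list, v1,
    kc, range_res, range_max,
    bwinx, winx, bwiny, winy)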