Sorry for the delayed response. Here is an example, although probably not minimal, demonstrating, at least for me, that complex values are not copied to global memory. All values in the final array should be 1. I’ve resorted storing the real and imaginary parts as a floating point array and then converting them to complex values on the host.
– Paul
using CUDA
function reduce!(gtemp)
temp = @cuStaticSharedMem(Float64, 512)
b::UInt32 = blockIdx().x
t::UInt32 = threadIdx().x
j1::UInt32 = (b - 1)*(2*blockDim().x) + t
j2::UInt32 = (b - 1)*(2*blockDim().x) + t + blockDim().x
temp[t] = gtemp[j1] + gtemp[j2]
sync_threads()
# Sum (reduce) observations in each block
s::UInt32 = div(blockDim().x, 2)
while s > 32
if t <= s
temp[t] += temp[t+s]
end
s >>= 1
sync_threads()
end
temp[t] += temp[t+32]
temp[t] += temp[t+16]
temp[t] += temp[t+ 8]
temp[t] += temp[t+ 4]
temp[t] += temp[t+ 2]
temp[t] += temp[t+ 1]
if t == 1
gtemp[b] = temp[t]
end
return
end
function reduceA!(gtemp)
# temp = @cuDynamicSharedMem(Float64, blockDim().x)
temp = @cuStaticSharedMem(Float64, 512)
b::UInt32 = blockIdx().x
t::UInt32 = threadIdx().x
j1::UInt32 = (b - 1)*(2*blockDim().x) + t
j2::UInt32 = (b - 1)*(2*blockDim().x) + t + blockDim().x
temp[t] = gtemp[j1] + gtemp[j2]
sync_threads()
# Sum (reduce) observations in each block
s::UInt32 = div(blockDim().x, 2)
while s > 32
if t <= s
temp[t] += temp[t+s]
end
s >>= 1
sync_threads()
end
if blockDim().x > 32 && t <= 32
temp[t] += temp[t+32]
end
if blockDim().x > 16 && t <= 16
temp[t] += temp[t+16]
end
if blockDim().x > 8 && t <= 8
temp[t] += temp[t+ 8]
end
if blockDim().x > 4 && t <= 4
temp[t] += temp[t+ 4]
end
if blockDim().x > 2 && t <= 2
temp[t] += temp[t+ 2]
end
if blockDim().x > 1 && t <= 1
temp[t] += temp[t+ 1]
end
if t == 1
gtemp[b] = temp[t]
end
return
end
function childAdd1!(gtemp, data1) #, data2, data3)
b::UInt32 = blockIdx().x
t::UInt32 = threadIdx().x
j1::UInt32 = (b - 1)*(2*blockDim().x) + t
j2::UInt32 = (b - 1)*(2*blockDim().x) + t + blockDim().x
j3::UInt32 = (b - 1)*blockDim().x + t
gtemp[j3] = data1[j1] + data1[j2]
return
end
function parentKern!(ctemp, gtemp, data1, data2, data3)
x = (blockIdx().x - 1)*blockDim().x + threadIdx().x
y = (blockIdx().y - 1)*blockDim().y + threadIdx().y
threads::UInt32 = 256
T1::UInt32 = threads
B1::UInt32 = div(length(data1), 2*256)
T2::UInt32 = threads
B2::UInt32 = div(length(data1), 4*256)
temp1 = view(gtemp, :, 1, x, y)
@cuda dynamic=true threads=T1 blocks=B1 childAdd1!(temp1, data1)
@cuda dynamic=true threads=T2 blocks=B2 reduce!(temp1)
@cuda dynamic=true threads=div(B2,2) blocks=1 reduceA!(view(temp1,1:B2))
temp2 = view(gtemp, :, 2, x, y)
@cuda dynamic=true threads=T1 blocks=B1 childAdd1!(temp2, data2)
@cuda dynamic=true threads=T2 blocks=B2 reduce!(temp2)
@cuda dynamic=true threads=div(B2,2) blocks=1 reduceA!(view(temp2,1:B2))
temp3 = view(gtemp, :, 3, x, y)
@cuda dynamic=true threads=T1 blocks=B1 childAdd1!(temp3, data3)
@cuda dynamic=true threads=T2 blocks=B2 reduce!(temp3)
@cuda dynamic=true threads=div(B2,2) blocks=1 reduceA!(view(temp3,1:B2))
ctemp = (temp1[1] + temp2[1]*im)/temp3[1]
return
end
ctemp = CuArray(zeros(Complex{Float64}, 4, 8))
gtemp = CuArray(zeros(Float64, 81024, 3, 4, 8))
data1 = CuArray(ones(Float64, 161024))
data2 = CuArray(ones(Float64, 161024))
data3 = CuArray(ones(Float64, 161024))
@cuda blocks=(1,1) threads=(4,8) parentKern!(ctemp, gtemp, data1, data2, data3)
println(Array(gtemp[1,:,:,:]))
println(Array(ctemp))