Sorry for the delayed response. Here is an example, although probably not minimal, demonstrating, at least for me, that complex values are not copied to global memory. All values in the final array should be 1. I’ve resorted storing the real and imaginary parts as a floating point array and then converting them to complex values on the host.

– Paul

using CUDA

function reduce!(gtemp)

temp = @cuStaticSharedMem(Float64, 512)

```
b::UInt32 = blockIdx().x
t::UInt32 = threadIdx().x
j1::UInt32 = (b - 1)*(2*blockDim().x) + t
j2::UInt32 = (b - 1)*(2*blockDim().x) + t + blockDim().x
temp[t] = gtemp[j1] + gtemp[j2]
sync_threads()
# Sum (reduce) observations in each block
s::UInt32 = div(blockDim().x, 2)
while s > 32
if t <= s
temp[t] += temp[t+s]
end
s >>= 1
sync_threads()
end
temp[t] += temp[t+32]
temp[t] += temp[t+16]
temp[t] += temp[t+ 8]
temp[t] += temp[t+ 4]
temp[t] += temp[t+ 2]
temp[t] += temp[t+ 1]
if t == 1
gtemp[b] = temp[t]
end
return
```

end

function reduceA!(gtemp)

# temp = @cuDynamicSharedMem(Float64, blockDim().x)

temp = @cuStaticSharedMem(Float64, 512)

```
b::UInt32 = blockIdx().x
t::UInt32 = threadIdx().x
j1::UInt32 = (b - 1)*(2*blockDim().x) + t
j2::UInt32 = (b - 1)*(2*blockDim().x) + t + blockDim().x
temp[t] = gtemp[j1] + gtemp[j2]
sync_threads()
# Sum (reduce) observations in each block
s::UInt32 = div(blockDim().x, 2)
while s > 32
if t <= s
temp[t] += temp[t+s]
end
s >>= 1
sync_threads()
end
if blockDim().x > 32 && t <= 32
temp[t] += temp[t+32]
end
if blockDim().x > 16 && t <= 16
temp[t] += temp[t+16]
end
if blockDim().x > 8 && t <= 8
temp[t] += temp[t+ 8]
end
if blockDim().x > 4 && t <= 4
temp[t] += temp[t+ 4]
end
if blockDim().x > 2 && t <= 2
temp[t] += temp[t+ 2]
end
if blockDim().x > 1 && t <= 1
temp[t] += temp[t+ 1]
end
if t == 1
gtemp[b] = temp[t]
end
return
```

end

function childAdd1!(gtemp, data1) #, data2, data3)

```
b::UInt32 = blockIdx().x
t::UInt32 = threadIdx().x
j1::UInt32 = (b - 1)*(2*blockDim().x) + t
j2::UInt32 = (b - 1)*(2*blockDim().x) + t + blockDim().x
j3::UInt32 = (b - 1)*blockDim().x + t
gtemp[j3] = data1[j1] + data1[j2]
return
```

end

function parentKern!(ctemp, gtemp, data1, data2, data3)

```
x = (blockIdx().x - 1)*blockDim().x + threadIdx().x
y = (blockIdx().y - 1)*blockDim().y + threadIdx().y
threads::UInt32 = 256
T1::UInt32 = threads
B1::UInt32 = div(length(data1), 2*256)
T2::UInt32 = threads
B2::UInt32 = div(length(data1), 4*256)
temp1 = view(gtemp, :, 1, x, y)
@cuda dynamic=true threads=T1 blocks=B1 childAdd1!(temp1, data1)
@cuda dynamic=true threads=T2 blocks=B2 reduce!(temp1)
@cuda dynamic=true threads=div(B2,2) blocks=1 reduceA!(view(temp1,1:B2))
temp2 = view(gtemp, :, 2, x, y)
@cuda dynamic=true threads=T1 blocks=B1 childAdd1!(temp2, data2)
@cuda dynamic=true threads=T2 blocks=B2 reduce!(temp2)
@cuda dynamic=true threads=div(B2,2) blocks=1 reduceA!(view(temp2,1:B2))
temp3 = view(gtemp, :, 3, x, y)
@cuda dynamic=true threads=T1 blocks=B1 childAdd1!(temp3, data3)
@cuda dynamic=true threads=T2 blocks=B2 reduce!(temp3)
@cuda dynamic=true threads=div(B2,2) blocks=1 reduceA!(view(temp3,1:B2))
ctemp = (temp1[1] + temp2[1]*im)/temp3[1]
return
```

end

ctemp = CuArray(zeros(Complex{Float64}, 4, 8))

gtemp = CuArray(zeros(Float64, 8*1024, 3, 4, 8))*

data1 = CuArray(ones(Float64, 161024))

data2 = CuArray(ones(Float64, 16*1024))*

data3 = CuArray(ones(Float64, 161024))

@cuda blocks=(1,1) threads=(4,8) parentKern!(ctemp, gtemp, data1, data2, data3)

println(Array(gtemp[1,:,:,:]))

println(Array(ctemp))