Complex vector math performance: StructArray of CuArrays vs CuArrays

Hi there! I’m benchmarking a GPU implementation of a function performing downconversion of a signal given a carrier signal replica. The original form of the function in the library looks like this:

function downconvert!(
    downconverted_signal::StructArray,
    signal::StructArray,
    carrier_replica::StructArray,
    start_sample::Integer,
    num_samples_left::Integer
)
    # Down-convert `signal` by multiplying it with the complex conjugate of
    # the carrier replica, writing into `downconverted_signal` over the
    # sample window [start_sample, start_sample + num_samples_left - 1]:
    #   (re + im*i) * (cre - cim*i) = (re*cre + im*cim) + (im*cre - re*cim)*i
    # BUG FIX: the loop body referenced `carrier`, which is not defined here —
    # the parameter is named `carrier_replica`.
    for i = start_sample:num_samples_left + start_sample - 1
        downconverted_signal.re[i] = signal.re[i] * carrier_replica.re[i] + signal.im[i] * carrier_replica.im[i]
        downconverted_signal.im[i] = signal.im[i] * carrier_replica.re[i] - signal.re[i] * carrier_replica.im[i]
    end
    return downconverted_signal
end

I’m trying out CuArrays to potentially better the performance. It doesn’t matter much if the downconversion gets performed in a specific range in the signal. Consider the for loop irrelevant. This is what I came up with:

function downconvert2!(
    downconverted_signal::CuArray{Complex{Float32}},
    signal::CuArray{Complex{Float32}},
    carrier::CuArray{Complex{Float32}},
    start_sample::Integer,
    num_samples_left::Integer
)
    # BUG FIX: the original split the broadcast over two lines with the `+`
    # at the start of the second line. Julia ends a statement at the end of a
    # syntactically complete line, so the `1im * (...)` term was parsed as a
    # SEPARATE expression and the imaginary part never reached the assignment.
    # `signal * conj(carrier)` is the same math —
    #   (re*cre + im*cim) + (im*cre - re*cim)*i —
    # kept in one fused broadcast kernel.
    # (start_sample / num_samples_left are unused; they are retained for
    # interface parity with the CPU method.)
    @. downconverted_signal = signal * conj(carrier)
    return downconverted_signal
end

julia> @benchmark downconvert2!(gpu_downconverted_signal, gpu_signal, gpu_carrier, 1, 2500)
BenchmarkTools.Trial:
  memory estimate:  7.34 KiB
  allocs estimate:  118
  --------------
  minimum time:     87.460 μs (0.00% GC)
  median time:      100.357 μs (0.00% GC)
  mean time:        104.403 μs (0.86% GC)
  maximum time:     9.551 ms (94.39% GC)
  --------------
  samples:          10000
  evals/sample:     1

To fit the original style I have then packed the CuArrays into a StructArray. This is quite advantageous as the CPU functions depend on the signals being kept in a StructArray. It eases the multiple dispatch declarations. This is also the reason why unused variables are still kept as parameters.

function downconvert2!(
    downconverted_signal::StructArray,
    signal::StructArray,
    carrier::StructArray,
    start_sample::Integer,
    num_samples_left::Integer
)
    # Multiply `signal` by the conjugated carrier, operating directly on the
    # separate real/imaginary component arrays of the StructArrays. Each
    # assignment below lowers to a single fused broadcast over the storage.
    # (start_sample / num_samples_left are intentionally unused; they exist
    # only so this method matches the CPU signature for dispatch.)
    sre, sim = signal.re, signal.im
    cre, cim = carrier.re, carrier.im
    downconverted_signal.re .= sre .* cre .+ sim .* cim
    downconverted_signal.im .= sim .* cre .- sre .* cim
    return downconverted_signal
end

julia> @benchmark downconvert2!(s_gpu_downconverted_signal, s_gpu_signal, s_gpu_carrier, 1, 2500)
BenchmarkTools.Trial:
  memory estimate:  7.72 KiB
  allocs estimate:  162
  --------------
  minimum time:     145.736 μs (0.00% GC)
  median time:      161.017 μs (0.00% GC)
  mean time:        166.046 μs (0.72% GC)
  maximum time:     12.791 ms (93.01% GC)
  --------------
  samples:          10000
  evals/sample:     1

As seen in the results StructArray of CuArrays is a bit slower on average. Is there any way of making it faster? Am I missing something? Thanks!

Edit 1: Corrected variable names

1 Like

Below are the variables used for testing if you want to have a go yourself.

start_sample = 1 # arbitrary
num_samples = 2500
# Zero-initialized GPU output buffer, plus a StructArray view of it holding
# the real and imaginary parts as two separate CuArrays.
# CONSISTENCY FIX: use `num_samples` instead of repeating the literal 2500.
gpu_downconverted_signal = CuArrays.fill(Complex{Float32}(0), num_samples)
s_gpu_downconverted_signal = StructArray{ComplexF32}((real(gpu_downconverted_signal), imag(gpu_downconverted_signal)))
# 1 kHz test tone sampled at 2.5 MHz.
phases = 2π * (1:num_samples) * 1000 / 2.5e6
# IDIOM: cis(ϕ) == cos(ϕ) + 1im*sin(ϕ), in one call.
gpu_signal = CuArray{Complex{Float32}}(cis.(phases))
s_gpu_signal = StructArray{Complex{Float32}}((real(gpu_signal), imag(gpu_signal)))
gpu_carrier = copy(gpu_signal)
s_gpu_carrier = StructArray{Complex{Float32}}((real(gpu_carrier), imag(gpu_carrier)))

Below is another piece of code which I’m testing. It is generating a local carrier wave replica from a given carrier frequency and other parameters. These are my variables:

gpucarrier = CuArrays.fill(ComplexF32(0), 2500)
2500-element CuArray{Complex{Float32},1,Nothing}:
 0.0f0 + 0.0f0im
       ⋮

cpucarrier = Array(gpucarrier)
2500-element Array{Complex{Float32},1}:
 0.0f0 + 0.0f0im
       ⋮

sgpucarrier = StructArray{ComplexF32}( (real(gpucarrier),imag(gpucarrier)) )
2500-element StructArray(::CuArray{Float32,1,Nothing}, ::CuArray{Float32,1,Nothing}) with eltype Complex{Float32}:
 0.0f0 + 0.0f0im
       ⋮

scpucarrier = StructArray{ComplexF32}((real(cpucarrier),imag(cpucarrier)))
2500-element StructArray(::Array{Float32,1}, ::Array{Float32,1}) with eltype Complex{Float32}:
 0.0f0 + 0.0f0im
       ⋮

This is a straightforward CPU implementation

function gen_carrier_replica!(
    carrier_replica::Array{Complex{Float32}},
    carrier_frequency,
    sampling_frequency,
    start_phase,
    carrier_amplitude_power,
    start_sample,
    num_samples
)
    # Fill `carrier_replica` in place with unit-amplitude complex samples of
    # a carrier at `carrier_frequency` Hz sampled at `sampling_frequency` Hz,
    # offset by `start_phase` (radians, added directly to the phase).
    # `carrier_amplitude_power` and `start_sample` are unused here; they are
    # kept for signature parity with the other methods.
    samples = 1:num_samples
    carrier_replica .= cis.(2π .* samples .* carrier_frequency ./ sampling_frequency .+ start_phase)
    return carrier_replica
end

This is the GPU function:

function gen_carrier_replica!(
           carrier_replica::CuArray{Complex{Float32}},
           # the rest is identical

This is the version using StructArrays.jl:

function gen_carrier_replica!(
    carrier_replica::StructArray{Complex{Float32}},
    carrier_frequency,
    sampling_frequency,
    start_phase,
    carrier_amplitude_power,
    start_sample,
    num_samples
)
    # Fill the StructArray carrier replica in place.
    # PERF FIX: the original allocated a temporary complex array `z`, filled
    # it with cis(...), and then copied `real(z)` / `imag(z)` into the fields;
    # `real`/`imag` on an array each allocate another full-length copy. Here
    # the cos/sin of the phase are broadcast straight into the component
    # arrays, so each field is written by one fused kernel with no
    # temporaries.
    # (`carrier_amplitude_power` / `start_sample` are unused; kept for
    # signature parity with the other methods.)
    @. carrier_replica.re = cos(2π * (1:num_samples) * carrier_frequency / sampling_frequency + start_phase)
    @. carrier_replica.im = sin(2π * (1:num_samples) * carrier_frequency / sampling_frequency + start_phase)
    return carrier_replica
end

The benchmark results:

julia> @benchmark gen_carrier_replica!(cpucarrier, 1500, 2.5e6, 0.25, 0, 0, 2500)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     191.307 μs (0.00% GC)
  median time:      191.787 μs (0.00% GC)
  mean time:        200.407 μs (0.00% GC)
  maximum time:     813.359 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark gen_carrier_replica!(gpucarrier, 1500, 2.5e6, 0.25, 0, 0, 2500)
BenchmarkTools.Trial:
  memory estimate:  3.13 KiB
  allocs estimate:  76
  --------------
  minimum time:     66.467 μs (0.00% GC)
  median time:      69.540 μs (0.00% GC)
  mean time:        72.649 μs (0.00% GC)
  maximum time:     362.965 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark gen_carrier_replica!(scpucarrier, 1500, 2.5e6, 0.25, 0, 0, 2500)
BenchmarkTools.Trial:
  memory estimate:  7.61 KiB
  allocs estimate:  227
  --------------
  minimum time:     270.992 μs (0.00% GC)
  median time:      283.440 μs (0.00% GC)
  mean time:        296.985 μs (0.62% GC)
  maximum time:     50.907 ms (36.42% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark gen_carrier_replica!(sgpucarrier, 1500, 2.5e6, 0.25, 0, 0, 2500)
BenchmarkTools.Trial:
  memory estimate:  11.67 KiB
  allocs estimate:  359
  --------------
  minimum time:     317.618 μs (0.00% GC)
  median time:      338.452 μs (0.00% GC)
  mean time:        348.656 μs (0.48% GC)
  maximum time:     39.897 ms (41.74% GC)
  --------------
  samples:          10000
  evals/sample:     1

TLDR: Using the GPU yields a ~2.8-fold speedup compared to basic arrays. StructArray of CuArrays remains the slowest.