Dot-product of CuArray views is slow

using CUDA
using LinearAlgebra

function old(A::AbstractMatrix)
    d1, d2 = size(A)
    out = Array{eltype(A)}(undef, d2, d2)
    for i in 1:d2
        for j in 1:i
            @inbounds @views out[i,j] = A[:,i] ⋅ A[:,j]
            @inbounds out[j,i] = out[i,j]'
        end
    end
    return out
end

function new(A::AbstractMatrix)
    d1, d2 = size(A)
    out = similar(A, d2, d2)
    for i in 1:d2
        for j in 1:d2
            vec_out = reshape(@view(out[i,j]), 1)   # CUDA.jl's mul! doesn't like 0d
            mul!(vec_out, @view(A[:,i])', @view(A[:,j]))
        end
    end
    return out
end
julia> A = CUDA.rand(ComplexF32, 2^18, 2^4)

julia> @benchmark CUDA.@sync old($A)
BenchmarkTools.Trial: 
  memory estimate:  48.97 KiB
  allocs estimate:  2319
  --------------
  minimum time:     280.837 ms (0.00% GC)
  median time:      310.700 ms (0.00% GC)
  mean time:        306.896 ms (0.00% GC)
  maximum time:     310.896 ms (0.00% GC)
  --------------
  samples:          17
  evals/sample:     1

julia> @benchmark CUDA.@sync new($A)
BenchmarkTools.Trial: 
  memory estimate:  367.16 KiB
  allocs estimate:  16580
  --------------
  minimum time:     5.712 ms (0.00% GC)
  median time:      7.185 ms (0.00% GC)
  mean time:        7.666 ms (1.52% GC)
  maximum time:     47.268 ms (24.64% GC)
  --------------
  samples:          651
  evals/sample:     1

Use the available tools, i.e. the profiler, to figure out what’s happening on your system.

1 Like