using CUDA
using LinearAlgebra
function old(A::AbstractMatrix)
d1, d2 = size(A)
out = Array{eltype(A)}(undef, d2, d2)
for i in 1:d2
for j in 1:i
@inbounds @views out[i,j] = A[:,i] ⋅ A[:,j]
@inbounds out[j,i] = out[i,j]'
end
end
return out
end
function new(A::AbstractMatrix)
d1, d2 = size(A)
out = similar(A, d2, d2)
for i in 1:d2
for j in 1:d2
vec_out = reshape(@view(out[i,j]), 1) # CUDA.jl's mul! doesn't like 0d
mul!(vec_out, @view(A[:,i])', @view(A[:,j]))
end
end
return out
end
julia> A = CUDA.rand(ComplexF32, 2^18, 2^4)
julia> @benchmark CUDA.@sync old($A)
BenchmarkTools.Trial:
memory estimate: 48.97 KiB
allocs estimate: 2319
--------------
minimum time: 280.837 ms (0.00% GC)
median time: 310.700 ms (0.00% GC)
mean time: 306.896 ms (0.00% GC)
maximum time: 310.896 ms (0.00% GC)
--------------
samples: 17
evals/sample: 1
julia> @benchmark CUDA.@sync new($A)
BenchmarkTools.Trial:
memory estimate: 367.16 KiB
allocs estimate: 16580
--------------
minimum time: 5.712 ms (0.00% GC)
median time: 7.185 ms (0.00% GC)
mean time: 7.666 ms (1.52% GC)
maximum time: 47.268 ms (24.64% GC)
--------------
samples: 651
evals/sample: 1
Use the available tools, i.e. the profiler, to figure out what’s happening on your system.