Hi!
Such a norm call is slow because it is a reduction (basically, it finds the maximum absolute value of the array), which is hard to execute efficiently in parallel on the GPU. What I recommend is simply reducing the number of norm calls as much as possible. Something like this:
using LinearAlgebra, SparseArrays, CUDA

function expCU2(A_gpu::CuArray{Float64,2}; threshold=1e-6)
    rows = LinearAlgebra.checksquare(A_gpu)
    P = spzeros(rows, rows)
    P[diagind(P)] = ones(rows)        # identity matrix as the starting point of the series
    mat_norm = norm(A_gpu, Inf)       # single norm call, outside the loop
    # Scaling to ensure the matrix doesn't blow up, which gives a significant speed up
    scaling_factor = nextpow(2, mat_norm)
    A_gpu = A_gpu ./ scaling_factor
    mat_norm /= scaling_factor
    P_gpu = CuArray{Float64,2}(P)
    next_term = copy(P_gpu)
    n = 1
    delta = one(mat_norm)
    while delta > threshold
        next_term = (1/n)*A_gpu*next_term
        # Cheap upper-bound estimate of the term size instead of a norm call per iteration
        delta *= mat_norm / sqrt(n)   # not exact, but OK as a heuristic
        P_gpu = P_gpu .+ next_term
        n += 1
    end
    # Repeated squaring of P to undo the initial scaling
    for n = 1:Int(log2(scaling_factor))
        P_gpu = P_gpu*P_gpu
    end
    return P_gpu
end
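If you want to sanity-check the result, you can compare it against the CPU matrix exponential from LinearAlgebra on a small test matrix. Something along these lines (the matrix below is just a placeholder, not the one I used for the benchmarks):

using LinearAlgebra, CUDA

A = randn(512, 512) ./ 512                # hypothetical test matrix
A_gpu = CuArray{Float64,2}(A)
P_gpu = expCU2(A_gpu)
maximum(abs.(Array(P_gpu) .- exp(A)))     # should come out small, roughly at the truncation-threshold level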
These are the benchmark results I got:
julia> using BenchmarkTools
julia> @btime expCU(A_gpu);
849.408 ms (4459 allocations: 7.90 MiB)
julia> @btime expCU2(A_gpu);
29.819 ms (3246 allocations: 7.84 MiB)
Also, about the norm timing: it seems the time you see there actually comes from allocating new memory for next_term on every iteration (GPU kernels run asynchronously, so the cost tends to show up at the next synchronizing call, which is the norm). You may want to try creating next_term_buffer = similar(next_term) before the loop and doing
mul!(next_term_buffer, A_gpu, next_term, 1/n, 0)
next_term_buffer, next_term = next_term, next_term_buffer
in the loop.
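In the context of expCU2 above, the loop would then look roughly like this (untested sketch, same variable names as before; the five-argument mul! computes next_term_buffer = A_gpu*next_term*(1/n) in place):

next_term_buffer = similar(next_term)   # scratch buffer, reused on every iteration
while delta > threshold
    # In-place update: next_term_buffer = (1/n) * A_gpu * next_term, no new allocation
    mul!(next_term_buffer, A_gpu, next_term, 1/n, 0)
    # Swap the bindings so next_term points at the fresh term and the old array becomes the buffer
    next_term_buffer, next_term = next_term, next_term_buffer
    delta *= mat_norm / sqrt(n)
    P_gpu .+= next_term                  # accumulate in place as well, avoiding another allocation
    n += 1
end

The swap just exchanges the two array bindings, so no data is copied; the previous term's storage gets recycled as the scratch buffer on the next iteration.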