Hello,
I have noticed that Julia multithreading and Distributed scale very badly when calculating relatively heavy computations involving BLAS and LAPACK. As an example, here I show you the calculation of the expv
function, which calculates exp(A) * b
(with A
a square matrix and b
a vector) without directly calculating
exp(A)` but rather using the Arnoldi algorithm.
Here I show you the main code of the algorithm, but I donβt think it is necessary to study it in depth. It should be well written (correct me if it is not), involving only in-place matrix-vector multiplications mul!(y, A, x)
, where x
and y
can also be a view
, and then it involves exp(B)
where B
is a 30x30 matrix. I paste it here just for completeness
Arnoldi expv
struct ArnoldiSpace{VT<:AbstractMatrix{<:BlasFloat},HT<:AbstractMatrix{<:BlasFloat},mT<:Integer}
V::VT
H::HT
Hcopy::HT
m::mT
end
function Base.copy(AS::ArnoldiSpace{<:AbstractMatrix{T1},<:AbstractMatrix{T1}}) where {T1<:BlasFloat}
return ArnoldiSpace(copy(AS.V), copy(AS.H), copy(AS.Hcopy), AS.m)
end
function Base.deepcopy(AS::ArnoldiSpace{<:AbstractMatrix{T1},<:AbstractMatrix{T1}}) where {T1<:BlasFloat}
return ArnoldiSpace(deepcopy(AS.V), deepcopy(AS.H), deepcopy(AS.Hcopy), AS.m)
end
function arnoldi_init!(A, b::AbstractVector{T}, V::AbstractMatrix{T}, H::AbstractMatrix{T}) where {T<:BlasFloat}
vβ = view(V, :, 1)
vβ = view(V, :, 2)
vβ .= b
normalize!(vβ)
mul!(vβ, A, vβ)
H[1, 1] = dot(vβ, vβ)
axpy!(-H[1, 1], vβ, vβ)
H[2, 1] = norm(vβ)
return vβ ./= H[2, 1]
end
function arnoldi_step!(A, V::AbstractMatrix{T}, H::AbstractMatrix{T}, i::TI) where {T<:BlasFloat,TI<:Integer}
vα΅’ = view(V, :, i)
vα΅’ββ = view(V, :, i + 1)
mul!(vα΅’ββ, A, vα΅’)
for j in 1:i
vβ±Ό = view(V, :, j)
H[j, i] = dot(vβ±Ό, vα΅’ββ)
axpy!(-H[j, i], vβ±Ό, vα΅’ββ)
end
Ξ² = H[i+1, i] = norm(vα΅’ββ)
vα΅’ββ ./= H[i+1, i]
return Ξ²
end
function arnoldi!(
AS::ArnoldiSpace{<:AbstractMatrix{T1},<:AbstractMatrix{T1}},
A,
b::AbstractVector{T2},
) where {T1<:BlasFloat,T2<:BlasFloat}
n = size(A, 2)
V = AS.V
H = AS.H
m = AS.m
n == size(V, 1) || throw(DimensionMismatch())
n == length(b) || throw(DimensionMismatch())
arnoldi_init!(A, b, V, H)
for i in 2:m
arnoldi_step!(A, V, H, i)
end
return AS
end
function arnoldi(A, b::AbstractVector{T}, m::Integer) where {T<:BlasFloat}
n = size(A, 2)
V = similar(b, n, m + 1)
H = zeros(T, m + 1, m)
AS = ArnoldiSpace(V, H, copy(H), m)
return arnoldi!(AS, A, b)
end
### EXPV TOOLS ###
function expv!(
x::AbstractVector{T1},
AS::ArnoldiSpace{<:AbstractMatrix{T1},<:AbstractMatrix{T1}},
t::T2,
b::AbstractVector{T1},
) where {T1<:BlasFloat,T2<:Union{BlasFloat,BlasInt}}
H = AS.H
Hcopy = AS.Hcopy
V = AS.V
m = AS.m
Hm = view(H, 1:m, 1:m)
Vm = view(V, :, 1:m)
lmul!(t, Hm)
Hcopym = view(Hcopy, 1:m, 1:m)
copyto!(Hcopym, Hm)
expH = LinearAlgebra.exp!(Hcopym)
# expH = exp(Hm)
Ξ² = norm(b)
expHe = expH[:, 1] # the view doesn't work when using copyto! with CUDA
cache = similar(x, m)
copyto!(cache, expHe) # just in case expHe is different from cache (e.g., with CUDA)
mul!(x, Vm, cache)
lmul!(Ξ², x)
return x
end
function expv!(
x::AbstractVector{T1},
A,
t::T2,
b::AbstractVector{T1};
m::Int = min(30, cld(2 * length(b), 3)),
) where {T1<:BlasFloat,T2<:Union{BlasFloat,BlasInt}}
AS = arnoldi(A, b, m)
return expv!(x, AS, t, b)
end
function expv(
A,
t::T1,
b::AbstractVector{T2};
m::Int = min(30, cld(2 * length(b), 3)),
) where {T1<:BlasFloat,T2<:BlasFloat}
x = similar(b)
return expv!(x, A, t, b, m = m)
end
And then I create a file called expv.jl
to make some benchmarks
# expv.jl #
using Distributed
addprocs(Threads.nthreads())
##
@everywhere begin
using LinearAlgebra
using LinearAlgebra: BlasFloat, BlasInt
using SparseArrays
BLAS.set_num_threads(1)
end
using BenchmarkTools
using JLD2
##
N = parse(Int, ARGS[1])
T = ComplexF64
H = sprand(T, N, N, 0.1/N)
H = H + H' # Hermitianize
V = sprand(T, N, N, 0.01/N)
V = V + V' # Hermitianize
Ο0 = rand(T, N)
normalize!(Ο0)
t = 0.4
expv(-1im * H, t, Ο0)
function calc_serial(H, V, Ο0, t, g_list)
res = similar(g_list)
for i in eachindex(g_list)
g = g_list[i]
A = -1im * (H + g * V)
res[i] = abs2( dot(Ο0, expv(A, t, Ο0)) )
end
return res
end
function calc_multithread(H, V, Ο0, t, g_list)
res = similar(g_list)
Threads.@threads for i in eachindex(g_list)
g = g_list[i]
A = -1im * (H + g * V)
res[i] = abs2( dot(Ο0, expv(A, t, Ο0)) )
end
return res
end
function calc_distributed(H, V, Ο0, t, g_list)
res = pmap(g_list) do g
A = -1im * (H + g * V)
abs2( dot(Ο0, expv(A, t, Ο0)) )
end
return res
end
##
g_list = range(0.0, 0.1, 32)
# calc_serial(H, V, Ο0, t, g_list)
# calc_multithread(H, V, Ο0, t, g_list)
# calc_distributed(H, V, Ο0, t, g_list)
##
if Threads.nthreads() == 1
bench_serial = @benchmark calc_serial($H, $V, $Ο0, $t, $g_list)
mean_time_serial = 1e-9 * sum(bench_serial.times) / length(bench_serial.times)
jldopen("benchmarks.jld2", "a+") do file
file["mean_time_serial_$(Threads.nthreads())_$N"] = mean_time_serial
end
end
bench_multithread = @benchmark calc_multithread($H, $V, $Ο0, $t, $g_list)
bench_distributed = @benchmark calc_distributed($H, $V, $Ο0, $t, $g_list)
mean_time_multithread = 1e-9 * sum(bench_multithread.times) / length(bench_multithread.times)
mean_time_distributed = 1e-9 * sum(bench_distributed.times) / length(bench_distributed.times)
jldopen("benchmarks.jld2", "a+") do file
file["mean_time_multithread_$(Threads.nthreads())_$N"] = mean_time_multithread
file["mean_time_distributed_$(Threads.nthreads())_$N"] = mean_time_distributed
end
And then running the following bash script
#!/bin/bash
MAX_THREADS=32
for ((threads=1; threads<=MAX_THREADS; threads++)); do
# for ((N=1000; N<=100000; N*=10)); do
# julia -t $threads --project expv.jl $N
# echo "$threads,$N"
# done
N=100000
julia -t $threads --project expv.jl $N
echo "$threads,$N"
done
The benchmarks are very disappointing. Especially for the Distributed case, were I would expect better performances, due to the heavy computations. The Serial case is a clean horizontal line because I did only one benchmark.
The general system infos are
julia> versioninfo()
Julia Version 1.10.4
Commit 48d4fd48430 (2024-06-04 10:41 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 32 Γ 13th Gen Intel(R) Core(TM) i9-13900KF
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-15.0.7 (ORCJIT, goldmont)
Threads: 16 default, 0 interactive, 8 GC (on 32 virtual cores)
Environment:
JULIA_EDITOR = code
JULIA_NUM_THREADS = 16
Just to check if multithreading is working properly, I tested a simple matrix-vector multiplication
function matvec_mul!(y, A, x)
for i in 1:size(A, 1)
y[i] = 0
for j in 1:size(A, 2)
y[i] += A[i, j] * x[j]
end
end
y
end
function matvec_mul_threaded!(y, A, x)
Threads.@threads for i in 1:size(A, 1)
y[i] = 0
for j in 1:size(A, 2)
y[i] += A[i, j] * x[j]
end
end
y
end
And I get 88ms (Serial) vs 16ms (Multithreaded) on a ComplexF64
5000x5000 matrix, which seems more reasonable (but still just a factor 5.5) when using 16 threads.