I’m debugging a complex program within infiltrator, and this is the log:

```
infil> @inline function my_set!(ψ_sm, ψ, atomic)
sm = atomic.sub_mesh
np = sm.np_local
@inbounds @simd for i_p = 1 : np
ψ_sm[i_p] = ψ[sm.mapping_to_total[i_p]]
end
return
end
my_set! (generic function with 1 method)
infil> @inline function my_mul!(βψ, β, ψ_sm, atomic, o_start, coeff)
sm = atomic.sub_mesh
np = sm.np_local
@views mul!(βψ[:, o_start], β', ψ_sm[1:np], coeff, 0.0)
return
end
my_mul! (generic function with 1 method)
infil> function my_whole_wrapper(βψ, β, atomic, ψ_sm, ψ, o_start, coeff)
my_set!(ψ_sm, ψ, atomic)
my_mul!(βψ, β, ψ_sm, atomic, o_start, coeff)
return
end
my_whole_wrapper (generic function with 1 method)
infil> @btime my_set!(ψ_sm, ψ, beta_data.atomic[1]);
5.690 μs (0 allocations: 0 bytes)
infil> @btime my_mul!(beta_data.atomic[1].βψ, beta_data.atomic[1].phased_β, ψ_sm, beta_data.atomic[1], o_start, coeff)
23.377 μs (5 allocations: 176 bytes)
infil> @btime my_whole_wrapper(beta_data.atomic[1].βψ, beta_data.atomic[1].phased_β, beta_data.atomic[1], ψ_sm, ψ, o_start, coeff)
36.725 μs (5 allocations: 176 bytes)
```

The time cost for `my_whole_wrapper`

is about 7μs longer than the sum of `my_set!`

and `my_mul!`

.

The test is run in a node when the CPU usage is about 33%. The hyperthreading is turned on, so actually about 66% of physical cores (that is to say, 32 out of 48) are used. If the same test is performed when the CPU usage is low, the same problem still occurs:

```
infil> @btime my_set!(ψ_sm, ψ, beta_data.atomic[1]);
4.831 μs (0 allocations: 0 bytes)
infil> @btime my_mul!(beta_data.atomic[1].βψ, beta_data.atomic[1].phased_β, ψ_sm, beta_data.atomic[1], o_start, coeff)
18.458 μs (5 allocations: 176 bytes)
infil> @btime my_whole_wrapper(beta_data.atomic[1].βψ, beta_data.atomic[1].phased_β, beta_data.atomic[1], ψ_sm, ψ, o_start, coeff)
30.517 μs (5 allocations: 176 bytes)
```

And, despite all three functions take less time, the time difference is still around 7μs.

I’m curious what makes this difference. I’m using MKL, setting BLAS threads to 1, and OMP threads to 1.

```
infil> BLAS.get_config()
LinearAlgebra.BLAS.LBTConfig
Libraries:
├ [ILP64] libmkl_rt.so
└ [ LP64] libmkl_rt.so
infil> BLAS.get_num_threads()
1
infil> ENV["OMP_NUM_THREADS"]
"1"
```

More info:

```
Julia Version 1.9.4
Commit 8e5136fa297 (2023-11-14 08:46 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 96 × Intel(R) Xeon(R) Gold 6240R CPU @ 2.40GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-14.0.6 (ORCJIT, cascadelake)
Threads: 1 on 96 virtual cores
Environment:
LD_LIBRARY_PATH = /public/opt/tools/julia/julia-1.9.4/lib/julia:/public/opt/tools/julia/julia-1.9.4/lib
```

The above benchmarks have a little problem: variables are not interpolated. I’ve redone the low CPU usage example, with little difference:

```
infil> @btime my_set!(ψ_sm, ψ, beta_data.atomic[1]);
4.822 μs (0 allocations: 0 bytes)
infil> @btime my_mul!(beta_data.atomic[1].βψ, beta_data.atomic[1].phased_β, ψ_sm, beta_data.atomic[1], o_start, coeff)
17.444 μs (5 allocations: 176 bytes)
infil> @btime my_whole_wrapper(beta_data.atomic[1].βψ, beta_data.atomic[1].phased_β, beta_data.atomic[1], ψ_sm, ψ, o_start, coeff)
30.540 μs (5 allocations: 176 bytes)
infil> @btime my_set!($ψ_sm, $ψ, $beta_data.atomic[1]);
4.755 μs (0 allocations: 0 bytes)
infil> @btime my_mul!($beta_data.atomic[1].βψ, $beta_data.atomic[1].phased_β, $ψ_sm, $beta_data.atomic[1], $o_start, $coeff)
17.040 μs (5 allocations: 176 bytes)
infil> @btime my_whole_wrapper($beta_data.atomic[1].βψ, $beta_data.atomic[1].phased_β, $beta_data.atomic[1], $ψ_sm, $ψ, $o_start, $coeff)
29.823 μs (5 allocations: 176 bytes)
```

All tests are a little bit quicker, but the difference is still large.

A not-so-satisfactory MWE:

```
using Random
using LinearAlgebra
mutable struct Submesh
np_local::Int
mapping_to_total::Vector{Int}
end
mutable struct Atomic
sub_mesh::Submesh
βψ::Matrix{ComplexF64}
β::Matrix{ComplexF64}
end
function get_data()
atomics = Vector{Atomic}(undef, 20)
for i = 1 : 20
if i <= 5
np_local = 5000
n_β = 9
elseif i <= 10
np_local = 6000
n_β = 18
else
np_local = 7000
n_β = 13
end
mapping_to_total = zeros(Int, np_local)
offset = 8000 * (i-1)
for ip = 1 : np_local
mapping_to_total[ip] = ceil(Int, rand() * np_local) + offset
end
atomics[i] = Atomic(
Submesh(np_local, mapping_to_total),
zeros(ComplexF64, n_β, 30),
zeros(ComplexF64, np_local, n_β)
)
rand!(atomics[i].β)
end
ψ = zeros(ComplexF64, 160000)
rand!(ψ)
ψ_sm_data = zeros(ComplexF64, 10000)
ψ_sm = view(ψ_sm_data, 1001:8000)
coeff = 1.0 / (40 * 40 * 108)
o_start = 1
return atomics, ψ, ψ_sm_data, ψ_sm, coeff, o_start
end
@inline function my_set!(ψ_sm, ψ, atomic)
sm = atomic.sub_mesh
np = sm.np_local
@inbounds @simd for i_p = 1 : np
ψ_sm[i_p] = ψ[sm.mapping_to_total[i_p]]
end
return
end
@inline function my_mul!(βψ, β, ψ_sm, atomic, o_start, coeff)
sm = atomic.sub_mesh
np = sm.np_local
@views mul!(βψ[:, o_start], β', ψ_sm[1:np], coeff, 0.0)
return
end
function my_whole_wrapper!(βψ, β, atomic, ψ_sm, ψ, o_start, coeff)
my_set!(ψ_sm, ψ, atomic)
my_mul!(βψ, β, ψ_sm, atomic, o_start, coeff)
return
end
@noinline function my_set2!(ψ_sm, ψ, atomic)
sm = atomic.sub_mesh
np = sm.np_local
@inbounds @simd for i_p = 1 : np
ψ_sm[i_p] = ψ[sm.mapping_to_total[i_p]]
end
return
end
@noinline function my_mul2!(βψ, β, ψ_sm, atomic, o_start, coeff)
sm = atomic.sub_mesh
np = sm.np_local
@views mul!(βψ[:, o_start], β', ψ_sm[1:np], coeff, 0.0)
return
end
function my_whole_wrapper2!(βψ, β, atomic, ψ_sm, ψ, o_start, coeff)
my_set2!(ψ_sm, ψ, atomic)
my_mul2!(βψ, β, ψ_sm, atomic, o_start, coeff)
return
end
function my_set3!(ψ_sm, ψ, atomic)
sm = atomic.sub_mesh
np = sm.np_local
@inbounds @simd for i_p = 1 : np
ψ_sm[i_p] = ψ[sm.mapping_to_total[i_p]]
end
return
end
function my_mul3!(βψ, β, ψ_sm, atomic, o_start, coeff)
sm = atomic.sub_mesh
np = sm.np_local
@views mul!(βψ[:, o_start], β', ψ_sm[1:np], coeff, 0.0)
return
end
function my_whole_wrapper3!(βψ, β, atomic, ψ_sm, ψ, o_start, coeff)
my_set3!(ψ_sm, ψ, atomic)
my_mul3!(βψ, β, ψ_sm, atomic, o_start, coeff)
return
end
```

It is not so satisfactory for two reasons:

- It has no allocations, while the original example has. I’ll dig it out later.
- The overhead is smaller in this MWE, about 10-20%.

But the difference still exists, and it should suffice as a MWE.

My observations:

- Inlining has little to do with this overhead.
- The number of BLAS threads has a great impact, but even when it is set to 1, there is also aforementioned 10-20% overhead. If it is set to default (48 in my case), there will be about 100% overhead.
- The
`OMP_NUM_THREADS`

variable may affect the speed, but the overhead remains.