Super hacky use of LoopVectorization: reinterpret the complex arrays as real, then do the complex multiply-accumulate with vpermilps177 + vfmaddsub.
using LoopVectorization

function turbo_batched_matmul!(
    Cc::AbstractArray{Complex{T},3},
    Ac::AbstractArray{Complex{T},3},
    Bc::AbstractArray{Complex{T},3},
) where {T}
    # View the complex arrays as real: A and C keep (re, im) interleaved along dim 1,
    # while B gets an explicit leading dimension of size 2 (1 = real part, 2 = imaginary part).
    C = reinterpret(T, Cc)
    A = reinterpret(T, Ac)
    B = reinterpret(reshape, T, Bc)
    @tturbo vectorize = 3 for b ∈ indices((A, B, C), (3, 4, 3)), n ∈ indices((C, B), (2, 3)), m ∈ indices((C, A), 1)
        Cmn = zero(T)
        for k ∈ indices((A, B), (2, 2))
            Amk = A[m, k, b]
            Aperm = vpermilps177(Amk) # swap each adjacent (re, im) pair
            Cmn = vfmaddsub(Amk, B[1, k, n, b], vfmaddsub(Aperm, B[2, k, n, b], Cmn))
        end
        C[m, n, b] = Cmn
    end
    return Cc
end
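The layout trick: after the reinterprets, each complex number sits in the vector registers as an adjacent (re, im) pair. vpermilps177 swaps each pair, and vfmaddsub multiplies and then subtracts the accumulator in even lanes while adding it in odd lanes (that is at least my reading of the fmaddsub semantics these wrap). Here is a scalar sketch of what one k-iteration does per pair; complex_fma_scalar is just my own illustration, not part of the kernel:

# Scalar model of one k-iteration of the kernel above, acting on one (re, im) pair.
# Assumes vfmaddsub(a, b, c) = a*b - c in even lanes and a*b + c in odd lanes, and that
# vpermilps177 swaps each adjacent pair (giving (ai, ar) from (ar, ai)).
function complex_fma_scalar(ar, ai, br, bi, cr, ci)
    inner_re = ai * bi - cr         # inner vfmaddsub, even (real) lane
    inner_im = ar * bi + ci         # inner vfmaddsub, odd (imag) lane
    new_cr   = ar * br - inner_re   # outer vfmaddsub, even lane: ar*br - ai*bi + cr
    new_ci   = ai * br + inner_im   # outer vfmaddsub, odd lane:  ai*br + ar*bi + ci
    return new_cr, new_ci
end

complex_fma_scalar(1.0, 2.0, 3.0, 4.0, 0.5, 0.25) == reim((1 + 2im) * (3 + 4im) + (0.5 + 0.25im)) # true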
using LinearAlgebra
BLAS.set_num_threads(1) # single-threaded BLAS; threading happens over batches via Threads.@threads

function batched_mul!(Z, X, Y)
    Threads.@threads for n in axes(Z, 3)
        @inbounds mul!(view(Z, :, :, n), view(X, :, :, n), view(Y, :, :, n))
    end
    return Z
end
Z = Array{ComplexF64,3}(undef,512,8,24192);
X = rand(ComplexF64,512,16,24192);
Y = rand(ComplexF64,16,8,24192);
turbo_batched_matmul!(Z, X, Y) ≈ batched_mul!(similar(Z), X, Y) # true
@benchmark turbo_batched_matmul!($Z,$X,$Y)
@benchmark batched_mul!($Z,$X,$Y)
I get
julia> turbo_batched_matmul!(Z, X, Y) ≈ batched_mul!(similar(Z), X, Y)
true
julia> @benchmark turbo_batched_matmul!($Z,$X,$Y)
BenchmarkTools.Trial: 8 samples with 1 evaluation per sample.
 Range (min … max):  132.097 ms … 148.658 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     136.215 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   138.137 ms ±   5.062 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

  132 ms          Histogram: frequency by time          149 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.
julia> @benchmark batched_mul!($Z,$X,$Y)
BenchmarkTools.Trial: 10 samples with 1 evaluation per sample.
 Range (min … max):  109.267 ms … 111.225 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     110.093 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   110.167 ms ± 564.070 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  109 ms          Histogram: frequency by time          111 ms <

 Memory estimate: 16.81 KiB, allocs estimate: 162.
Looks like the obvious solution wins here.
Something weird is happening with LV's threading, though:
julia> @inline function LoopVectorization._choose_num_threads(
           C::T,
           NT::UInt,
           x::Base.BitInteger
       ) where {T<:Union{Float32,Float64}}
           15 # always use 15 threads, ignoring the cost model's choice
       end
julia> @benchmark turbo_batched_matmul!($Z,$X,$Y)
BenchmarkTools.Trial: 10 samples with 1 evaluation per sample.
 Range (min … max):  109.023 ms … 109.778 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     109.225 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   109.313 ms ± 257.286 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  109 ms          Histogram: frequency by time          110 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.
Lowering it from 16 to 15 threads on this 16-core computer boosts performance, matching the @threads mul! version.
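For what it's worth, if the thread keyword of @turbo accepts an integer upper bound on the thread count (that's my recollection of the docstring, I haven't double-checked), the same cap could be expressed without pirating _choose_num_threads; the only change to the kernel is the macro line:

# Hypothetical variant, assuming `thread = N` is accepted as an upper bound on the number
# of threads; otherwise identical to turbo_batched_matmul! above. Untested here.
function turbo_batched_matmul_capped!(
    Cc::AbstractArray{Complex{T},3},
    Ac::AbstractArray{Complex{T},3},
    Bc::AbstractArray{Complex{T},3},
) where {T}
    C = reinterpret(T, Cc)
    A = reinterpret(T, Ac)
    B = reinterpret(reshape, T, Bc)
    @turbo thread = 15 vectorize = 3 for b ∈ indices((A, B, C), (3, 4, 3)), n ∈ indices((C, B), (2, 3)), m ∈ indices((C, A), 1)
        Cmn = zero(T)
        for k ∈ indices((A, B), (2, 2))
            Amk = A[m, k, b]
            Aperm = vpermilps177(Amk)
            Cmn = vfmaddsub(Amk, B[1, k, n, b], vfmaddsub(Aperm, B[2, k, n, b], Cmn))
        end
        C[m, n, b] = Cmn
    end
    return Cc
end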
Also cool: LV does the 20 batched multiplies with 2.95e10 instructions, while the @threads mul! loop needs 2.9x as many, at 8.6e10:
julia> using LinuxPerf # this is with LV hacked to use only 15 threads
julia> @time @pstats for _ in 1:20; turbo_batched_matmul!(Z,X,Y); end
┌ Warning: LinuxPerf.EventTypeExt(hw:stalled_cycles_backend, false, 0x0000000000000006) not supported, skipping
└ @ LinuxPerf ~/.julia/packages/LinuxPerf/Ylq05/src/LinuxPerf.jl:303
  2.212762 seconds (33.80 k allocations: 1.962 MiB, 1.10% compilation time)
┌───────────────────────────────────────────
│ cpu-cycles               1.63e+11  100.0%  #  5.0 cycles per ns
│ stalled-cycles-frontend  6.25e+08  100.0%  #  0.4% of cycles
│ instructions             2.95e+10  100.0%  #  0.2 insns per cycle
│ branch-instructions      4.40e+08  100.0%  #  1.5% of insns
│ branch-misses            6.16e+05  100.0%  #  0.1% of branch insns
│ task-clock               3.28e+10  100.0%  # 32.8 s
│ context-switches         0.00e+00  100.0%
│ cpu-migrations           0.00e+00  100.0%
│ page-faults              0.00e+00  100.0%
                  aggregated from 15 threads
└───────────────────────────────────────────
julia> @time @pstats for _ in 1:20; batched_mul!(Z,X,Y); end
┌ Warning: LinuxPerf.EventTypeExt(hw:stalled_cycles_backend, false, 0x0000000000000006) not supported, skipping
└ @ LinuxPerf ~/.julia/packages/LinuxPerf/Ylq05/src/LinuxPerf.jl:303
  2.242579 seconds (37.03 k allocations: 2.291 MiB, 1.08% compilation time)
┌───────────────────────────────────────────
│ cpu-cycles               3.12e+11  100.0%  #  4.8 cycles per ns
│ stalled-cycles-frontend  2.56e+09  100.0%  #  0.8% of cycles
│ instructions             8.60e+10  100.0%  #  0.3 insns per cycle
│ branch-instructions      3.65e+09  100.0%  #  4.2% of insns
│ branch-misses            4.03e+06  100.0%  #  0.1% of branch insns
│ task-clock               6.52e+10  100.0%  # 65.2 s
│ context-switches         0.00e+00  100.0%
│ cpu-migrations           0.00e+00  100.0%
│ page-faults              1.00e+00  100.0%
                  aggregated from 32 threads
└───────────────────────────────────────────
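Spelling out the arithmetic from the two counter dumps above (numbers copied straight from the @pstats output, nothing new measured):

lv_insns,  blas_insns  = 2.95e10, 8.60e10
lv_cycles, blas_cycles = 1.63e11, 3.12e11
blas_insns  / lv_insns   # ≈ 2.9: the @threads mul! loop retires ~2.9x the instructions
blas_cycles / lv_cycles  # ≈ 1.9: and burns ~1.9x the cycles for about the same wall time
lv_insns / 20            # ≈ 1.5e9 instructions per batched multiply with the LV kernel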