Perhaps needs a Tullio issue:
julia> using LoopVectorization, Tullio, BenchmarkTools, Compat
julia> using LinearAlgebra; BLAS.vendor()
:mkl
julia> function f(A, B)
C = similar(A, size(A, 1), size(B, 2), size(A, 3))
for i in axes(A, 3)
mul!(@view(C[:, :, i]), @view(A[:, :, i]), B)
end
C
end
f (generic function with 1 method)
julia> g(A, B) = @tullio C[i, j, k] := A[i, l, k] * B[l, j]
g (generic function with 1 method)
julia> function h(A::AbstractArray{<:Any, 3}, B::AbstractMatrix)
C = similar(A, size(A, 1), size(B, 2), size(A, 3))
old_threads = Compat.get_num_threads() # BLAS.get_num_threads(), soon
Compat.set_num_threads(1)
Threads.@threads for i in axes(A, 3)
@views mul!(C[:, :, i], A[:, :, i], B)
end
Compat.set_num_threads(old_threads)
C
end;
julia> function AmulB!(C, A, B)
@avx for m ∈ axes(A,1), n ∈ axes(B,2)
Cmn = zero(eltype(C))
for k ∈ axes(A,2)
Cmn += A[m,k] * B[k,n]
end
C[m,n] = Cmn
end
end
AmulB! (generic function with 1 method)
julia> function k(A::AbstractArray{<:Any, 3}, B::AbstractMatrix)
C = similar(A, size(A, 1), size(B, 2), size(A, 3))
Threads.@threads for i in axes(A, 3)
@views AmulB!(C[:, :, i], A[:, :, i], B)
end
C
end;
julia> versioninfo()
Julia Version 1.5.3-pre.0
Commit 5905edebb1 (2020-09-24 05:09 UTC)
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-9.0.1 (ORCJIT, tigerlake)
julia> let A = randn(200, 201, 202), B = randn(201, 203)
@btime f($A, $B) # simple mul!
@btime g($A, $B) # @tullio
@btime h($A, $B) # threaded
@btime k($A, $B) # threaded loopvectorization
end;
33.038 ms (2 allocations: 62.57 MiB)
45.259 ms (52 allocations: 62.57 MiB)
26.197 ms (23 allocations: 62.57 MiB)
25.306 ms (23 allocations: 62.57 MiB)
LoopVectorization with @threads
takes 25 ms, vs 45 ms for Tullio.
On a desktop:
julia> using LoopVectorization, Tullio, BenchmarkTools, Compat
[ Info: Precompiling Compat [34da2185-b29b-5c13-b0c7-acf172513d20]
julia> using LinearAlgebra; BLAS.vendor()
:mkl
julia> function f(A, B)
C = similar(A, size(A, 1), size(B, 2), size(A, 3))
for i in axes(A, 3)
mul!(@view(C[:, :, i]), @view(A[:, :, i]), B)
end
C
end
f (generic function with 1 method)
julia> g(A, B) = @tullio C[i, j, k] := A[i, l, k] * B[l, j]
g (generic function with 1 method)
julia> function h(A::AbstractArray{<:Any, 3}, B::AbstractMatrix)
C = similar(A, size(A, 1), size(B, 2), size(A, 3))
old_threads = Compat.get_num_threads() # BLAS.get_num_threads(), soon
Compat.set_num_threads(1)
Threads.@threads for i in axes(A, 3)
@views mul!(C[:, :, i], A[:, :, i], B)
end
Compat.set_num_threads(old_threads)
C
end;
julia> function AmulB!(C, A, B)
@avx for m ∈ axes(A,1), n ∈ axes(B,2)
Cmn = zero(eltype(C))
for k ∈ axes(A,2)
Cmn += A[m,k] * B[k,n]
end
C[m,n] = Cmn
end
end
AmulB! (generic function with 1 method)
julia> function k(A::AbstractArray{<:Any, 3}, B::AbstractMatrix)
C = similar(A, size(A, 1), size(B, 2), size(A, 3))
Threads.@threads for i in axes(A, 3)
@views AmulB!(C[:, :, i], A[:, :, i], B)
end
C
end;
julia> versioninfo()
Julia Version 1.5.3-pre.0
Commit 5905edebb1* (2020-09-24 05:09 UTC)
Platform Info:
OS: Linux (x86_64-generic-linux)
CPU: Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-9.0.1 (ORCJIT, cascadelake)
Environment:
JULIA_NUM_THREADS = 18
julia> let A = randn(200, 201, 202), B = randn(201, 203)
@btime f($A, $B) # simple mul!
@btime g($A, $B) # @tullio
@btime h($A, $B) # threaded
@btime k($A, $B) # threaded loopvectorization
end;
16.672 ms (2 allocations: 62.57 MiB)
16.414 ms (244 allocations: 62.59 MiB)
3.724 ms (93 allocations: 62.58 MiB)
3.382 ms (93 allocations: 62.58 MiB)