Yes, this is what the `@tullio out3` example above was trying to say — probably much too obscurely.
NNlib.batched_mul
exists to wrap these functions. On the CPU it falls back to a loop over slices (so won’t beat a loop you write yourself).
Note that some of these results are Float32 and some Float64 (e.g. `zeros(N)` is always Float64). Your benchmark also times the copying of the data to the GPU, which I'm not sure you want to include.
Below are my attempts to time this… not terribly conclusive though, much will depend on what sizes you pick, and on your machine.
julia> function loop1(A,B,C,N=size(C,3)÷2)
# Serial baseline: one full (copying) matmul chain per pair index.
# Expects A of size (Nt, 2N, 2N), B of size (Nt, Nt), C of size (Nt, 2N, 2N).
res3 = zeros(N)  # NB: always Float64, even when A, B, C are Float32
for pq in 1:N
# tr( Aᵀ * B * C ) pairing the (2pq-1)-th slice of A with the (2pq)-th slice of C
res3[pq] = tr( transpose(@view A[:, 2*pq-1, :]) * B * (@view C[:, :, 2*pq]) )
end
res3
end
loop1 (generic function with 2 methods)
julia> function loop2(A,B,C,N=size(C,3)÷2)
# Same computation as loop1, but with Julia threads over the pairs.
# BLAS is pinned to 1 thread so the two levels of threading don't fight;
# the try/finally guarantees the caller's BLAS thread count is restored
# even if an iteration throws (the original leaked the setting on error).
res3 = zeros(N)  # NB: always Float64, even when A, B, C are Float32
th = BLAS.get_num_threads()
BLAS.set_num_threads(1)
try
Threads.@threads for pq in 1:N
res3[pq] = tr( transpose(@view A[:, 2*pq-1, :]) * B * (@view C[:, :, 2*pq]) )
end
finally
BLAS.set_num_threads(th)  # restore no matter what
end
res3
end
loop2 (generic function with 2 methods)
julia> using NNlib, TensorCore, Tullio
julia> function nnlib1(A,B,C)
# Batched version: permute the odd slices of A so the pair index is the
# 3rd (batch) dimension, do both matmuls for all pairs at once with
# NNlib's batched_mul (⊠), then take the trace of each resulting slice.
ABC = @views permutedims(A[:, 1:2:end, :],(3,1,2)) ⊠ B ⊠ C[:, :, 2:2:end];
tr.(eachslice(ABC; dims=3))
end
nnlib1 (generic function with 1 method)
julia> function tullio3(A,B,C)
# One batched matmul for B * (even slices of C), then a single @tullio
# contraction over i and j, i.e. out3[b] = Σᵢⱼ A[i,2b-1,j] * BC[i,j,b],
# which is tr(transpose(A slice) * BC slice) without forming the product.
BC = batched_mul(B, @view C[:, :, 2:2:end]) # threaded loop (on Arrays)
@tullio out3[b] := A[i,2b-1,j] * BC[i,j,b]
end
tullio3 (generic function with 1 method)
julia> function tullio4(A,B,C)
# Like tullio3, but TensorCore's boxdot folds the batch dimension into
# one big ordinary matmul instead of a batched/threaded loop of small ones.
BC = boxdot(B, C[:, :, 2:2:end]) # reshape & mul
@tullio out3[b] := A[i,2b-1,j] * BC[i,j,b]
end
tullio4 (generic function with 1 method)
julia> function bcmichael_vecbatch(A,B,C,N=size(C,3)÷2) # changed to make N local
# Collect the needed slices as plain matrices (copies), then exploit
# tr(transpose(X) * M) == sum(X .* M) so only B * Cbatch[pq] is a matmul —
# the final trace never needs the full product A' * B * C.
Abatch = [A[:, 2*pq-1, :] for pq = 1:N ];
Cbatch = [ C[:, :, 2*pq] for pq = 1:N ];
res = [sum( Abatch[pq] .* (B * Cbatch[pq])) for pq=1:N]
end
bcmichael_vecbatch (generic function with 1 method)
julia> function bcmichael_vecbatch_views(A,B,C,N=size(C,3)÷2)
# Same as bcmichael_vecbatch, but @views makes the batches SubArrays,
# avoiding the slice copies (halves the allocations in the timings below).
@views Abatch = [A[:, 2*pq-1, :] for pq = 1:N ];
@views Cbatch = [ C[:, :, 2*pq] for pq = 1:N ];
res = [sum( Abatch[pq] .* (B * Cbatch[pq])) for pq=1:N]
end
bcmichael_vecbatch_views (generic function with 1 method)
julia> using BenchmarkTools, LinearAlgebra
julia> let N = 50, Nt = 30, T = Float32, move=Array  # T = Float32 to match the results printed below
println(move, "{$T}, N=$N, Nt=$Nt")
A = move(randn(T,Nt,2*N,2*N)); C = move(randn(T,Nt,2*N,2*N)); B = move(randn(T,Nt,Nt));
println("loops over tr( * * )")
res1 = @btime loop1($A,$B,$C)
res2 = @btime loop2($A,$B,$C)
println("batched_mul then tr")
res3 = @btime nnlib1($A,$B,$C)
println("one mul then tullio")
res4 = @btime tullio3($A,$B,$C)
res5 = @btime tullio4($A,$B,$C)
println("bcmichael_vecbatch")
res6 = @btime bcmichael_vecbatch($A,$B,$C)
res7 = @btime bcmichael_vecbatch_views($A,$B,$C)  # was overwriting res6, leaving vecbatch unchecked
res1 ≈ res2 ≈ res3 ≈ res4 ≈ res5 ≈ res6 ≈ res7
end
Array{Float32}, N=50, Nt=30
loops over tr( * * )
min 499.458 μs, mean 957.305 μs (302 allocations, 4.97 MiB)
min 543.333 μs, mean 823.141 μs (324 allocations, 4.97 MiB)
batched_mul then tr
min 1.318 ms, mean 1.804 ms (181 allocations, 6.16 MiB)
one mul then tullio
min 385.417 μs, mean 461.693 μs (41 allocations, 1.15 MiB)
min 284.209 μs, mean 575.084 μs (10 allocations, 2.29 MiB)
bcmichael_vecbatch
min 600.625 μs, mean 1.070 ms (606 allocations, 4.59 MiB)
min 274.792 μs, mean 559.201 μs (308 allocations, 2.30 MiB)
true
julia> versioninfo()
Julia Version 1.11.0-DEV.901
Commit 4bc45a7f0a* (2023-11-14 16:49 UTC)
Platform Info:
OS: macOS (arm64-apple-darwin21.6.0)
CPU: 8 × Apple M1
WORD_SIZE: 64
LLVM: libLLVM-15.0.7 (ORCJIT, apple-m1)
Threads: 5 on 4 virtual cores
Environment:
JULIA_NUM_THREADS = 4
# on another machine, xeon with openblas, and a GPU
Array{Float32}, N=50, Nt=30
loops over tr( * * )
3.055 ms (251 allocations: 4.96 MiB)
1.069 ms (222 allocations: 4.97 MiB)
batched_mul then tr
5.805 ms (128 allocations: 6.15 MiB)
one mul then tullio
729.539 μs (40 allocations: 1.15 MiB)
1.360 ms (9 allocations: 2.29 MiB)
bcmichael_vecbatch
2.095 ms (403 allocations: 4.59 MiB)
1.449 ms (203 allocations: 2.30 MiB)
true
julia> using CUDA, KernelAbstractions
julia> function tullio3(A,B,C) # NB @tullio must be after using CUDA, KernelAbstractions
# Redefined so that @tullio expands with the KernelAbstractions GPU path
# available; on CuArray inputs batched_mul presumably dispatches to a
# batched GPU gemm via NNlib's CUDA support — worth confirming.
BC = batched_mul(B, @view C[:, :, 2:2:end])
@tullio out3[b] := A[i,2b-1,j] * BC[i,j,b]
end
tullio3 (generic function with 1 method)
julia> function tullio4(A,B,C)
# GPU redefinition of tullio4, for the same reason as tullio3 above:
# the @tullio macro must expand after CUDA + KernelAbstractions are loaded.
BC = boxdot(B, C[:, :, 2:2:end]) # reshape & mul
@tullio out3[b] := A[i,2b-1,j] * BC[i,j,b]
end
tullio4 (generic function with 1 method)
julia> let N = 50, Nt = 300, T = Float32, move=CuArray
println(move, "{$T}, N=$N, Nt=$Nt")
A = move(randn(T,Nt,2*N,2*N)); C = move(randn(T,Nt,2*N,2*N)); B = move(randn(T,Nt,Nt));
println("loops over tr( * * )")
res1 = @btime loop1($A,$B,$C)
# res2 = @btime loop2($A,$B,$C)  # Threads.@threads variant skipped on GPU
println("batched_mul then tr")
res3 = @btime nnlib1($A,$B,$C)
println("one mul then tullio")
res4 = Array(@btime CUDA.@sync tullio3($A,$B,$C)) # the function returns a CuArray
res5 = Array(@btime CUDA.@sync tullio4($A,$B,$C))
println("bcmichael_vecbatch")
res6 = @btime bcmichael_vecbatch($A,$B,$C)
res7 = @btime bcmichael_vecbatch_views($A,$B,$C)  # was overwriting res6, leaving vecbatch unchecked
res1 ≈ res3 ≈ res4 ≈ res5 ≈ res6 ≈ res7  # drops the original's redundant `res1 ≈ res1`; res2 not run
end
CuArray{Float32}, N=50, Nt=300
loops over tr( * * )
9.277 ms (7753 allocations: 377.64 KiB) # NB this is only CPU alloc, not GPU
batched_mul then tr
3.959 ms (3705 allocations: 178.12 KiB)
one mul then tullio
4.957 ms (103 allocations: 4.12 KiB)
5.064 ms (194 allocations: 7.25 KiB)
bcmichael_vecbatch
12.022 ms (9955 allocations: 496.25 KiB)
12.407 ms (6755 allocations: 363.16 KiB)
true
julia> CUDA.device()
CuDevice(0): Tesla V100-PCIE-16GB