Ah, thanks for the explanation! Could you please share the test you made? For me here the speedup only occurs for dense matrices; for sparse, the transpose method `f2b!`

is considerably slower than `f2!`

but I may be messing something up. I used the function `f2b!`

as

```
function f2_nobroadcasting_usetranspose!(result, A, Bt)
for (ind, r, c) in zip(eachindex(result), eachrow(A), eachrow(Bt))
result[ind] = dot(r, c)
end
nothing
end
```

(the code that you shared for yours is using B and eachcol(A), which I don’t think is correct).

##
Tests

```
using BenchmarkTools, LinearAlgebra, Test, SparseArrays
using Tullio, LoopVectorization
function f1!(tmp, result, A, B)
tmp .= A .* B'
sum!(result, tmp)
nothing
end
function f2!(result, A, B)
result .= dot.(eachrow(A), eachcol(B))
nothing
end
function f2_nobroadcasting!(result, A, B)
for (ind, r, c) in zip(eachindex(result), eachrow(A), eachcol(B))
result[ind] = dot(r, c)
end
nothing
end
function f2_nobroadcasting_usetranspose!(result, A, Bt)
for (ind, r, c) in zip(eachindex(result), eachrow(A), eachrow(Bt))
result[ind] = dot(r, c)
end
nothing
end
function f3!(result, A, B)
@tullio result[i] = A[i,j]*B[j,i]
nothing
end
@testset verbose = true "Diagonal of matrix multiplication" begin
N = 10000
@testset "Dense case" begin
A = randn(N,N); B = randn(N,N);
tmp = randn(N, N); result = randn(N);
@btime f1!($tmp, $result, $A, $B);
result_1 = deepcopy(result);
@btime f2!($result, $A, $B);
result_2 = deepcopy(result);
@btime f2_nobroadcasting!($result, $A, $B);
result_2ndb = deepcopy(result);
Bt = transpose(B)
@btime f2_nobroadcasting_usetranspose!($result, $A, $Bt);
result_2ndb_tran = deepcopy(result);
@btime f3!($result, $A, $B);
result_3 = deepcopy(result);
@test result_1 ≈ result_2
@test result_1 ≈ result_2ndb
@test result_1 ≈ result_2ndb_tran
@test result_1 ≈ result_3
end
@testset "Sparse case" begin
d = 1e-4
sA = sprand(N,N, d); sB = sprand(N,N, d);
stmp = sA * sB'; sresult = randn(N);
@btime f1!($stmp, $sresult, $sA, $sB);
result_1s = deepcopy(sresult);
@btime f2!($sresult, $sA, $sB);
result_2s = deepcopy(sresult);
@btime f2_nobroadcasting!($sresult, $sA, $sB);
result_2ndbs = deepcopy(sresult);
sBt = transpose(sB)
@btime f2_nobroadcasting_usetranspose!($sresult, $sA, $sBt);
result_2ndb_tran_s = deepcopy(sresult);
@btime f3!($sresult, $sA, $sB);
result_3s = deepcopy(sresult);
@test result_1s ≈ result_2s
@test result_1s == result_2ndb_tran_s
@test result_1s == result_2ndbs
@test result_1s == result_3s
end
end
```

Gives

##
Results

```
#Dense
##N = 10
# 197.969 ns (0 allocations: 0 bytes)
# 348.294 ns (2 allocations: 992 bytes)
# 221.057 ns (0 allocations: 0 bytes)
# 156.080 ns (0 allocations: 0 bytes) **almost 2x speedup over non-transpose
# 71.441 ns (0 allocations: 0 bytes) ** Tullio
##N=1000
# 3.363 ms (0 allocations: 0 bytes)
# 2.207 ms (4 allocations: 78.22 KiB)
# 2.154 ms (0 allocations: 0 bytes)
# 2.316 ms (0 allocations: 0 bytes)
# 1.625 ms (0 allocations: 0 bytes) ** Tullio
##N = 10000
# 1.219 s (0 allocations: 0 bytes)
# 1.230 s (4 allocations: 781.34 KiB)
# 1.020 s (0 allocations: 0 bytes)
# 1.138 s (0 allocations: 0 bytes)
# 218.114 ms (0 allocations: 0 bytes) ***
#Sparse
##N=10
# 383.941 ns (7 allocations: 544 bytes)
# 282.670 ns (2 allocations: 1.59 KiB)
# 109.996 ns (0 allocations: 0 bytes) **
# 329.524 ns (0 allocations: 0 bytes) (transpose is slower!)
# 300.054 ns (0 allocations: 0 bytes)
##N = 1000
# 11.805 μs (9 allocations: 17.78 KiB)
# 22.150 μs (4 allocations: 140.72 KiB)
# 11.354 μs (0 allocations: 0 bytes) **
# 2.840 ms (0 allocations: 0 bytes)
# 3.092 ms (0 allocations: 0 bytes)
##N = 10000
# 395.792 μs (11 allocations: 311.41 KiB)
# 481.284 μs (4 allocations: 1.37 MiB)
# 346.121 μs (0 allocations: 0 bytes) **
# 1.405 s (0 allocations: 0 bytes)
# 649.908 ms (0 allocations: 0 bytes)
```