Edit: corrected after @lmiq `s observations…
Here is a modified MWE, adding the proposals of the thread and using @aplavin’s nice trick Y.A.t.Q : Yet Another @threads Question - #13 by aplavin
using LinearAlgebra
using LoopVectorization
using Tullio
using BenchmarkTools
function normal!(C,Cx,x,A,B)
temp = Vector{Float64}(undef,size(A,2)*size(B,2))
# @fastmath @views @inbounds for n = 1:size(A,1)
@views for n = 1:size(A,1)
kron!(temp,A[n,:],B[n,:])
C .= C.+ temp.*temp'
Cx .= Cx .+ temp.*x[n]
end
return Cx
end
function normalTurbo!(C,Cx,x,A,B)
temp = Vector{Float64}(undef,size(A,2)*size(B,2))
# @fastmath @views @inbounds for n = 1:size(A,1)
@views for n = 1:size(A,1)
kron!(temp,A[n,:],B[n,:])
@turbo C .= C.+ temp.*temp'
@turbo Cx .= Cx .+ temp.*x[n]
end
return Cx
end
function normalTturbo!(C,Cx,x,A,B)
temp = Vector{Float64}(undef,size(A,2)*size(B,2))
# @fastmath @views @inbounds for n = 1:size(A,1)
@views for n = 1:size(A,1)
kron!(temp,A[n,:],B[n,:])
@tturbo C .= C.+ temp.*temp'
@tturbo Cx .= Cx .+ temp.*x[n]
end
return Cx
end
# swapped arguments
function normalEquations!(C,Cx,x,A,B)
temp = Matrix{Float64}(undef,size(B,2),size(A,2))
C4 = reshape(C, size(B,2),size(A,2),size(B,2),size(A,2)) # writing into original C
C4x = reshape(Cx, size(B,2),size(A,2))
# @fastmath @inbounds @turbo for m = 1:size(B,2)
@turbo for m = 1:size(B,2)
for r = 1:size(A,2)
for mm = 1:size(B,2)
for rr = 1:size(A,2)
for n = 1:size(B,1)
temp = A[n,r] * B[n,m]
C4[m,r,mm,rr] = C4[m,r,mm,rr] + temp * A[n,rr] * B[n,mm]
C4x[m,r] = C4x[m,r] + temp*x[n]
end
end
end
end
end
return Cx
end
# swapped arguments
function normalEquationsP!(C,Cx,x,A,B)
temp = Matrix{Float64}(undef,size(B,2),size(A,2))
C4 = reshape(C, size(B,2),size(A,2),size(B,2),size(A,2)) # writing into original C
C4x = reshape(Cx, size(B,2),size(A,2))
# @fastmath @inbounds @tturbo for m = 1:size(B,2)
@tturbo for m = 1:size(B,2)
for r = 1:size(A,2)
for mm = 1:size(B,2)
for rr = 1:size(A,2)
for n = 1:size(B,1)
temp = A[n,r] * B[n,m]
C4[m,r,mm,rr] = C4[m,r,mm,rr] + temp * A[n,rr] * B[n,mm]
C4x[m,r] = C4x[m,r] + temp*x[n]
end
end
end
end
end
return Cx
end
# swapped arguments
function normalEquationsTullio!(C,Cx,x,A,B)
C4 = reshape(C, size(B,2),size(A,2),size(B,2),size(A,2)) # writing into original C
Cx4 = reshape(Cx, size(B,2),size(A,2))
@tullio C4[j,i,l,k] = A[n,i] * B[n,j] * A[n,k] * B[n,l]
@tullio Cx4[j,i] = A[n,i] * B[n,j] * x[n]
return Cx
end
function normalEquationsMatlab(CC,Cx,A,B,x)
batchSize = 10000;
for n = 1:batchSize:size(A,1)
idx = min(n+batchSize-1,size(A,1));
temp = repeat(A[n:idx,:],1,size(B,2)).*kron(B[n:idx,:], ones(1, size(A,2)));
CC .= CC.+temp'*temp;
Cx .= Cx.+temp'*x[n:idx];
end
return Cx
end
function normalEquationsMatlabJulia(CC,Cx,A,B,x)
batchSize = 10000;
stencil = ones(1, size(A,2))
# @fastmath @inbounds @views for n = 1:batchSize:size(A,1)
@views for n = 1:batchSize:size(A,1)
idx = min(n+batchSize-1,size(A,1))
@turbo temp = repeat(A[n:idx,:],1,size(B,2)).*kron(B[n:idx,:],stencil )
@turbo CC .= CC.+temp'*temp
@turbo Cx .= Cx.+temp'*x[n:idx]
end
return Cx
end
N = 10000;
R = 20;
M = 40;
A = rand(N,M);
B = rand(N,R);
C = rand(M*R,M*R);
x = rand(N,);
Cx = rand(M*R,);
A_r = PermutedDimsArray(permutedims(A), (2, 1))
B_r = PermutedDimsArray(permutedims(B), (2, 1))
Cp = copy(C)
Cxp = copy(Cx)
normal!(Cp,Cxp,x,A,B_r);
CNormal = copy(Cp)
CxNormal = copy(Cxp)
Cp = copy(C)
Cxp = copy(Cx)
normalTurbo!(Cp,Cxp,x,A,B);
@assert Cp ≈ CNormal
@assert Cxp ≈ CxNormal
Cp = copy(C)
Cxp = copy(Cx)
normalTturbo!(Cp,Cxp,x,A,B);
@assert Cp ≈ CNormal
@assert Cxp ≈ CxNormal
Cp = copy(C)
Cxp = copy(Cx)
normalEquations!(Cp,Cxp,x,B_r,A);
CNormalEquations = copy(Cp)
println(norm(CNormalEquations-CNormal))
CxNormalEquations = copy(Cxp)
println(norm(CxNormalEquations-CxNormal))
Cp = copy(C)
Cxp = copy(Cx)
normalEquationsP!(Cp,Cxp,x,B,A);
@assert Cp ≈ CNormalEquations
@assert Cxp ≈ CxNormalEquations
Cp = copy(C)
Cxp = copy(Cx)
normalEquationsTullio!(Cp,Cxp,x,B,A);
CNormalEquationsTullio = copy(Cp)
println(norm(CNormalEquationsTullio-CNormal))
println(norm(CNormalEquationsTullio-CNormalEquations))
CxNormalEquationsTullio = copy(Cxp)
println(norm(CxNormalEquationsTullio-CxNormal))
println(norm(CxNormalEquationsTullio-CxNormalEquations))
Cp = copy(C)
Cxp = copy(Cx)
normalEquationsMatlab(Cp,Cxp,A,B,x);
@assert Cp ≈ CNormalEquations
@assert Cxp ≈ CxNormalEquations
Cp = copy(C)
Cxp = copy(Cx)
normalEquationsMatlabJulia(Cp,Cxp,A,B,x);
@assert Cp ≈ CNormalEquations
@assert Cxp ≈ CxNormalEquations
@btime normal!(copy($C),copy($Cx),$x,$A,$B_r);
@btime normalTurbo!(copy($C),copy($Cx),$x,$A,$B);
@btime normalTturbo!(copy($C),copy($Cx),$x,$A,$B);
@btime normalEquations!(copy($C),copy($Cx),$x,$B_r,$A);
@btime normalEquationsP!(copy($C),copy($Cx),$x,$B_r,$A);
@btime normalEquationsTullio!(copy($C),copy($Cx),$x,$B_r,$A);
@btime normalEquationsMatlab(copy($C),copy($Cx),$A,$B,$x);
@btime normalEquationsMatlabJulia(copy($C),copy($Cx),$A,$B,$x);
resulting in
3.404 s (20004 allocations: 5.81 MiB)
1.184 s (20004 allocations: 5.81 MiB)
1.179 s (20004 allocations: 5.81 MiB)
385.193 ms (8 allocations: 4.90 MiB)
479.380 ms (8 allocations: 4.90 MiB)
99.472 ms (156 allocations: 4.90 MiB)
182.132 ms (19 allocations: 197.54 MiB)
209.968 ms (13 allocations: 192.88 MiB)