Outperformed by Matlab

Hi, I forgot to mention that the system is regularized, so that X'X + M = X'y; that is why I am trying to compute these two quantities (I will edit the question).
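For readers following along, a minimal sketch of such a ridge-style regularized solve (the names, sizes, and the diagonal regularizer `M` here are illustrative assumptions, not from the thread):

```julia
using LinearAlgebra

# Hypothetical illustration of a regularized normal-equations solve:
# instead of X'X w = X'y, solve (X'X + M) w = X'y.
X = rand(100, 5)
y = rand(100)
M = 0.1 * I(5)               # assumed diagonal regularization term
w = (X' * X + M) \ (X' * y)  # solve the regularized system
```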

Hello, I am using the @views macro at the beginning of each (outermost) loop when slicing; I have read that it should be equivalent?
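For reference, a tiny self-contained check of the equivalence in question: `@views` in front of a loop rewrites the slices in its body into views, the same as writing `view(...)` by hand (function names here are made up for illustration):

```julia
# The two loops should be equivalent: @views turns A[n, :] into a view.
rowsum_views(A) = begin
    s = 0.0
    @views for n in 1:size(A, 1)
        s += sum(A[n, :])        # A[n, :] becomes a view, no copy
    end
    s
end

rowsum_explicit(A) = begin
    s = 0.0
    for n in 1:size(A, 1)
        s += sum(view(A, n, :))  # the hand-written equivalent
    end
    s
end

A = rand(4, 4)
rowsum_views(A) ≈ rowsum_explicit(A) ≈ sum(A)
```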

Hello, try calling it with the arguments A and B swapped (as in the example); it works for me with bounds checks enabled.

Sorry, my copy & paste mistake when adapting the MWE. Should I somehow edit/delete my previous comment?

No worries, just edit it perhaps? Apologies on my side for the lazy indexing in the function


Edit: corrected after @lmiq's observations…

Here is a modified MWE incorporating the proposals from the thread and @aplavin’s nice trick from Y.A.t.Q : Yet Another @threads Question - #13 by aplavin

using LinearAlgebra
using LoopVectorization
using Tullio
using BenchmarkTools


function normal!(C,Cx,x,A,B)
    temp = Vector{Float64}(undef,size(A,2)*size(B,2))
    # @fastmath @views @inbounds for n = 1:size(A,1)
    @views for n = 1:size(A,1)
        kron!(temp,A[n,:],B[n,:])
        C .= C .+ temp .* temp'
        Cx .= Cx .+ temp .* x[n]
    end
    return Cx
end


function normalTurbo!(C,Cx,x,A,B)
    temp = Vector{Float64}(undef,size(A,2)*size(B,2))
    # @fastmath @views @inbounds for n = 1:size(A,1)
    @views for n = 1:size(A,1)
        kron!(temp,A[n,:],B[n,:])
        @turbo C .= C .+ temp .* temp'
        @turbo Cx .= Cx .+ temp .* x[n]
    end
    return Cx
end


function normalTturbo!(C,Cx,x,A,B)
    temp = Vector{Float64}(undef,size(A,2)*size(B,2))
    # @fastmath @views @inbounds for n = 1:size(A,1)
    @views for n = 1:size(A,1)
        kron!(temp,A[n,:],B[n,:])
        @tturbo C .= C .+ temp .* temp'
        @tturbo Cx .= Cx .+ temp .* x[n]
    end
    return Cx
end


# swapped arguments
function normalEquations!(C,Cx,x,A,B)
    temp = Matrix{Float64}(undef,size(B,2),size(A,2))
    C4 = reshape(C, size(B,2),size(A,2),size(B,2),size(A,2))  # writing into original C
    C4x = reshape(Cx, size(B,2),size(A,2))
    # @fastmath @inbounds @turbo for m = 1:size(B,2)
    @turbo for m = 1:size(B,2)
        for r = 1:size(A,2)
            for mm = 1:size(B,2)
                for rr = 1:size(A,2)
                    for n = 1:size(B,1)
                        temp =  A[n,r] * B[n,m]
                        C4[m,r,mm,rr] = C4[m,r,mm,rr] + temp * A[n,rr] * B[n,mm]
                        C4x[m,r] = C4x[m,r] + temp*x[n]
                    end
                end
            end
        end
    end
    return Cx
end

# swapped arguments
function normalEquationsP!(C,Cx,x,A,B)
    temp = Matrix{Float64}(undef,size(B,2),size(A,2))
    C4 = reshape(C, size(B,2),size(A,2),size(B,2),size(A,2))  # writing into original C
    C4x = reshape(Cx, size(B,2),size(A,2))
    # @fastmath @inbounds @tturbo for m = 1:size(B,2)
    @tturbo for m = 1:size(B,2)
        for r = 1:size(A,2)
            for mm = 1:size(B,2)
                for rr = 1:size(A,2)
                    for n = 1:size(B,1)
                        temp =  A[n,r] * B[n,m]
                        C4[m,r,mm,rr] = C4[m,r,mm,rr] + temp * A[n,rr] * B[n,mm]
                        C4x[m,r] = C4x[m,r] + temp*x[n]
                    end
                end
            end
        end
    end
    return Cx
end


# swapped arguments
function normalEquationsTullio!(C,Cx,x,A,B)
    C4 = reshape(C, size(B,2),size(A,2),size(B,2),size(A,2))  # writing into original C
    Cx4 = reshape(Cx, size(B,2),size(A,2))
    @tullio C4[j,i,l,k] = A[n,i] * B[n,j] * A[n,k] * B[n,l]
    @tullio Cx4[j,i] = A[n,i] * B[n,j] * x[n]
    return Cx
end


function  normalEquationsMatlab(CC,Cx,A,B,x)
    batchSize = 10000;
    for n = 1:batchSize:size(A,1)
        idx = min(n+batchSize-1,size(A,1));
        temp = repeat(A[n:idx,:],1,size(B,2)).*kron(B[n:idx,:], ones(1, size(A,2)));
        CC .= CC.+temp'*temp;
        Cx .= Cx.+temp'*x[n:idx];
    end
    return Cx
end


function  normalEquationsMatlabJulia(CC,Cx,A,B,x)
    batchSize = 10000;
    stencil = ones(1, size(A,2))
    # @fastmath @inbounds @views for n = 1:batchSize:size(A,1)
    @views for n = 1:batchSize:size(A,1)
        idx = min(n+batchSize-1,size(A,1))
        @turbo temp = repeat(A[n:idx,:],1,size(B,2)).*kron(B[n:idx,:],stencil )
        @turbo CC .= CC.+temp'*temp
        @turbo Cx .= Cx.+temp'*x[n:idx]
    end
    return Cx
end


N = 10000;
R = 20;
M = 40;


A = rand(N,M);
B = rand(N,R);
C = rand(M*R,M*R);
x = rand(N,);
Cx = rand(M*R,);

A_r = PermutedDimsArray(permutedims(A), (2, 1))
B_r = PermutedDimsArray(permutedims(B), (2, 1))

Cp = copy(C)
Cxp = copy(Cx)
normal!(Cp,Cxp,x,A,B_r);
CNormal = copy(Cp)
CxNormal = copy(Cxp)

Cp = copy(C)
Cxp = copy(Cx)
normalTurbo!(Cp,Cxp,x,A,B);
@assert Cp ≈ CNormal
@assert Cxp ≈ CxNormal

Cp = copy(C)
Cxp = copy(Cx)
normalTturbo!(Cp,Cxp,x,A,B);
@assert Cp ≈ CNormal
@assert Cxp ≈ CxNormal

Cp = copy(C)
Cxp = copy(Cx)
normalEquations!(Cp,Cxp,x,B_r,A);
CNormalEquations = copy(Cp)
println(norm(CNormalEquations-CNormal))
CxNormalEquations = copy(Cxp)
println(norm(CxNormalEquations-CxNormal))

Cp = copy(C)
Cxp = copy(Cx)
normalEquationsP!(Cp,Cxp,x,B,A);
@assert Cp ≈ CNormalEquations
@assert Cxp ≈ CxNormalEquations

Cp = copy(C)
Cxp = copy(Cx)
normalEquationsTullio!(Cp,Cxp,x,B,A);
CNormalEquationsTullio = copy(Cp)
println(norm(CNormalEquationsTullio-CNormal))
println(norm(CNormalEquationsTullio-CNormalEquations))
CxNormalEquationsTullio = copy(Cxp)
println(norm(CxNormalEquationsTullio-CxNormal))
println(norm(CxNormalEquationsTullio-CxNormalEquations))

Cp = copy(C)
Cxp = copy(Cx)
normalEquationsMatlab(Cp,Cxp,A,B,x);
@assert Cp ≈ CNormalEquations
@assert Cxp ≈ CxNormalEquations

Cp = copy(C)
Cxp = copy(Cx)
normalEquationsMatlabJulia(Cp,Cxp,A,B,x);
@assert Cp ≈ CNormalEquations
@assert Cxp ≈ CxNormalEquations


@btime normal!(copy($C),copy($Cx),$x,$A,$B_r);

@btime normalTurbo!(copy($C),copy($Cx),$x,$A,$B);

@btime normalTturbo!(copy($C),copy($Cx),$x,$A,$B);

@btime normalEquations!(copy($C),copy($Cx),$x,$B_r,$A);

@btime normalEquationsP!(copy($C),copy($Cx),$x,$B_r,$A);

@btime normalEquationsTullio!(copy($C),copy($Cx),$x,$B_r,$A);

@btime normalEquationsMatlab(copy($C),copy($Cx),$A,$B,$x);

@btime normalEquationsMatlabJulia(copy($C),copy($Cx),$A,$B,$x);

resulting in

  3.404 s (20004 allocations: 5.81 MiB)
  1.184 s (20004 allocations: 5.81 MiB)
  1.179 s (20004 allocations: 5.81 MiB)
  385.193 ms (8 allocations: 4.90 MiB)
  479.380 ms (8 allocations: 4.90 MiB)
  99.472 ms (156 allocations: 4.90 MiB)
  182.132 ms (19 allocations: 197.54 MiB)
  209.968 ms (13 allocations: 192.88 MiB)

In normalEquationsP! I think you can get rid of the initial allocation of temp; you later assign it a scalar value, so there should be no need to allocate the matrix at the start. It might not make a noticeable difference, but it was the thing I found when having a quick look.
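A sketch of this cleanup (the function name is made up, and `@tturbo` plus the `Cx` accumulation are omitted so it runs stand-alone without LoopVectorization): since `temp` is rebound to a scalar inside the loop nest, the `Matrix{Float64}(undef, ...)` line before the loop is dead code and can simply be deleted.

```julia
using LinearAlgebra

# Serial sketch of the C accumulation from normalEquationsP!, with the
# dead `temp = Matrix{Float64}(undef, size(B,2), size(A,2))` line removed.
function accumulateC!(C, A, B)
    C4 = reshape(C, size(B,2), size(A,2), size(B,2), size(A,2))
    for m = 1:size(B,2), r = 1:size(A,2), mm = 1:size(B,2), rr = 1:size(A,2), n = 1:size(B,1)
        temp = A[n,r] * B[n,m]   # scalar; no preallocation needed
        C4[m,r,mm,rr] += temp * A[n,rr] * B[n,mm]
    end
    return C
end

A = rand(5, 3); B = rand(5, 2)
C = zeros(6, 6)
accumulateC!(C, A, B)   # accumulates the kron(A[n,:], B[n,:]) outer products
```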


True, it is useless! Thanks :)

Interesting, if I run your code I get no improvement.

5.107 s (20001 allocations: 943.88 KiB)
1.660 s (20001 allocations: 943.88 KiB)
1.648 s (20001 allocations: 943.88 KiB)
526.596 ms (4 allocations: 224 bytes)
242.233 ms (4 allocations: 224 bytes)
273.597 ms (104 allocations: 5.67 KiB)
152.689 ms (19 allocations: 197.54 MiB)
173.030 ms (10 allocations: 188.00 MiB)

Could this be dependent on the processor or Julia version?

Did you start Julia with multiple threads?
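(For anyone reproducing the timings: threads must be requested at startup, e.g. `julia --threads 4` or the `JULIA_NUM_THREADS` environment variable; `@tturbo` and Tullio only parallelize when the session has more than one thread. A quick check:)

```julia
# Check how many threads this Julia session was started with;
# @tturbo / Tullio run serially when this is 1.
println(Threads.nthreads())
```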


Seeing the different allocation counts, I’d assume some differences. I’m on Julia 1.6.3 with 6 Intel cores.

Edit: it could be me who has the problem: my system is performing dynamic frequency scaling.

julia> Pkg.status("Tullio")
  [bc48ee85] Tullio v0.3.2
julia> Pkg.status("LoopVectorization")
  [bdcacae8] LoopVectorization v0.12.98

Yes! (I am using VSCodium with NumThreads specified in the settings, 4 in my case.) Threads.nthreads() returns 4.

Julia 1.7.0-dev, Intel with 4 physical cores; I will try with 1.6.3. What I still find strange is that in my case the fastest code is the one copied from Matlab.

No big change, except for Tullio, which allocates less on 1.6.3:

  1.586 s (20001 allocations: 943.88 KiB)
  1.495 s (30001 allocations: 1.07 MiB)
  510.505 ms (5 allocations: 6.59 KiB)
  228.873 ms (6 allocations: 6.61 KiB)
  217.190 ms (104 allocations: 5.67 KiB)
  152.197 ms (19 allocations: 197.54 MiB)
  166.631 ms (10 allocations: 188.00 MiB)

Did you re-enable @fastmath? I ask because I recently had the impression it could pessimize code.

What is going on here? I am getting this:

julia> @btime normalEquationsMatlabJulia($C,$Cx,$x,$A,$B); 
  3.808 ms (13 allocations: 9.17 MiB)

I’m copying the complete Julia session here, because I think I’m going crazy:

% julia
               _
   _       _ _(_)_     |  Documentation: https://docs.julialang.org
  (_)     | (_) (_)    |
   _ _   _| |_  __ _   |  Type "?" for help, "]?" for Pkg help.
  | | | | | | |/ _` |  |
  | | |_| | | | (_| |  |  Version 1.6.3 (2021-09-23)
 _/ |\__'_|_|_|\__'_|  |  Official https://julialang.org/ release
|__/                   |

julia> using LoopVectorization

julia> function  normalEquationsMatlabJulia(CC,Cx,A,B,x)
           batchSize = 10000;
           stencil = ones(1, size(A,2))
           # @fastmath @inbounds @views for n = 1:batchSize:size(A,1)
           @views for n = 1:batchSize:size(A,1)
               idx = min(n+batchSize-1,size(A,1))
               @turbo temp = repeat(A[n:idx,:],1,size(B,2)).*kron(B[n:idx,:],stencil )
               @turbo CC .= CC.+temp'*temp
               @turbo Cx .= Cx.+temp'*x[n:idx]
           end
           return Cx 
       end
normalEquationsMatlabJulia (generic function with 1 method)

julia> N = 10000;

julia> R = 20;

julia> M = 40;

julia> A = rand(N,M);

julia> B = rand(N,R);

julia> C = rand(M*R,M*R);

julia> x = rand(N,);

julia> Cx = rand(M*R,);  

julia> @btime normalEquationsMatlabJulia($C,$Cx,$x,$A,$B); 
  3.808 ms (13 allocations: 9.17 MiB)


Is the result correct? @goerch had some @assert checks to verify it (they pass when I run it). Also, how does the @btime of that function compare to the others on your machine?

Well, the assertions are not correct, because they just compare the same array with itself:

normal!(C,Cx,x,A,B_r);
CxNormal = Cx  # CxNormal is just the same array as Cx

normalTurbo!(C,Cx,x,A,B);
@assert Cx ≈ CxNormal    # this will always be true

it should be CxNormal = copy(Cx).

Also, since the functions modify Cx, each version is not operating on the same values. The original Cx should be copied from call to call (I am not sure how that affects performance, but it may, and it is necessary to assert the values).
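A minimal sketch of this pitfall (the arrays here are illustrative): assigning an array to a new name does not copy it, so after an in-place update the `@assert` just compares the array with itself:

```julia
# Sketch of the aliasing pitfall: `=` binds a name, it does not copy.
Cx = rand(5)
CxNormal = Cx               # NOT a copy: both names refer to the same array
Cx .+= 1.0                  # mutating Cx also "changes" CxNormal ...
@assert Cx ≈ CxNormal       # ... so this check is vacuously true

CxSnapshot = copy(Cx)       # an independent snapshot
Cx .+= 1.0
@assert !(Cx ≈ CxSnapshot)  # now the comparison actually detects the change
```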


Sorry for the mistake. I tried to correct my post above. As far as I understand, we have three different results:

  • normal!, normalTurbo! and normalTturbo! computing the same
  3.404 s (20004 allocations: 5.81 MiB)
  1.184 s (20004 allocations: 5.81 MiB)
  1.179 s (20004 allocations: 5.81 MiB)
  • normalEquations!, normalEquationsP!, normalEquationsMatlab and normalEquationsMatlabJulia computing the same
  385.193 ms (8 allocations: 4.90 MiB)
  479.380 ms (8 allocations: 4.90 MiB)
  182.132 ms (19 allocations: 197.54 MiB)
  209.968 ms (13 allocations: 192.88 MiB)
  • normalEquationsTullio
 99.472 ms (156 allocations: 4.90 MiB)

Sorry, I didn’t notice that before I replied. FWIW, I think that annotations far from their effect make such misinterpretations likely.