Does eigen() get stuck processing the results?

I wrote the code below (originally in MATLAB, then ported to Julia). Unfortunately, it runs at pretty much the same speed as the MATLAB original.

using LinearAlgebra
using StaticArrays
using DelimitedFiles
using BenchmarkTools
using Profile
using Plots


# --- Bond transform matrix
function BondMatrix2D(d)
    b = [cosd(d)     cosd(90 - d);
         cosd(90 + d) cosd(d)     ];
    M1 = b.^2;
    M2 = 2 * b[:, 2] .* b[:, 1];
    M4 = b[1,1]*b[2,2] + b[1,2]*b[2,1]
    M = [M1    M2;
        -M2'/2 M4]
    return M
end

# --- Decompose the C matrix, as this is required repeatedly in AFmatrix and ds3ds1
function CMatrixDecomp(C)
    DD = @views (C[2,3]^2-C[2,2]*C[3,3]);
    CzziCzx = (@views [-(C[1,3]*C[2,2]-C[1,2]*C[2,3])/DD 1;
                        (C[1,3]*C[2,3]-C[1,2]*C[3,3])/DD 0]);
    CxzCzzi = (@views [-(C[1,3]*C[2,2]-C[1,2]*C[2,3])/DD (C[1,3]*C[2,3]-C[1,2]*C[3,3])/DD;
                       1                              0]);

    CxxmCxzCzziCzx = (@views [(C[1,3]^2*C[2,2]-2*C[1,2]*C[1,3]*C[2,3]+C[1,1]*C[2,3]^2+C[1,2]^2*C[3,3]-C[1,1]*C[2,2]*C[3,3])/DD 0;
                              0                    0]);
    Czzi = @views inv([C[3,3] C[2,3];
                       C[2,3] C[2,2]]);

    return CzziCzx,CxzCzzi,CxxmCxzCzziCzx,Czzi
end

# --- Creation of A (system) matrix
function AFmatrix(C,s1,rho,f,h,CzziCzx,CxzCzzi,CxxmCxzCzziCzx,Czzi)
    A1 = -s1 * CzziCzx;
    A2 = Czzi;
    A3 = rho*I - s1^2*CxxmCxzCzziCzx;
    A4 = -s1 * CxzCzzi;
    A = [A1 A2;
         A3 A4];
    F = @views [ CzziCzx*h[:,1]+h[:,2] ; f + s1 * CxxmCxzCzziCzx * h[:,1] ];
    return A,F
end

# --- Computation of d(s3)/d(s1) derivative
function ds3ds1(C,D0,s1,N1,N2,CzziCzx,CxzCzzi,CxxmCxzCzziCzx,Czzi)
    # --- The differentiated system matrix A
    dAds = [-CzziCzx               zeros(2,2);
            -2*s1*CxxmCxzCzziCzx    -CxzCzzi];
    # ---
    ds3 = @views D0 \ (dAds * dropdims(D0[:,[N1 N2]], dims = (findall(size(D0[:,[N1 N2]]) .== 1)...,))); # Drop singleton dimensions, otherwise we get a [4x1x2] matrix
    ds31 = @views ds3[N1,1];
    ds32 = @views ds3[N2,2];
    return ds31,ds32
end

# --- pairwise minimum
function pairwisemin(s3,l3)
    mins,idx = findmin(abs2.( repeat( l3 , 1, 2  ) - repeat(transpose(s3),2,1) ) , dims=1 );
    idx = [idx[1][1] idx[2][1]];
    return idx;
end


function AssignEigenvaluesEigenvectors(C0,s1,rho0,f,h,s30e,s30,CzziCzx,CxzCzzi,CxxmCxzCzziCzx,Czzi)
    A0,F0   = AFmatrix(C0,s1,rho0,f,h,CzziCzx,CxzCzzi,CxxmCxzCzziCzx,Czzi);
    L0,D0   = eigen(A0);
    posneg0 = sign.(imag( L0./s1 ));
    negidx  = @views pairwisemin( s30e[1:2], L0[ posneg0.==+1 ] );
    posidx  = @views pairwisemin( s30e[3:4], L0[ posneg0.==-1 ] );
    posneg0[posneg0.==-1]=2 .+ vec(posidx);
    posneg0[posneg0.==+1]=     vec(negidx);
    posneg0I= round.(Int, posneg0);
    s30[ posneg0I] = (L0);
    D0[:,posneg0I] = (D0);
    return s30,D0,F0
end





function CdH()

theta = 0;
M = BondMatrix2D(22.5)
C0 = [64 18 -4;
      18 46 -2;
      -4 -2 7 ]*3.1e8;
C0 = M * C0 * M';
rho0 = 2000;
C1 = 4.5*C0;
rho1 = 1.5*rho0;
# --- Decompose the matrix
CzziCzx0,CxzCzzi0,CxxmCxzCzziCzx0,Czzi0 = CMatrixDecomp(C0);
CzziCzx1,CxzCzzi1,CxxmCxzCzziCzx1,Czzi1 = CMatrixDecomp(C1);

# -------------------- Setup source details
f = [1;0]
h = [0 0; 0 0];
# -------------------- Setup receiver details
h0 = 200;
h1 = -300;
x  = 10;
N0 =  [3 3 4 4]; # P/ P/ S/ S/
N1 =  [1 2 1 2]; # P\ S\ P\ S\
# -------------------- Setup temporal vector
ts = collect(0:2e-5:0.8);
solution1 = ts.*0;
# -------------------- Setup initial iteration details
A0,F0 = AFmatrix(C0,0,rho0,f,h,CzziCzx0,CxzCzzi0,CxxmCxzCzziCzx0,Czzi0)
L0,D0 = eigen(A0);
L0init  = [-sort( -L0[ L0.<0 ] ); sort( L0[ L0.>0 ])]; # >> Sort as [-a1, -a2, a1, a2], with a1<a2.

A1,F1 = AFmatrix(C1,0,rho1,f,h,CzziCzx1,CxzCzzi1,CxxmCxzCzziCzx1,Czzi1)
L1,D1 = eigen(A1);
L1init  = [-sort( -L1[ L1.<0 ] ); sort( L1[ L1.>0 ])]; # >> Sort as [-a1, -a2, a1, a2], with a1<a2.

# --- Julia-specific initialization: Fp must be defined here to survive the loop scopes below
Fp = 1+im;

# -------------------- Start iterating
for N = 1:4
    println(N)
    s1new= complex(0);
    s1   = complex(0);
    s30c = complex(L0init);
    s31c = complex(L1init);
    s30  = complex(L0init);
    s31  = complex(L1init);
    for ii=1:length(ts)
        t=ts[ii];
        if t< ( L0init[N0[N]]*h0+L0init[N1[N]]*h1 ) # = where s_1 = 0.
            continue;
        end
        s30h = complex(s30c);
        s31h = complex(s31c);
        s30c = complex(s30);
        s31c = complex(s31);
        s30e = complex(2*s30c - s30h);    # Expected s3 (eigen)values (medium 1)
        s31e = complex(2*s31c - s31h);    # Expected s3 (eigen)values (medium 2)
        jj=0;
        while (abs(s1*x+s30[N0[N]]*h0+s30[N1[N]]*h1-t )>1e-10 || jj==0) && (jj<500)
            s1              = real(s1new) + im*(abs(imag(s1new)));
            # --- Assign eigenvalues and eigenvectors to medium 1
            s30,D0,F0       = AssignEigenvaluesEigenvectors(C0,s1+im*eps(),rho0,f,h,s30e,s30e,CzziCzx0,CxzCzzi0,CxxmCxzCzziCzx0,Czzi0);
            # --- Newton-Raphson iteration
            ds3ds11,ds3ds12 = ds3ds1(C0,D0,s1,N0[N],N1[N],CzziCzx0,CxzCzzi0,CxxmCxzCzziCzx0,Czzi0);
            Fp              = ( x + ds3ds11 * h0 + ds3ds12 * h1 );
            s1new           = s1 - ( s1*x + s30[N0[N]]*h0 + s30[N1[N]]*h1-t ) / Fp;
            jj+=1;
        end
        s31,D1,F1 = AssignEigenvaluesEigenvectors(C1,s1,rho1,f,h,s31e,s31e,CzziCzx1,CxzCzzi1,CxxmCxzCzziCzx1,Czzi1);
        # --- Scattering matrix
        Q   = D1 \ D0;
        R   = @views Q[1:2,1:2] \ Q[1:2,3:4];
        R   = R[N1[N],N0[N]-2]

        DF = D0 \ F0;
        DF = DF[N0[N]];
        BJN = D0[:,N1[N]] * DF * R
        # BJN = D0[:,N1[N]] .* ( transpose(D0I[N0[N],:]) * F0 ) * R ;
        dsdt = 1/Fp;

        sol = 1/pi * imag( BJN * dsdt );
        solution1[ii] += cos(theta)*sol[1] - sin(theta)*sol[2];
    end
    display(plot(ts,solution1));
end

return solution1

end

CdH(); # >> Actually running the program.

The program supplied here takes about 35 seconds to run on my MacBook, where I use JuliaPro with the long-term-support version (v1.0.5-2).

When I check the program with the profiler, the main bottleneck seems to be the L0,D0 = eigen(A0); operation, which takes about 25% of the run-time. Here, A0 is always a 4x4 complex-valued matrix. Unfortunately, the StaticArrays package doesn't handle this case (or at least, it gives me errors when I use such a matrix with the eigen function), so I don't know whether I could make this call more efficient by somehow telling Julia what format to expect.

However, I wondered if there were other ways of speeding up my code. So I went a few levels further down in the profiler (can someone please confirm that I'm reading this correctly?), and I see that the most time while computing the eigenvalues/eigenvectors is spent in

getindex(t::Tuple, r::AbstractArray{<:Any,1}) = ([t[ri] for ri in r]...,)

which lives in tuple.jl. This leads to my question: is the Julia call to eigen somehow getting stuck in collecting the results from the call to BLAS into a tuple, rather than spending its time in the BLAS function itself? If that is the case, does someone have a suggestion for how I can speed up my code?
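For reference, this is how I would time the eigen call in isolation (a minimal sketch using BenchmarkTools; the random matrix is just a stand-in for my actual A0):

using LinearAlgebra, BenchmarkTools
A0 = rand(ComplexF64, 4, 4)   # stand-in for the 4x4 complex system matrix
@btime eigen($A0);            # times only the (already compiled) eigendecomposition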

Many thanks in advance!

Can you try the latest stable version (1.5)?

-viral


Since you have already narrowed down the problem, it would make sense to post a minimal working example of your eigenvalue problem. That makes it easier to reproduce the behaviour and provide help.

Okay, this is a simpler version that exhibits the same run-time properties.

using Profile
using LinearAlgebra
function test_program()
    for i = 1:100
        A0 = rand(4,4) * (1+im)*i/1000
        L0,D0   = eigen(A0);
    end
end

Profile.clear()
@profile test_program()
Juno.profiler() # displays the flame graph in the Juno/JuliaPro IDE

(Run it a few times, to exclude compilation time from the measurement.)
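Concretely, a warm-up pattern like this (just a sketch of what I mean):

test_program()           # first call triggers compilation
Profile.clear()          # discard any samples collected so far
@profile test_program()  # profile only the already-compiled code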


If the bottleneck is eigen, then it is almost certain that MATLAB and Julia run the same code: LAPACK + BLAS. No wonder, then, that they run at the same speed.


Profiling confirms that the LAPACK function geevx! is called at some point. The same holds true if you use StaticArrays, which is why there is no speed-up in that case either. So unless you have a specific algorithm that outperforms LAPACK for complex 4x4 matrices, Julia and MATLAB will perform similarly.
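If you want to check where the time goes yourself, you can bypass eigen's Julia-side wrapper and call the LAPACK routine directly (a rough sketch, assuming I am reading the stdlib wrapper correctly; the 'B'/'N'/'V'/'N' flags mirror eigen's defaults of balancing plus right eigenvectors):

using LinearAlgebra
A0 = rand(ComplexF64, 4, 4)
# geevx! overwrites its input, so pass a copy; for complex input the return
# tuple is (A, w, VL, VR, ilo, ihi, scale, abnrm, rconde, rcondv)
ret = LAPACK.geevx!('B', 'N', 'V', 'N', copy(A0))
w, VR = ret[2], ret[4]   # eigenvalues and right eigenvectors, unsorted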

Thanks for all your responses. I totally understand that Julia and MATLAB use the same underlying Lapack/BLAS routines. But my question has to do with the fact that the profiler shows that the majority of the time is not spent in Lapack/BLAS. I have screenshotted and annotated the profiling result for the test_program I posted above:

My understanding is that the horizontal extent of a block indicates the time spent in a particular function, and calls to sub-programs are layered vertically. Right?

What you can see is that there are some calls to lapack.jl, from which Lapack/BLAS is called. But you can also see that the longest time is spent in tuple.jl (and in calls to array.jl). To me, this suggests that my simple code spends a lot of time doing work that is NOT related to computing eigenvalues/eigenvectors, but to handling the inputs and outputs of that computation… Right? What is it doing there? It looks like it spends more time on that overhead than on the actual eigendecompositions.

What I wonder is whether this tuple.jl part can be kept to a minimum when I repeat this step hundreds of times for identically sized matrices of identical element type. I have no hope of beating Lapack/BLAS with my own code, of course, but I would like my program to spend the majority of its time inside lapack.jl – is there a way to achieve that? I hope it's clear what I'm trying to do…
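To illustrate, something along these lines is what I am hoping for (hypothetical; this assumes the overhead comes from copies and allocations around the LAPACK call, not from LAPACK itself):

using LinearAlgebra
A0  = rand(ComplexF64, 4, 4)
buf = similar(A0)            # preallocated scratch matrix, reused every iteration
for i = 1:100
    copyto!(buf, A0)         # eigen! destroys its input, so refresh the buffer
    L0, D0 = eigen!(buf)     # in-place variant avoids eigen's internal copy
end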


I tried your code and a variant, and could not observe the profiling issue you describe:

function test_program2()
    A0 = rand(4,4) * (1+im) * 12/1000
    for i = 1:1000
        L0,D0 = eigen(A0);
    end
end

See the following output generated with StatProfilerHTML.

test_program
test_program2

I also tried it with much larger matrices (rand(50, 50)), and the time spent in LAPACK was then 98%.
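In case you want to reproduce that, the larger variant was essentially this (a sketch; test_program3 is just a name I made up here):

function test_program3()
    A0 = rand(50, 50) * (1+im) * 12/1000
    for i = 1:1000
        L0, D0 = eigen(A0)
    end
end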

Cheers,

Felix

Is this with static arrays?

Nope, just the code as given, with JuliaPro v1.0.5-2 (long-term support).

Since you probably have the profiling output, could you provide the trace that leads to the “expensive” call in tuple.jl?
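For example, something like this should give a textual call tree (a sketch; maxdepth is just a guess at a depth that still shows the tuple.jl frame):

using Profile
Profile.print(format=:tree, maxdepth=20)   # print the collected samples as a call tree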