Speeding up matrix exponential and matrix multiplication

Profiling the allocations showed that the exponential! call was a significant source, so I experimented with alloc_mem and with the various algorithms listed in the documentation. It turned out that the fastest algorithm for this application is ExpMethodGeneric(), which surprised me. Here is the latest version of the function:

using LinearAlgebra
using ExponentialUtilities
using ExponentialUtilities: alloc_mem

function compute3()
    N1 = 1; N = 2*N1 #size of Hamiltonian matrix (H_) is 2*N by 2*N

    #Hamiltonian matrix H
    h0 = [-1 0; 0 1]
    hp = [-1 0; 0 1]
    hm = [-1 0; 0 1]
    H0 = kron(Diagonal(ones(N1)), h0) + kron(diagm(1=>ones(N1-1)), hp) + kron(diagm(-1=>ones(N1-1)), hm)
    H = [H0                           zeros(ComplexF64, size(H0))
         zeros(ComplexF64, size(H0))  H0]
    #@show size(H)
    for mn = 1:N
        H[2*mn-1,2*mn] = H[2*mn,2*mn-1] = 1
    end
    H[(2*N1-1):2*N1, (2*N1+1):2*N1+2] .= [1 0; 0 -1]
    H[(2*N1+1):2*N1+2, (2*N1-1):2*N1] .= [1 0; 0 -1]
    
    #time
    tmin = -80; tmax = 280;
    Nt0 = 1500; tar0 = range(tmin,tmax,Nt0); deltat0 = tar0[2] - tar0[1]

    #Green's function matrix
    Glextt = zeros(ComplexF64, 2*N, 2*N, Nt0)
    #Temporary green's function matrix, only used for the calculation for Glextt
    Glesstt0 = zeros(ComplexF64,2*N,2*N)

    ##-----------------CALCULATION WHICH NEEDS TO BE SPED UP----------------------------
    Uu1armax = zeros(ComplexF64, 2N, 2N, Nt0)
    Siglex0max = zeros(ComplexF64, Nt0, Nt0)
    #method = ExpMethodHigham2005() # 71.087441 seconds (3.40 M allocations: 2.133 GiB, 0.26% gc time)
    #method = ExpMethodNative() # 72.297242 seconds (13.52 M allocations: 4.897 GiB, 0.64% gc time)
    #method = ExpMethodDiagonalization() # 94.820985 seconds (36.26 M allocations: 14.452 GiB, 1.21% gc time, 0.99% compilation time)
    method = ExpMethodGeneric() # 40.761433 seconds (6.78 M allocations: 3.188 GiB, 0.72% gc time)
    #method = ExpMethodHigham2005Base() # 70.791615 seconds (7.90 M allocations: 2.518 GiB, 0.31% gc time)
    cache = alloc_mem(H, method) 
    for gh = Nt0:-1:2
        #println(gh)
        Uu1ar = (@view Uu1armax[:, :, 1:gh]); Uu1ar[:, :, gh] .= I(2N)
        f1 = exp(-0.5 * deltat0)
        for hi = gh-1:-1:1
            f2 = exponential!(-im * deltat0 * H, method, cache)
            Uu1ar[:, :, hi] .= f1 * (@view Uu1ar[:, :, hi+1]) * f2
        end
        Gr = -im*Uu1ar

        Gap = conj(permutedims(-im*Uu1ar, (3, 2, 1)))

        Siglex0 = @view Siglex0max[1:gh, 1:gh] 
        for mn = 1:gh
            Siglex0[mn, :] .= -1 ./ (tar0[mn] .- tar0[1:gh] .+ im*deltat0)
        end
        for pq = 1:N    
            Glesstt0[2*pq-1, 2*pq] = deltat0^2 * tr((@view Gr[2*pq-1, :, :]) * Siglex0 * @view Gap[:, :, 2*pq])
        end
        Glextt[:, :, gh] .= Glesstt0 #G<(t,t)
    end
    ##----------------------------------------------------------------------------------
    return Glextt
end

As shown in the comments after the method = lines above, the time using ExpMethodGeneric is reduced to about 41 seconds, much less than with the other methods. I also verified that the final matrix Glextt, when permuted, is nearly equal to that produced by the OP code, to within about 5e-14. Perhaps @ChrisRackauckas can explain why this method is so much faster than the default ExpMethodHigham2005() method.
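For anyone who wants to compare the methods in isolation rather than inside the full computation, here is a minimal benchmarking sketch (the 4x4 random matrix and the 0.24 factor are stand-ins I chose, not values from the code above):

using BenchmarkTools
using LinearAlgebra
using ExponentialUtilities
using ExponentialUtilities: alloc_mem

A = -im * 0.24 * rand(ComplexF64, 4, 4)   # stand-in for -im * deltat0 * H
for method in (ExpMethodHigham2005(), ExpMethodGeneric())
    cache = alloc_mem(A, method)
    @show typeof(method)
    @btime exponential!(copy($A), $method, $cache)  # copy, since exponential! overwrites its argument
end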

Profiling identified the loops that occupy the most time, and their iterations turned out to be completely independent, so they could be parallelized. Adding threading to these loops reduced the execution time significantly:

using LinearAlgebra
using ExponentialUtilities
using ExponentialUtilities: alloc_mem

function compute4()
    N1 = 1; N = 2*N1 #size of Hamiltonian matrix (H_) is 2*N by 2*N

    #Hamiltonian matrix H
    h0 = [-1 0; 0 1]
    hp = [-1 0; 0 1]
    hm = [-1 0; 0 1]
    H0 = kron(Diagonal(ones(N1)), h0) + kron(diagm(1=>ones(N1-1)), hp) + kron(diagm(-1=>ones(N1-1)), hm)
    H = [H0                           zeros(ComplexF64, size(H0))
         zeros(ComplexF64, size(H0))  H0]
    #@show size(H)
    for mn = 1:N
        H[2*mn-1,2*mn] = H[2*mn,2*mn-1] = 1
    end
    H[(2*N1-1):2*N1, (2*N1+1):2*N1+2] .= [1 0; 0 -1]
    H[(2*N1+1):2*N1+2, (2*N1-1):2*N1] .= [1 0; 0 -1]
    
    #time
    tmin = -80; tmax = 280;
    Nt0 = 1500; tar0 = range(tmin,tmax,Nt0); deltat0 = tar0[2] - tar0[1]

    #Green's function matrix
    Glextt = zeros(ComplexF64, 2*N, 2*N, Nt0)
    #Temporary green's function matrix, only used for the calculation for Glextt
    Glesstt0 = zeros(ComplexF64,2*N,2*N)

    ##-----------------CALCULATION WHICH NEEDS TO BE SPED UP----------------------------
    Uu1armax = zeros(ComplexF64, 2N, 2N, Nt0)
    Siglex0max = zeros(ComplexF64, Nt0, Nt0)
    method = ExpMethodGeneric() 
    cache = alloc_mem(H, method) 
    for gh = Nt0:-1:2
        #println(gh)
        Uu1ar = (@view Uu1armax[:, :, 1:gh]); Uu1ar[:, :, gh] .= I(2N)
        f1 = exp(-0.5 * deltat0)
        for hi = gh-1:-1:1
            f2 = exponential!(-im * deltat0 * H, method, cache)
            Uu1ar[:, :, hi] .= f1 * (@view Uu1ar[:, :, hi+1]) * f2
        end
        Gr = -im*Uu1ar

        Gap = conj(permutedims(-im*Uu1ar, (3, 2, 1)))

        Siglex0 = @view Siglex0max[1:gh, 1:gh] 
        Threads.@threads for mn = 1:gh
            Siglex0[mn, :] .= -1 ./ (tar0[mn] .- tar0[1:gh] .+ im*deltat0)
        end
        Threads.@threads for pq = 1:N    
            Glesstt0[2*pq-1, 2*pq] = deltat0^2 * tr((@view Gr[2*pq-1, :, :]) * Siglex0 * @view Gap[:, :, 2*pq])
        end
        Glextt[:, :, gh] .= Glesstt0 #G<(t,t)
    end
    ##----------------------------------------------------------------------------------
    return Glextt
end
julia> @time Glextt4 = compute4();
 16.489151 seconds (7.03 M allocations: 3.212 GiB, 1.84% gc time, 3.70% compilation time)

julia> norm(Glextt4 - permutedims(Glextt, (2,3,1)))
6.173225770837282e-14
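Incidentally, most of the remaining allocations come from the temporaries created by f1 * Uu1ar[:, :, hi+1] * f2 in the inner loop. A minimal sketch of how one might eliminate them with a preallocated buffer and mul! (the name propagate! and the fixed f2 are my simplifications; in the real loop f2 is recomputed at every step):

using LinearAlgebra

function propagate!(U::Array{ComplexF64,3}, f1::Real, f2::AbstractMatrix, gh::Int)
    tmp = similar(f2)                      # one buffer, reused every step
    for hi = gh-1:-1:1
        mul!(tmp, view(U, :, :, hi+1), f2) # tmp = U[:,:,hi+1] * f2, no allocation
        U[:, :, hi] .= f1 .* tmp           # fused in-place scale-and-store
    end
    return U
end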

I think one can also use threads for the outer for gh = Nt0:-1:2 loop.

I would not do that because the same cache variable would be simultaneously overwritten by all threads. Similarly for the views into Siglex0max, and there may be other similar issues.
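If one did want to thread that outer loop, each task would need its own cache (and thread-local temporaries in place of the shared Glesstt0 and the overlapping views into Siglex0max). A sketch of just the cache part, with stand-in values for H, Nt0, and deltat0:

using LinearAlgebra
using ExponentialUtilities
using ExponentialUtilities: alloc_mem

H = rand(ComplexF64, 4, 4)   # stand-in for the Hamiltonian above
Nt0 = 100; deltat0 = 0.24    # stand-in values
method = ExpMethodGeneric()
# one cache per thread; :static pins iterations to threads, so threadid() is stable
caches = [alloc_mem(H, method) for _ in 1:Threads.nthreads()]
Threads.@threads :static for gh = 2:Nt0
    cache = caches[Threads.threadid()]
    f2 = exponential!(-im * deltat0 * H, method, cache)  # safe: each task mutates only its own cache
    # ... the rest of the per-gh work, on thread-local buffers ...
end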


By the way, option 2 below, which is a modification of your last code, runs faster.

using LinearAlgebra
using ExponentialUtilities
using ExponentialUtilities: alloc_mem

function computethreads()
    N1 = 14; N = 2*N1 #size of Hamiltonian matrix (H_) is 2*N by 2*N

    #Hamiltonian matrix H
    h0 = [-1 0; 0 1]
    hp = [-1 0; 0 1]
    hm = [-1 0; 0 1]
    H0 = kron(Diagonal(ones(N1)), h0) + kron(diagm(1=>ones(N1-1)), hp) + kron(diagm(-1=>ones(N1-1)), hm)
    H = [H0                           zeros(ComplexF64, size(H0))
         zeros(ComplexF64, size(H0))  H0]
    #@show size(H)
    for mn = 1:N
        H[2*mn-1,2*mn] = H[2*mn,2*mn-1] = 1
    end
    H[(2*N1-1):2*N1, (2*N1+1):2*N1+2] .= [1 0; 0 -1]
    H[(2*N1+1):2*N1+2, (2*N1-1):2*N1] .= [1 0; 0 -1]
    
    #time
    tmin = -80; tmax = 280;
    Nt0 = 1500; tar0 = range(tmin,tmax,Nt0); deltat0 = tar0[2] - tar0[1]

    #Green's function matrix
    Glextt = zeros(ComplexF64, 2*N, 2*N, Nt0)
    #Temporary green's function matrix, only used for the calculation for Glextt
    Glesstt0 = zeros(ComplexF64,2*N,2*N)

    ##-----------------CALCULATION WHICH NEEDS TO BE SPED UP----------------------------
    Uu1armax = zeros(ComplexF64, 2N, 2N, Nt0)
    Siglex0max = zeros(ComplexF64, Nt0, Nt0)
    Threads.@threads for mn = 1:Nt0
        Siglex0max[mn, :] .= -1 ./ (tar0[mn] .- tar0 .+ im*deltat0)
    end
    method = ExpMethodGeneric() 
    cache = alloc_mem(H, method) 
    for gh = Nt0:-1:2
        #println(gh)
        Uu1ar = (@view Uu1armax[:, :, 1:gh]); Uu1ar[:, :, gh] .= I(2N)
        f1 = exp(-0.5 * deltat0)
        for hi = gh-1:-1:1
            f2 = exponential!(-im * deltat0 * H, method, cache)
            Uu1ar[:, :, hi] .= f1 * (@view Uu1ar[:, :, hi+1]) * f2
        end

        ##---option 1---
        #Gr = -im*Uu1ar
        #Gap = conj(permutedims(-im*Uu1ar, (3, 2, 1)))

        #Siglex0 = @view Siglex0max[1:gh, 1:gh] 
        
        #t0b = time()
        #Threads.@threads for pq = 1:N    
        #    Glesstt0[2*pq-1, 2*pq] = deltat0^2 * tr((@view Gr[2*pq-1, :, :]) * Siglex0 * @view Gap[:, :, 2*pq])
        #end
        #t1b = time()
        #------
        #---option 2---
        Gr = permutedims(-im*Uu1ar, (3, 1, 2)); #t first index
        Gap = conj(permutedims(-im*Uu1ar, (3, 2, 1))) #t first index
        
        Siglex0 = @view Siglex0max[1:gh, 1:gh] 
        
        t0b = time()
        Threads.@threads for pq = 1:N    
            Glesstt0[2*pq-1, 2*pq] = deltat0^2 * tr( transpose(@view Gr[:, 2*pq-1, :]) * Siglex0 * (@view Gap[:, :, 2*pq]) ) #faster, likely because the t-first views have unit stride (see the note after this code)
        end
        t1b = time()
        #------
        

        if gh>Nt0-20
            println(t1b-t0b)
        end 
        Glextt[:, :, gh] .= Glesstt0 #G<(t,t)
    end
    ##----------------------------------------------------------------------------------
    return Glextt
end
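A plausible reason option 2 is faster: Julia arrays are column-major, so with t as the first index the view handed to the matrix product has unit stride in its first dimension and can be passed straight to BLAS, while the t-last view is strided and falls back to a slower generic multiply. A micro-check of just this layout effect, with hypothetical sizes (N1 = 14 gives 2N = 56):

using BenchmarkTools

gh, n = 1500, 56
A = rand(ComplexF64, gh, n, n)   # t-first layout, as in option 2
B = permutedims(A, (2, 3, 1))    # t-last layout, as in option 1
S = rand(ComplexF64, gh, gh)
Av = @view A[:, 1, :]            # unit first stride: BLAS path
Bv = @view B[1, :, :]            # strided: generic fallback
@btime transpose($Av) * $S
@btime $Bv * $S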

Nice! Out of curiosity, how does your Julia run time now compare to your Python code?

You aren't taking advantage of the fact that H (at least in the example code) is real-symmetric. Why not just precompute F = eigen(Symmetric(real(H))) (or eigen!) and then re-use this at each timestep to compute:

expHt = F.vectors * Diagonal(cis.((-deltat0) .* F.values)) * F.vectors'

?
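A runnable sketch of this, using a random symmetric stand-in for real(H):

using LinearAlgebra

Hr = Symmetric(rand(4, 4))   # stand-in for Symmetric(real(H))
deltat0 = 0.24
F = eigen(Hr)                # one-time eigendecomposition
expHt = F.vectors * Diagonal(cis.((-deltat0) .* F.values)) * F.vectors'
expHt ≈ exp(-im * deltat0 * Matrix(Hr))   # true: same matrix exponential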

PS. On my machine, the built-in exp function is much faster than the ExponentialUtilities.exponential! function for the same matrix with the code above:

julia> @btime exponential!((-im * $t) * $H, $method, $cache); 
  705.917 Ī¼s (11 allocations: 245.77 KiB)

julia> @btime exp((-im * $t) * $H);
  230.667 Ī¼s (17 allocations: 344.67 KiB)

julia> @btime LinearAlgebra.exp!((-im * $t) * $H);
  225.750 Ī¼s (15 allocations: 295.62 KiB)

But exploiting the fact that H is real-symmetric is faster still.


In my actual project I use a time-dependent H, so H depends on the index hi. Given that, computing the eigendecomposition so many times is a lot slower; that's why I didn't bother with it.
Otherwise, fair point.

It's not slower than exponentiating a general complex matrix (which typically involves a Schur factorization).
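For a time-dependent Hermitian H(t) this would look something like the following per step (a sketch; the name step_exp and the random stand-in matrix are mine):

using LinearAlgebra

function step_exp(Ht::AbstractMatrix, deltat0::Real)
    F = eigen(Hermitian(Ht))   # Hermitian eigensolver, not a Schur factorization
    return F.vectors * Diagonal(cis.((-deltat0) .* F.values)) * F.vectors'
end

Ht = let A = rand(ComplexF64, 4, 4); (A + A') / 2 end   # stand-in for H at one timestep
step_exp(Ht, 0.24) ≈ exp(-im * 0.24 * Ht)               # true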

I see. Let me try that then.

This is the comparison for N1=20.
Julia: the matrix exponential (exponential!) takes t1a-t0a~0.95s, and the matrix multiplication takes t1b-t0b~0.4s when multi-threaded (julia --threads 16), and surprisingly t1b-t0b~0.27s without calling Threads.@threads!
Python: the matrix exponential (scipy) takes t1a-t0a~0.65s, and the matrix multiplication takes t1b-t0b~0.65s.
The Julia code:

using LinearAlgebra
using ExponentialUtilities
using ExponentialUtilities: alloc_mem

function computethreads()
    N1 = 20; N = 2*N1 #size of Hamiltonian matrix (H_) is 2*N by 2*N

    #Hamiltonian matrix H
    h0 = [-1 0; 0 1]
    hp = [-1 0; 0 1]
    hm = [-1 0; 0 1]
    H0 = kron(Diagonal(ones(N1)), h0) + kron(diagm(1=>ones(N1-1)), hp) + kron(diagm(-1=>ones(N1-1)), hm)
    H = [H0  zeros(ComplexF64, size(H0))
         zeros(ComplexF64, size(H0))  H0]
    #@show size(H)
    for mn = 1:N
        H[2*mn-1,2*mn] = H[2*mn,2*mn-1] = 1
    end
    H[(2*N1-1):2*N1, (2*N1+1):2*N1+2] .= [1 0; 0 -1]
    H[(2*N1+1):2*N1+2, (2*N1-1):2*N1] .= [1 0; 0 -1]
    
    #time
    tmin = -80; tmax = 280;
    Nt0 = 1500; tar0 = range(tmin,tmax,Nt0); deltat0 = tar0[2] - tar0[1]

    #Green's function matrix
    Glextt = zeros(ComplexF64, 2*N, 2*N, Nt0)
    #Temporary green's function matrix, only used for the calculation for Glextt
    Glesstt0 = zeros(ComplexF64,2*N,2*N)

    ##-----------------CALCULATION WHICH NEEDS TO BE SPED UP----------------------------
    Uu1armax = zeros(ComplexF64, 2N, 2N, Nt0)
    Siglex0max = zeros(ComplexF64, Nt0, Nt0)
    Threads.@threads for mn = 1:Nt0
        Siglex0max[mn, :] .= -1 ./ (tar0[mn] .- tar0 .+ im*deltat0)
    end
    method = ExpMethodGeneric() 
    cache = alloc_mem(H, method) 
    for gh = Nt0:-1:2
        #println(gh)
        Uu1ar = (@view Uu1armax[:, :, 1:gh]); Uu1ar[:, :, gh] .= I(2N)
        f1 = exp(-0.5 * deltat0)
        t0a = time()
        for hi = gh-1:-1:1
            f2 = exponential!(-im * deltat0 * H)
            Uu1ar[:, :, hi] .= f1 * (@view Uu1ar[:, :, hi+1]) * f2
        end
        t1a = time()

        Gr = permutedims(-im*Uu1ar, (3, 1, 2)); Gap = conj(permutedims(-im*Uu1ar, (3, 2, 1))); #t-first layout; faster than the t-last Gr = -im*Uu1ar version
        
        Siglex0 = @view Siglex0max[1:gh, 1:gh] 
        
        t0b = time()
        Threads.@threads for pq = 1:N    
            Glesstt0[2*pq-1, 2*pq] = deltat0^2 * tr( transpose(@view Gr[:, 2*pq-1, :]) * Siglex0 * (@view Gap[:, :, 2*pq]) ) #faster than tr((@view Gr[2*pq-1, :, :]) * Siglex0 * @view Gap[:, :, 2*pq]), likely due to the unit-stride t-first views
        end
        t1b = time()
        
        if gh>Nt0-20
            println(gh)
            println(t1a-t0a)
            println(t1b-t0b)
        end 
        Glextt[:, :, gh] .= Glesstt0 #G<(t,t)
    end
    ##----------------------------------------------------------------------------------
    return Glextt
end

Glextt = computethreads();

and the python code

import numpy as np
from scipy.linalg import expm
import time

N1 = 20; N = 2*N1 #size of Hamiltonian matrix (H_) is 2*N by 2*N

#Hamiltonian matrix H
h0 = np.array([[-1,0],
               [0,1]])
hp = np.array([[-1,0],
               [0,1]])
hm = np.array([[-1,0],
               [0,1]])
H0 = np.kron(np.identity(N1),h0) + np.kron(np.eye(N1,k=1),hp) + np.kron(np.eye(N1,k=-1),hm);
H = np.block([[H0, np.zeros(np.shape(H0),dtype = 'complex_')],
              [np.zeros(np.shape(H0),dtype = 'complex_'), H0]])

for mn in range(N):
    H[2*mn,2*mn+1] = 1; H[2*mn+1,2*mn] = 1

H[(2*N1-1)-1:2*N1:1, (2*N1+1)-1:2*N1+2:1] = [[1,0], [0,-1]]
H[(2*N1+1)-1:2*N1+2:1, (2*N1-1)-1:2*N1:1] = [[1,0], [0,-1]]      

#time
tmin = -80; tmax = 220 
Nt0 = 1700; tar0 = np.linspace(tmin,tmax,Nt0); deltat0 = tar0[1] - tar0[0]

#Green's function matrix
Glextt = np.zeros([Nt0,2*N,2*N],dtype = 'complex_')
#Temporary green's function matrix, only used for the calculation for Glextt
Glesstt0 = np.zeros([2*N,2*N],dtype = 'complex_')

##-----------------CALCULATION WHICH NEEDS TO BE SPED UP----------------------------
Uu1armax = np.zeros([Nt0,2*N,2*N],dtype = 'complex_')
Siglex0max = np.zeros([Nt0,Nt0],dtype = 'complex_')
for mn in range(Nt0):
    Siglex0max[mn, :] = -1 / (tar0[mn] - tar0 + 1j*deltat0);

for gh in range(Nt0-1,0,-1):
    Uu1ar = Uu1armax[0:gh,:, :]; Uu1ar[gh-1,:,:] = np.eye(2*N,dtype = 'complex_');
    f1 = np.exp(-0.5 * deltat0);
    t0a = time.time() 
    for hi in range(gh-2,-1,-1):
        f2 = expm(-1j*H*deltat0); 
        Uu1ar[hi, :, :] = f1 * (Uu1ar[hi+1, :, :] @ f2);  # matrix product (@), matching the Julia code
    t1a = time.time() 
    
    Gr = -1j*Uu1ar;
    Gap = np.conjugate(np.transpose(-1j*Uu1ar, (0, 2, 1)));

    Siglex0 = Siglex0max[0:gh, 0:gh];
    
    t0b = time.time() 
    for pq in range(N):    
        Glesstt0[2*pq,2*pq+1] = deltat0**2*np.trace( np.linalg.multi_dot([ np.transpose(np.squeeze(Gr[:,2*pq,:])), Siglex0, np.squeeze(Gap[:,:,2*pq+1]) ]) );
    t1b = time.time() 
    
    print(gh)
    print(t1a-t0a)
    print(t1b-t0b)

    Glextt[gh, :, :] = Glesstt0 #G<(t,t)

I see you took out the method and cache arguments to exponential!, so you're using the default ExpMethodHigham2005() without caching. I confirmed that doing this is much faster than using the ExpMethodGeneric() method for this larger H matrix. I also checked using LinearAlgebra.exp! as suggested by @stevengj and found it to be much slower for this size matrix, as I'm sure you did, as well. I did find exponential!(-im * deltat0 * H, method, cache) with method = ExpMethodHigham2005() to be slightly faster than exponential!(-im * deltat0 * H) for this size matrix. Haven't had a chance yet to try @stevengj's other suggestion.


I have isolated the issue. There are two factors:
(a) the time to evaluate a single expm ("single expm time"), and
(b) the net time to evaluate the expm and multiply it with Uu1ar in the for loop ("loop expm time").

TIMES
Julia turns out to be slower.

1. With the given inputs, I am getting the following output printed in the terminal, which gives the time for each section as defined above. I am considering only the initial prints.

Python (python3 Ptest.py):

loop expm time =  0.09838080406188965
single expm time =  0.0005323886871337891

Julia (include("Jtest.jl")):

loop expm time = 0.158430814743042
single expm time = 0.0006480216979980469
2. I have also timed them explicitly.

Python (time python3 Ptest.py):

real    0m13,124s
user    1m43,075s
sys     0m0,733s

Julia (time julia Jtest.jl):

real 0m25,008s 
user 1m29,710s 
sys 1m15,736s 

CODES:
Julia:

using LinearAlgebra
using ExponentialUtilities
using ExponentialUtilities: alloc_mem

function computethreads()
    N1 = 25; N = 2*N1 #size of Hamiltonian matrix (H_) is 2*N by 2*N

    #Hamiltonian matrix H
    h0 = [-1 0; 0 1]
    hp = [-1 0; 0 1]
    hm = [-1 0; 0 1]
    H0 = kron(Diagonal(ones(N1)), h0) + kron(diagm(1=>ones(N1-1)), hp) + kron(diagm(-1=>ones(N1-1)), hm)
    H = [H0  zeros(ComplexF64, size(H0))
         zeros(ComplexF64, size(H0))  H0]
    #@show size(H)
    for mn = 1:N
        H[2*mn-1,2*mn] = H[2*mn,2*mn-1] = 1
    end
    H[(2*N1-1):2*N1, (2*N1+1):2*N1+2] .= [1 0; 0 -1]
    H[(2*N1+1):2*N1+2, (2*N1-1):2*N1] .= [1 0; 0 -1]
    
    method = ExpMethodHigham2005() # fastest
    #method = ExpMethodNative() 
    #method = ExpMethodDiagonalization() 
    #method = ExpMethodGeneric() 
    #method = ExpMethodHigham2005Base() 
    cache = alloc_mem(H, method) 
    
    #time
    tmin = -20; tmax = 20;
    Nt0 = 200; tar0 = range(tmin,tmax,Nt0); deltat0 = tar0[2] - tar0[1]

    Uu1armax = zeros(ComplexF64, 2N, 2N, Nt0)
    for gh = Nt0:-1:2
        
        t0a = time()
        E1 = exponential!((-im * 0.001) * H);
        t1a = time()
        println("single expm time = ",t1a-t0a)

        Uu1ar = (@view Uu1armax[:, :, 1:gh]); Uu1ar[:, :, gh] .= I(2N)
        f1 = exp(-0.5 * deltat0)
        t0b = time()
        for hi = gh-1:-1:1
            #f2 = LinearAlgebra.exp!(-im * deltat0 * H);
            #f2 = exponential!(-im * deltat0 * H, method, cache);
            f2 = exponential!(-im * deltat0 * H);
            Uu1ar[:, :, hi] .= f1 * Uu1ar[:, :, hi+1] * f2;
        end
        t1b = time()

        println("loop expm time = ",t1b-t0b )
    end
    ##----------------------------------------------------------------------------------
end

computethreads();

Python:

import numpy as np
from scipy.linalg import expm
import time

N1 = 25; N = 2*N1 #size of Hamiltonian matrix (H_) is 2*N by 2*N

#Hamiltonian matrix H
h0 = np.array([[-1,0],
               [0,1]])
hp = np.array([[-1,0],
               [0,1]])
hm = np.array([[-1,0],
               [0,1]])
H0 = np.kron(np.identity(N1),h0) + np.kron(np.eye(N1,k=1),hp) + np.kron(np.eye(N1,k=-1),hm);
H = np.block([[H0, np.zeros(np.shape(H0),dtype = 'complex_')],
              [np.zeros(np.shape(H0),dtype = 'complex_'), H0]])

for mn in range(N):
    H[2*mn,2*mn+1] = 1; H[2*mn+1,2*mn] = 1

H[(2*N1-1)-1:2*N1:1, (2*N1+1)-1:2*N1+2:1] = [[1,0], [0,-1]]
H[(2*N1+1)-1:2*N1+2:1, (2*N1-1)-1:2*N1:1] = [[1,0], [0,-1]]      

#time
tmin = -20; tmax = 20 
Nt0 = 200; tar0 = np.linspace(tmin,tmax,Nt0); deltat0 = tar0[1] - tar0[0]

##-----------------single expm----------------------------

##-----------------expm in loop----------------------------
Uu1armax = np.zeros([Nt0,2*N,2*N],dtype = 'complex_')

for gh in range(Nt0-1,0,-1):
    
    t0a = time.time() 
    f2 = expm((-1j*0.001)*H); 
    t1a = time.time() 
    print("single expm time = ",t1a-t0a)

    Uu1ar = Uu1armax[0:gh,:, :]; Uu1ar[gh-1,:,:] = np.eye(2*N,dtype = 'complex_');
    f1 = np.exp(-0.5 * deltat0);
    t0b = time.time() 
    for hi in range(gh-2,-1,-1):
        f2 = expm(-1j*H*deltat0); 
        Uu1ar[hi, :, :] = f1 * (Uu1ar[hi+1, :, :] @ f2);  # matrix product (@), matching the Julia code
    t1b = time.time() 
    
    print("loop expm time = ",t1b-t0b)

How are you running your code exactly?

Here I changed Nt0 to 200 (otherwise it takes too long; I hope this is still meaningful, since I cannot test these codes in a reasonable time otherwise). Running the Julia code I got:

% time julia --project test.jl
(... lots of prints)
real	0m44,678s
user	2m1,434s
sys	0m33,069s

and with python:

% time python3 test.py
(... lots of prints)
real	3m59,851s
user	16m7,917s
sys	12m45,675s

(by the way, the Nt0 values are different in the two scripts, and so is `tmax`).

Thanks, I have updated my previous comment. I was not explicitly timing the codes; instead, I was simply reading the times printed to the terminal by the codes, which you call (… lots of prints).
Now I have also provided my times using the result of the time command from the terminal. I have chosen N1=25 for both codes, and I now make sure that both codes use the same Nt0.

Uhm… here the Julia code is consistently much faster. What system are you on? For the exact parameters you used now, I get 33 s for Julia and 2 min 11 s for Python.

How many cores does your computer have? Julia by default sets the number of BLAS threads to half the number of logical threads (it does not use Julia-level multi-threading by default). Here that option is the best one, but maybe on your machine it is not? (Python uses all threads.)

You can use

BLAS.set_num_threads(8)

to increase the number of threads used by Julia in BLAS operations.
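One can also check the current setting first:

using LinearAlgebra
BLAS.get_num_threads()   # inspect the current BLAS thread count
BLAS.set_num_threads(8)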

I see. When I was quoting the times printed to the terminal by the codes (the single expm and loop expm times), I initialized the REPL with julia --threads 16 and then ran include("Jtest.jl"). Isn't that sufficient? In my earlier matrix multiplications, not shown here, this gave me faster times, faster than Python.
I have 8 cores.

Architecture:            x86_64
  CPU op-mode(s):        32-bit, 64-bit
  Address sizes:         39 bits physical, 48 bits virtual
  Byte Order:            Little Endian
CPU(s):                  16
  On-line CPU(s) list:   0-15
Vendor ID:               GenuineIntel
  Model name:            Intel(R) Core(TM) i9-9880H CPU @ 2.30GHz
    CPU family:          6
    Model:               158
    Thread(s) per core:  2
    Core(s) per socket:  8
    Socket(s):           1
    Stepping:            13
    CPU max MHz:         4800,0000
    CPU min MHz:         800,0000

EDIT: Sorry, I was looking at a previous version of the code when I made these comments. As pointed out by @lmiq below, this version is not using threading.

When you invoke Julia to run your script from the command line you should use -t 8 or -t 16 to enable multithreading in that case as well.

And in my experience, performance is usually better if you don't exceed the actual number of cores with the -t option. Of course, YMMV, and it is best to test in specific cases.

No, you are not using Julia threading there. You don't need to use any command line option. BLAS is run multi-threaded by default, and it will use the number of physical cores of your computer.


I ran them again, without any prints or any of the time commands inside the code, which might skew things.
TIMES:
1.) Julia (time julia Jtest.jl):

real	0m23,737s
user	1m28,217s
sys	1m9,052s

2.) Python (time python3 Ptest.py):

real	0m11,892s
user	1m33,676s
sys	0m0,569s

CODES:
Julia:

using LinearAlgebra
using ExponentialUtilities
using ExponentialUtilities: alloc_mem


function computethreads()
    N1 = 25; N = 2*N1 #size of Hamiltonian matrix (H_) is 2*N by 2*N

    #Hamiltonian matrix H
    h0 = [-1 0; 0 1]
    hp = [-1 0; 0 1]
    hm = [-1 0; 0 1]
    H0 = kron(Diagonal(ones(N1)), h0) + kron(diagm(1=>ones(N1-1)), hp) + kron(diagm(-1=>ones(N1-1)), hm)
    H = [H0  zeros(ComplexF64, size(H0))
         zeros(ComplexF64, size(H0))  H0]
    #@show size(H)
    for mn = 1:N
        H[2*mn-1,2*mn] = H[2*mn,2*mn-1] = 1
    end
    H[(2*N1-1):2*N1, (2*N1+1):2*N1+2] .= [1 0; 0 -1]
    H[(2*N1+1):2*N1+2, (2*N1-1):2*N1] .= [1 0; 0 -1]
    
    #method = ExpMethodHigham2005() # fastest
    #cache = alloc_mem(H, method) 
    
    #time
    tmin = -20; tmax = 20;
    Nt0 = 200; tar0 = range(tmin,tmax,Nt0); deltat0 = tar0[2] - tar0[1]

    Uu1armax = zeros(ComplexF64, 2N, 2N, Nt0)
    for gh = Nt0:-1:2
        
        E1 = exponential!((-im * 0.001) * H);

        Uu1ar = (@view Uu1armax[:, :, 1:gh]); Uu1ar[:, :, gh] .= I(2N)
        f1 = exp(-0.5 * deltat0)
        for hi = gh-1:-1:1
            f2 = exponential!(-im * deltat0 * H);
            Uu1ar[:, :, hi] .= f1 * Uu1ar[:, :, hi+1] * f2;
        end
    end
end

computethreads();

Python:
import numpy as np
from scipy.linalg import expm
import time

N1 = 25; N = 2*N1 #size of Hamiltonian matrix (H_) is 2*N by 2*N

#Hamiltonian matrix H
h0 = np.array([[-1,0],
               [0,1]])
hp = np.array([[-1,0],
               [0,1]])
hm = np.array([[-1,0],
               [0,1]])
H0 = np.kron(np.identity(N1),h0) + np.kron(np.eye(N1,k=1),hp) + np.kron(np.eye(N1,k=-1),hm);
H = np.block([[H0, np.zeros(np.shape(H0),dtype = 'complex_')],
              [np.zeros(np.shape(H0),dtype = 'complex_'), H0]])

for mn in range(N):
    H[2*mn,2*mn+1] = 1; H[2*mn+1,2*mn] = 1

H[(2*N1-1)-1:2*N1:1, (2*N1+1)-1:2*N1+2:1] = [[1,0], [0,-1]]
H[(2*N1+1)-1:2*N1+2:1, (2*N1-1)-1:2*N1:1] = [[1,0], [0,-1]]      

#time
tmin = -20; tmax = 20 
Nt0 = 200; tar0 = np.linspace(tmin,tmax,Nt0); deltat0 = tar0[1] - tar0[0]

Uu1armax = np.zeros([Nt0,2*N,2*N],dtype = 'complex_')

for gh in range(Nt0-1,0,-1):
    
    f2 = expm((-1j*0.001)*H); 

    Uu1ar = Uu1armax[0:gh,:, :]; Uu1ar[gh-1,:,:] = np.eye(2*N,dtype = 'complex_');
    f1 = np.exp(-0.5 * deltat0);
    for hi in range(gh-2,-1,-1):
        f2 = expm(-1j*H*deltat0); 
        Uu1ar[hi, :, :] = f1 * (Uu1ar[hi+1, :, :] @ f2);  # matrix product (@), matching the Julia code