How to improve performance in a function that repeatedly defines and multiplies matrices

Uranium238 · January 5, 2024, 6:13am

I tried to incorporate the advice you have on the other thread using a naive port of your model code into my program as follows

println("Number of threads = $(Threads.nthreads())")
begin
    using LoopVectorization
    using MKL
    using DelimitedFiles
    using LinearAlgebra
    using LaTeXStrings
    using TimerOutputs
    BLAS.set_num_threads(1)
    using Plots
    using Plots.PlotMeasures
    using FFTW
    using Base.Threads
    using SharedArrays
    using ChunkSplitters
end
function GFC(t::Float64,b::Float64,Δ::Float64,sysize::Int64,Ωf::Diagonal{Float64},Ωi::Diagonal{Float64},J::Matrix{Float64},D::Matrix{Float64},ρ::Diagonal{Float64},Si::Diagonal{ComplexF64},Sf::Diagonal{ComplexF64},Bi::Diagonal{ComplexF64},Bf::Diagonal{ComplexF64})
    t = t* 4.1341373335*10^16                     # converting time to atomic units
    tp = - im*b - t
    for i = 1:sysize
        Sf[i,i]=Ωf[i,i]/sin(Ωf[i,i]*t)
        Si[i,i]=Ωi[i,i]/sin(Ωi[i,i]*tp)
        Bf[i,i]=Ωf[i,i]/tan(Ωf[i,i]*t)
        Bi[i,i]=Ωi[i,i]/tan(Ωi[i,i]*tp)
    end
    BB = (Bf+J'*Bi*J)
    AA = (Sf+J'*Si*J)

    W = [BB  -AA
        -AA  BB] #We wont calculate the 2Nx2N determinant directly.
    
    # Wapp = BB*(BB-  AA*(BB^-1)*AA  )
    Wapp = BB*(BB-  AA*(BB\AA)  )
    U = Bi-Si
    V = [J'*U*D
        J'*U*D]
    # F1 = sqrt(det(Si*Sf*ρ^2*Wapp^-1))
    # F1 = sqrt(det(Si*Sf*ρ^2)/det(Wapp))
    F1 = sqrt(det(Si*Sf*  (Wapp\ρ^2)))
    F2 = (-(im/2)*(transpose(V)* (W\V)))    +  ((im)*(transpose(D)*U*D))
    g = F1*exp(F2)
    Δ = Δ * 0.0367493  # in atomic units
    g = g * exp(im * Δ * t)
    η = 2 * 4.55633 * 10^-6  # 10 cm^-1 to hatree energy
    #g = g * exp(-η * abs(t))     # Adding the damping factor Lorrentzian
    g = g * exp(-η * t^2)     # Adding the damping factor Gaussian
    # println("t = $t was evaluated by process $(myid())")
    # println("GFC at t = $t   is     $g")
    # println("F1 at t = $t         = $F1")
    return g[1]
end
function GHT(t::Float64,b::Float64,sysize::Int64,Ωf::Diagonal{Float64},Ωi::Diagonal{Float64},J::Matrix{Float64},D::Matrix{Float64},Si::Diagonal{ComplexF64},Sf::Diagonal{ComplexF64},Bi::Diagonal{ComplexF64},Bf::Diagonal{ComplexF64},X11::Matrix{ComplexF64},X12::Matrix{ComplexF64},X21::Matrix{ComplexF64},X22::Matrix{ComplexF64},Y1::Matrix{ComplexF64},Y2::Matrix{ComplexF64},gfcpart::ComplexF64,T::Matrix{Float64})




    ################################## Preliminary Part , Common for both Frank Condon and Herzberg Teller ##############################
    t = t* 4.1341373335*10^16                     # converting time to atomic units
    tp = - im*b - t
    for i = 1:sysize
        Sf[i,i]=Ωf[i,i]/sin(Ωf[i,i]*t)
        Si[i,i]=Ωi[i,i]/sin(Ωi[i,i]*tp)
        Bf[i,i]=Ωf[i,i]/tan(Ωf[i,i]*t)
        Bi[i,i]=Ωi[i,i]/tan(Ωi[i,i]*tp)
    end
    W = [(Bf+J'*Bi*J)  -(Sf+J'*Si*J)
        -(Sf+J'*Si*J)  (Bf+J'*Bi*J)]
    U = Bi-Si
    V = [J'*U*D
        J'*U*D]

    ########## Defining the X and Y Matrices ##################################


    ################# Final Calculation and returning value ##################
    s=0
    MJSJ = J'*Si*J
    MJBJ = J'*Bi*J
    MJUD = J'*U*D
    for m in range(1,sysize)
        for mp in range(1,sysize)
            # @. X11 = 0.0
            # @. X12 = 0.0
            # @. X21 = 0.0
            # @. X22 = 0.0
            X11[m,:]=(MJSJ)[mp,:]*(-Bf[m,m])
            X12[m,:]=(MJBJ)[mp,:]*(Bf[m,m])
            X21[m,:]=(MJSJ)[mp,:]*(Sf[m,m])
            X22[m,:]=(MJBJ)[mp,:]*(-Sf[m,m])
            X = [X11 X12
                X21 X22]
            # @. Y1 = 0.0
            # @. Y2 = 0.0
            Y1[m,:] = Bf[m,m]*(MJUD)[mp,:] 
            Y2[m,:] = -Sf[m,m]*(MJUD)[mp,:]
            Y = [Y1
                Y2]
            g = im*tr(W\X)+    (transpose(W\V)*X*(W\V))[1]      -    (transpose(Y)*(W\V))[1]
            s = s+g[1]*gfcpart*T[m,mp]
        end
        for i in 1:sysize
            X11[m,i]=0
            X12[m,i]=0
            X21[m,i]=0
            X22[m,i]=0
            Y1[m,1]=0
            Y2[m,1]=0
        end
    end
    return s[1]
end

begin ## Define all constant variables
    const Delta::Float64 = 2.02 # in eV
    const dd::Int64 = 11                               # Change this to modify number of points
    const Temp::Int64=300
    kT = Temp * 1.380649 * 10^-23  # Boltzmann constant times temperature in Joules
    kT= kT * 2.29371227840*10^17         # in atomic units
    const b ::Float64= 1/kT
    const omegaf ::Diagonal{Float64}= Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n')))*4.55633*10^-6   # Final state  frequencies in atomic units
    const omegai ::Diagonal{Float64}= Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n')))*4.55633*10^-6  # Initial state  frequencies in atomic units
    const sysize ::Int64 = size(omegai)[1]
    const P = @. 2*sinh(b*omegai/2)
    T ::Matrix{Float64}= readdlm("nac.txt", '\t', Float64, '\n'); # NAC matrix
    T = T*T';
    const D ::Matrix{Float64}= readdlm("d.txt", Float64); # Displacement Matrix
    const J ::Matrix{Float64}= readdlm("j.txt", Float64); # Duchinsky Matrix
    t ::Vector{Float64}=  collect(range(-5*10^-12,stop=5*10^-12,length=dd))
    t[div(dd+1,2)] = 10e-25
end

function calc(t,b,Δ,sysize,Ωf,Ωi,J,D,P)
    nchunks = Threads.nthreads() # define number of chunks e.g. equal to number of threads
    # @sync tells Julia to wait for any tasks started inside it
    open("CORE-PARALLEL-OUTPUT-RV6.TXT", "a") do f
        write(f, "STARTING LOOP WITH NUMBER OF THREADS = $(Threads.nthreads()) \n====================================================\n")
    end
    @sync for (xrange, ichunk) in chunks(range(1,dd), nchunks)
        # xrange are the indices this chunk contains, ichunk is just the index of the chunk, which we don't need in this scenario
        Threads.@spawn begin # this line tells Julia to start a new thread with the contents of the begin block
            Sf ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
            Si ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
            Bf ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
            Bi ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
            X11 :: Matrix{ComplexF64}  = zeros(ComplexF64,sysize,sysize)
            X12 :: Matrix{ComplexF64}  = zeros(ComplexF64,sysize,sysize)
            X21 :: Matrix{ComplexF64}  = zeros(ComplexF64,sysize,sysize)
            X22 :: Matrix{ComplexF64}  = zeros(ComplexF64,sysize,sysize)
            Y1  :: Matrix{ComplexF64} =  zeros(ComplexF64,sysize,1)
            Y2  :: Matrix{ComplexF64} =  zeros(ComplexF64,sysize,1)
            x = zeros(ComplexF64,dd)
            for i in xrange
                @time for i in 1:dd
                    gfcpart = GFC(t[i],b,Δ,sysize,Ωf,Ωi,J,D,P,Si,Sf,Bi,Bf)
                    gtotal = GHT(t[i],b,sysize,Ωf,Ωi,J,D,Si,Sf,Bi,Bf,X11,X12,X21,X22,Y1,Y2,gfcpart,T)
                    x[i] = gtotal
                    open("CORE-PARALLEL-OUTPUT-RV6.TXT", "a") do f
                        write(f, "ITERATION $i COMPLETED BY PROCESS $(Threads.threadid()) \n")
                    end
                end  
            end # loop
        end # thread work load
    end # loop over chunks, Julia will wait until all task are completed
    open("CORE-PARALLEL-OUTPUT-RV6.TXT", "a") do f
        write(f, "LOOP COMPLETE \n====================================================")
    end
    yr = real.(x)
    yi = imag.(x)
    p = plot(t,yr,linewidth = 2,color = "blue",title = "Δ = $Δ",label = "Real part",grid=false)
    p =plot!(t,yi,linewidth = 2,color = "red",label = "Imaginary part",size=(1000,800),xlims=(-30*10^-15,30*10^-15))
    savefig(p,"gt.svg")
    begin # This part does FFTW and finds final rate constant

        fs = abs(t[1]-t[2])^-1
        signal = yr
        p = plan_fft(signal)
        F = fftshift(p*signal)
        freqs = fftshift(fftfreq(length(t), fs))  # X axis has frequencies in s^-1
        #Covert freqs to eV
        freqs = freqs * 4.13558 * 10^-15
        F = F * 6.57 * 10^15
        time_domain = plot(t, signal, title = L"G(t)",legend=false,color="orange",xlabel=L"\mathrm{Time}\ (s)",ylabel=L"\mathrm{A.u}")
        freq_domain = plot(freqs, abs.(F), title = L"\mathrm{Fourier\ transformed\ g(\bar{\nu}))}",legend=false,xlabel=L"\mathrm{Energy\ (eV)}",ylabel=L"\mathrm{k_{IC}\ (s^{-1})}",xlims=(0,4))
        p = plot(time_domain, freq_domain, layout = (2,1), size = (1000, 800),right_margin = 10mm)
        savefig(p,"gt-fft.svg")
        ee = Δ# * 8065.73  #Converting eV to cm^-1
        index = findmin(abs.(freqs .- ee))[2]  # Find the index of freq closest to ee
        freqs[index]
        println("The rate constant is ",abs.(F)[index])
        rc = abs.(F)[index]
    end
    begin # This part writes the output to a file
        open("CORE-PARALLEL-OUTPUT-RV6.TXT", "a") do f
            write(f, "-------------------------\n")
            write(f,"NUMBER OF CORES = $(Threads.nthreads())\n")
            write(f,"The value of Δ is $Δ\n")
            write(f,"Number of time points = $dd\n")
            write(f, "The rate constant is $rc\n")
            write(f,"-------------------------\n")
            write(f,"The values of t,Re(g(t)),Im(g(t)) are \n\n")
            for i in range(1,dd)
                write(f,"$(t[i])  $(yr[i])  $(yi[i])\n")
            end
        end
    end

end


timeneed = @elapsed calc(t,b,Delta,sysize,omegaf,omegai,J,D,P)
open("CORE-PARALLEL-OUTPUT-RV6.TXT", "a") do f
    write(f, "\n\n\n===============================================================\nThe time elapsed for main loop [INCLUDING GC + C + RC]=  $timeneed seconds \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
end

I tried running it with julia -t 1 but I noticed that it seems to be going on to an infinite loop, the output (directly generated from the above code is as follow)

STARTING LOOP WITH NUMBER OF THREADS = 1 
====================================================
ITERATION 1 COMPLETED BY PROCESS 1 
ITERATION 2 COMPLETED BY PROCESS 1 
ITERATION 3 COMPLETED BY PROCESS 1 
ITERATION 4 COMPLETED BY PROCESS 1 
ITERATION 5 COMPLETED BY PROCESS 1 
ITERATION 6 COMPLETED BY PROCESS 1 
ITERATION 7 COMPLETED BY PROCESS 1 
ITERATION 8 COMPLETED BY PROCESS 1 
ITERATION 9 COMPLETED BY PROCESS 1 
ITERATION 10 COMPLETED BY PROCESS 1 
ITERATION 11 COMPLETED BY PROCESS 1 
ITERATION 1 COMPLETED BY PROCESS 1 
ITERATION 2 COMPLETED BY PROCESS 1 
ITERATION 3 COMPLETED BY PROCESS 1 
ITERATION 4 COMPLETED BY PROCESS 1 
ITERATION 5 COMPLETED BY PROCESS 1

After completion of 11 iterations it again starts from the first when I set the number of threads = 1. I also do not understand the purpose (or lack theirof) of the variable ichunk

Topic		Replies	Views
How to improve the scaling of Julia code aimed at multi-node parallelization? Julia at Scale linearalgebra , distributed	38	597	August 14, 2024
Julia code becomes slower on running on supercomputers and does not scale well when parallelizing with Base.Threads Julia at Scale fortran , parallel , linearalgebra , threads	73	2042	January 22, 2024
How to convert a thread-parallelized code into a core-parallelized code? Julia at Scale multithreading , linearalgebra , distributed , threads , matrix	3	310	May 19, 2024
Probable data race condition causing problems when trying to parallelize a loop used to populate an array Performance distributed	14	193	August 4, 2024
Speeding up the multiplying, adding, subtracting of 3D matrices Numerics question	16	730	June 24, 2023

How to improve performance in a function that repeatedly defines and multiplies matrices

Related topics