How to improve the scaling of Julia code aimed at multi-node parallelization?

I have been trying to write a version of my Julia code that can be run in a parallelized manner using multiple nodes at once. After posting this thread, I finally have a working version of the code (given below).

Code that uses Distributed.jl
using Distributed
@everywhere begin
using MKL
using DelimitedFiles
using LinearAlgebra
using LaTeXStrings
using TimerOutputs
using Plots
using Plots.PlotMeasures
using FFTW
using SharedArrays
using ChunkSplitters
BLAS.set_num_threads(1)
end
const to = TimerOutput()
println("Number of processes = $(Distributed.nworkers())")
open("ISC-DISTRIBUTED.TXT", "a") do f
    write(f, "\n\n\n====================================================\n\n\n")
end

@everywhere function GDSO(
    t::Float64,
    b::Float64,
    Ξ”::Float64,
    sysize::Int64,
    Ξ©f::Diagonal{Float64},
    Ξ©i::Diagonal{Float64},
    J::Matrix{Float64},
    D::Matrix{Float64},
    ρ::Diagonal{Float64},
    Si::Diagonal{ComplexF64},
    Sf::Diagonal{ComplexF64},
    Bi::Diagonal{ComplexF64},
    Bf::Diagonal{ComplexF64}
)
    t = t * 4.1341373335e16                     # converting time to atomic units
    tp = -im * b - t
    for i = 1:sysize
        Sf[i, i] = Ξ©f[i, i] / sin(Ξ©f[i, i] * t)
        Si[i, i] = Ξ©i[i, i] / sin(Ξ©i[i, i] * tp)
        Bf[i, i] = Ξ©f[i, i] / tan(Ξ©f[i, i] * t)
        Bi[i, i] = Ξ©i[i, i] / tan(Ξ©i[i, i] * tp)
    end
    BB = (Bf + J' * Bi * J)
    AA = (Sf + J' * Si * J)

    W = [BB -AA
        -AA BB] #We wont calculate the 2Nx2N determinant directly.
        Wapp = BB * (BB - AA * (BB \ AA))
        # U = Bi - Si
        V = [J' * (Bi-Si) * D
            J' * (Bi-Si) * D]
        F1 = sqrt(det(Si * Sf * (Wapp \ ρ^2)))
        F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * (Bi-Si) * D))
        g = F1 * exp(F2)
        Ξ” = Ξ” * 0.0367493  # in atomic units
        g = g * exp(im * Ξ” * t)
        Ξ· = 2 * 4.55633e-6  # 10 cm^-1 to hatree energy
        g = g * exp(-Ξ· * t^2) 
    return g[1]
end

@everywhere @views function GSV(
    t::Float64,
    b::Float64,
    sysize::Int64,
    Ξ©f::Diagonal{Float64},
    Ξ©i::Diagonal{Float64},
    J::Matrix{Float64},
    D::Matrix{Float64},
    Si::Diagonal{ComplexF64},
    Sf::Diagonal{ComplexF64},
    Bi::Diagonal{ComplexF64},
    Bf::Diagonal{ComplexF64},
    gdsopart::ComplexF64,
    T::Matrix{Float64},       ## Product of NAC and SO
    Hso::Float64,
    X::Matrix{ComplexF64},
    Y::Matrix{ComplexF64}
)

    ################################## Preliminary Part, common to both Franck-Condon and Herzberg-Teller ##############################
    t = t * 4.1341373335e16                     # converting time to atomic units
    tp = -im * b - t
    for i = 1:sysize
        Sf[i, i] = Ξ©f[i, i] / sin(Ξ©f[i, i] * t)
        Si[i, i] = Ξ©i[i, i] / sin(Ξ©i[i, i] * tp)
        Bf[i, i] = Ξ©f[i, i] / tan(Ξ©f[i, i] * t)
        Bi[i, i] = Ξ©i[i, i] / tan(Ξ©i[i, i] * tp)
    end
    W = [(Bf+J'*Bi*J) -(Sf + J' * Si * J)
        -(Sf + J' * Si * J) (Bf+J'*Bi*J)]
    # U = Bi - Si
    V = [J' * (Bi-Si) * D
        J' * (Bi-Si) * D]

    ########## Defining the X and Y Matrices ##################################


    ################# Final Calculation and returning value ##################
    s = zero(ComplexF64)
    MJSJ = J' * Si * J
    MJBJ = J' * Bi * J
    MJUD = J' * (Bi-Si) * D
    Win = inv(W)
    WinV = Win * V
    XWin = similar(Win)
    for m in 1:sysize
        for mp in 1:sysize

            X[m, 1:sysize] .= (MJSJ)[mp, :] .* (-Bf[m, m])
            X[m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (Bf[m, m])
            X[sysize+m, 1:sysize] .= (MJSJ)[mp, :] .* (Sf[m, m])
            X[sysize+m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (-Sf[m, m])

            Y[m, 1] = Bf[m, m] * (MJUD)[mp, 1]
            Y[m+sysize, 1] = -Sf[m, m] * (MJUD)[mp, 1]

            g = im * (tr(mul!(XWin, X, Win))  + (transpose(WinV)*X*WinV)[1] - (transpose(Y)*WinV)[1])
            s = s + g * gdsopart * T[m, mp]

        end
        X[m, 1:sysize] .= 0
        X[m, sysize+1:2*sysize] .= 0
        X[sysize+m, 1:sysize] .= 0
        X[sysize+m, sysize+1:2*sysize] .= 0
        Y[m, 1] = 0
        Y[m+sysize, 1] = 0
    end
    s = s + abs(Hso)^2 * gdsopart   #### LATEST MODIFICATION
    return s
end
@everywhere begin
const hso_s1t3 = 1.49 * 0.0000046  ## SOC between S1 and T3 = 0.48 cm-1, converting to atomic units
const Delta = 0.11 # in eV
const Delta_s1t3 = 0.091 * 0.037 # Difference between S1 and T3 = 0.091 eV in atomic units
const dd = 41                           # Change this to modify number of points
const Temp = 300
kT_temp = Temp * 1.380649e-23  # Boltzmann constant times temperature in Joules
const kT = kT_temp * 2.29371227840e17         # in atomic units
const b = 1 / kT
const omegaf = Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n'))) * 4.55633e-6   # Final state  frequencies in atomic units
const omegai = Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n'))) * 4.55633e-6  # Initial state  frequencies in atomic units
const sysize = size(omegai)[1]
const P = @. 2 * sinh(b * omegai / 2)
T_temp = readdlm("nac.txt", '\t', Float64, '\n') # NAC matrix
const T = (T_temp * T_temp')* (hso_s1t3^2/Delta_s1t3^2)
const D = readdlm("d.txt", Float64) # Displacement Matrix
const J = readdlm("j.txt", Float64) # Duchinsky Matrix
const t = collect(range(-5e-12, stop=5e-12, length=dd))
t[div(dd + 1, 2)] = 10e-25
end

@everywhere function worker_function(xrange, t, b, Ξ”, sysize, Ξ©f, Ξ©i, J, D, P, T, Hso)
    Sf = Diagonal(zeros(ComplexF64, sysize))
    Si = Diagonal(zeros(ComplexF64, sysize))
    Bf = Diagonal(zeros(ComplexF64, sysize))
    Bi = Diagonal(zeros(ComplexF64, sysize))
    X = zeros(ComplexF64, 2 * sysize, 2 * sysize)
    Y = zeros(ComplexF64, 2 * sysize, 1)
    x_worker = zeros(ComplexF64, length(xrange))
    for (local_xind,main_xind) in enumerate(xrange)
        gdsopart = GDSO(t[main_xind], b, Ξ”, sysize, Ξ©f, Ξ©i, J, D, P, Si, Sf, Bi, Bf)
        gtotal = GSV(t[main_xind], b, sysize, Ξ©f, Ξ©i, J, D, Si, Sf, Bi, Bf, gdsopart, T, Hso, X, Y)
        x_worker[local_xind] = gtotal
        open("ISC-DISTRIBUTED.TXT", "a") do f
            write(f, "ITERATION $main_xind COMPLETED BY PROCESS $(Distributed.myid()) \n")
        end
    end
    return x_worker
end

function calc(t, b, Ξ”, sysize, Ξ©f, Ξ©i, J, D, P; nchunks=Distributed.nworkers())
    open("ISC-DISTRIBUTED.TXT", "w") do f
        write(f, "STARTING LOOP WITH NUMBER OF CORES = $(Distributed.nworkers()) \n====================================================\n")
    end
    x = zeros(ComplexF64, dd)
    Hso = 0.48 * 0.0000046  ## SOC between S1 and T2 = 0.48 cm-1, converting to atomic units
    xrange_chunknum = chunks(range(1, dd), nchunks)
    @timeit to "GT Calculation" x = @distributed vcat for ichunk in 1:nchunks
        xrange,chunknum = xrange_chunknum[ichunk]
        worker_function(xrange, t, b, Ξ”, sysize, Ξ©f, Ξ©i, J, D, P, T, Hso)
    end
    open("ISC-DISTRIBUTED.TXT", "a") do f
        write(f, "LOOP COMPLETE \n====================================================")
    end
    yr = real.(x)
    yi = imag.(x)
    p = plot(t, yr, linewidth=2, color="blue", title="Ξ” = $Ξ”", label="Real part", grid=false)
    p = plot!(t, yi, linewidth=2, color="red", label="Imaginary part", size=(1000, 800), xlims=(-30e-15, 30e-15))
    savefig(p, "gt.svg")
    begin # This part does FFTW and finds final rate constant

        fs = abs(t[1] - t[2])^-1
        signal = yr
        p = plan_fft(signal)
        F = fftshift(p * signal)
        freqs = fftshift(fftfreq(length(t), fs))  # X axis has frequencies in s^-1
        #Covert freqs to eV
        freqs = freqs * 4.13558e-15
        F = F * 6.57e15
        time_domain = plot(t, signal, title=L"G(t)", legend=false, color="orange", xlabel=L"\mathrm{Time}\ (s)", ylabel=L"\mathrm{A.u}")
        freq_domain = plot(freqs, abs.(F), title=L"\mathrm{Fourier\ transformed\ g(\bar{\nu}))}", legend=false, xlabel=L"\mathrm{Energy\ (eV)}", ylabel=L"\mathrm{k_{IC}\ (s^{-1})}", xlims=(0, 4))
        p = plot(time_domain, freq_domain, layout=(2, 1), size=(1000, 800), right_margin=10mm)
        savefig(p, "gt-fft.svg")
        ee = Ξ”# * 8065.73  #Converting eV to cm^-1
        index = findmin(abs.(freqs .- ee))[2]  # Find the index of freq closest to ee
        freqs[index]
        println("The rate constant is ", abs.(F)[index])
        rc = abs.(F)[index]
    end
#################################################This part writes the output to a file#######################################################
    begin 
        open("ISC-DISTRIBUTED.TXT", "a") do f
            write(f, "-------------------------\n")
            write(f, "NUMBER OF CORES = $(Distributed.nworkers())\n")
            write(f, "The value of Ξ” is $Ξ”\n")
            write(f, "Number of time points = $dd\n")
            write(f, "The rate constant calculated from Julia FFTW = $rc\n")
            write(f, "-------------------------\n")
            write(f, "The values of t,Re(g(t)),Im(g(t)) are \n\n")
            for i in 1:dd
                write(f, "$(t[i])  $(yr[i])  $(yi[i])\n")
            end
        end

        open("GT-VALUES-DISTRIBTUED.txt", "w") do f
            for i in 1:dd
                write(f, "$(t[i])  $(yr[i])  $(yi[i])\n")
            end
        end

    end

end


calc(t, b, Delta, sysize, omegaf, omegai, J, D, P; nchunks=Distributed.nworkers())
display(to)

The aim is to develop a version of this code that runs faster than the Fortran equivalent. Since the serial version of our code is already faster than the Fortran equivalent (see this thread), the next step is to make it capable of the multi-node parallelization that the Fortran code achieves using MPI.

Previously, we have seen that the version of the code that uses Base.Threads outperforms the MPI-parallelized Fortran code when run on a single node. But as you might already be about to point out, that is not a fair comparison, since the threaded version cannot achieve multi-node parallelization. For this, the version using Distributed.jl was developed. But the scaling right now seems to be pretty bad.

| Code Version           | Number of Cores            | Time Taken |
|------------------------|----------------------------|------------|
| Julia [Base.Threads]   | 40                         | 39 hrs     |
| Julia [Distributed.jl] | 80 [40 + 40 using 2 nodes] | 31 hrs     |
| Fortran [Using MPI]    | 80 [40 + 40 using 2 nodes] | <20 hrs    |

I am aware that the benchmarks provided are not ideal; it is not fair to expect the Distributed.jl version (using 80 total cores) to complete the task in half the time taken by the Base.Threads version (which used 40 cores). As you can understand, the run times are very long, so not too many tests could be run on the cluster I am using. Is the bad scaling a consequence of some bad code design on my part, or does it reflect an inherent difference in speed between MPI and Distributed.jl? Do I have to use MPI.jl to get speeds comparable to the Fortran code? Any advice on how to improve the scaling is greatly appreciated.


Likely what holds you back the most are all the allocations. Allocations are notorious for killing scaling in parallelization.
Look for example at your GDSO and GSV functions above: a rough estimate says that you allocate ~18 new matrices there, and this is neither the full code nor a careful analysis.

I commented about this before, but you didn't want to change the structure too much, so it might be hard to optimize further.

Are the parameters in the snippet above realistic? If they are, you might need to rethink your parallelization strategy altogether, because there are only 41 jobs (determined by dd), so having 80 workers does not benefit you.
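To make that concrete, here is a tiny illustration using plain Base iterators (not the ChunkSplitters API the code uses), with the snippet's dd = 41 and 80 workers:

dd, nw = 41, 80
parts = collect(Iterators.partition(1:dd, cld(dd, nw)))   # 41 chunks of length 1
println("chunks with work: ", length(parts), ", idle workers: ", nw - length(parts))

So at most 41 of the 80 processes can ever do anything in this configuration.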

The actual value of dd used was 20001; the snippet's dd = 41 reflects a smaller benchmark that I was using to get a first estimate. And if the problem is the structure of the code – which uses smaller matrices to define large matrices, causing more allocations – shouldn't the Fortran code also be affected by that? Quoting the innermost loop of the Fortran code (the link to which I have provided):

       NP1=MYID+1                    !MYID = PROCESSOR ID, NP1=TEMPORARY PROCESSOR ID
       DO K=NP1,N,NP
       DO M=1,N
             DO I=1,N
               DO J=1,N
                IF(I.EQ.K)THEN
                G11(I,J)=-BS_COMPLEX(K,K)*X2(M,J)
                G12(I,J)=BS_COMPLEX(K,K)*X4(M,J)
                G21(I,J)=AS_COMPLEX(K,K)*X2(M,J)
                G22(I,J)=-AS_COMPLEX(K,K)*X4(M,J)
                ELSE
                G11(I,J)=(0.0D0,0.0D0)
                G12(I,J)=(0.0D0,0.0D0)
                G21(I,J)=(0.0D0,0.0D0)
                G22(I,J)=(0.0D0,0.0D0)
                ENDIF
               ENDDO
               IF(I.EQ.K)THEN
               H1(I,1)=BS_COMPLEX(K,K)*V1(M,1)
               H2(I,1)=-AS_COMPLEX(K,K)*V1(M,1)
               ELSE
               H1(I,1)=(0.0D0,0.0D0)
               H2(I,1)=(0.0D0,0.0D0)
               ENDIF
              ENDDO
         G(1:N,1:N)=G11
         G(1:N,N+1:N1)=G12
         G(N+1:N1,1:N)=G21
         G(N+1:N1,N+1:N1)=G22
         H(1:N,1:N)=H1
         H(N+1:N1,1:N)=H2

!TRANSPOSE OF THE H VECTOR

         DO I=1,N1
           H_TRANS(1,I)=H(I,1)
        ENDDO

!MATRIX MULTIPLICATION FOR THE SPIN-VIBRONIC PART

        !THIS IS FOR THE Tr(GK^(-1))+ [(K^(-1)F)^(T)G(K^(-1)F)]-H^(T)K^(-1)F


     CALL ZGEMM('N1','N1',N1,N1,N1,(1.0D0,0.0D0),G,N1,AK_INV,N1, &
              (0.0D0,0.0D0),GK_INV,N1)
      CALL ZGEMM('N1','N1',N1,1,N1,(1.0D0,0.0D0),AK_INV,N1,F,N1, &
              (0.0D0,0.0D0),AKINV_F,N1)


      DO I=1,N1
       AKINV_F_TRANS(1,I)=AKINV_F(I,1)   !TRANSPOSE OF AKINV_F i.e. K^(-1)F
      ENDDO


      CALL ZGEMM('N1','N1',1,N1,N1,(1.0D0,0.0D0),AKINV_F_TRANS,1,G,N1, &
              (0.0D0,0.0D0),AKINV_T_G,1)
      CALL ZGEMM('N1','N1',1,1,N1,(1.0D0,0.0D0),AKINV_T_G,1,AKINV_F, &
              N1,(0.0D0,0.0D0),NUM3,1)
      CALL ZGEMM('N1','N1',1,1,N1,(1.0D0,0.0D0),H_TRANS,1,AKINV_F,N1, &
              (0.0D0,0.0D0),NUM4,1)



      TRACE_GK_INV=(0.0D0,0.0D0)


      DO I=1,N1
        DO J=1,N1
        IF(I.EQ.J)THEN
        TRACE_GK_INV=TRACE_GK_INV+(GK_INV(I,J))
        ELSE
        TRACE_GK_INV=TRACE_GK_INV+(0.0D0,0.0D0)
        ENDIF
        ENDDO
      ENDDO

      TRACE_GK_INV_REAL=REAL(TRACE_GK_INV) 
      TRACE_GK_INV_IMAG=AIMAG(TRACE_GK_INV) 

      TRACE=COMPLEX(-TRACE_GK_INV_IMAG,TRACE_GK_INV_REAL)

      !TOTAL SPIN-VIBRONIC PART 

      SV=SV+((TRACE+NUM3(1,1)-NUM4(1,1))*TKK_COMPLEX(K,M))
    

      ENDDO   !END LOOP FOR K 
      ENDDO   !END LOOP FOR M

This also seems to use things like slicing and smaller matrices to repeatedly define large matrices. Forgive me if my interpretation of the Fortran code is wrong; I am totally unfamiliar with Fortran.

And if the allocations are what is holding me back, can't I just pass scratch spaces with the same names BB and AA into the function and change the BB = into BB .= ? Wouldn't that force the program to reuse the memory instead of allocating fresh? Couldn't the same problem also be circumvented by using the @tensor macro?
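For what it is worth, here is a minimal, self-contained sketch of that scratch-space idea (the sizes are made up and unrelated to sysize): a buffer allocated once and filled with mul! produces essentially no new allocations per call, whereas * allocates a fresh result every time.

using LinearAlgebra

A, B = rand(200, 200), rand(200, 200)
BB = similar(A)                         # scratch space, allocated once
A * B; mul!(BB, A, B)                   # warm up / compile both paths
println("A * B:          ", @allocated(A * B), " bytes")
println("mul!(BB, A, B): ", @allocated(mul!(BB, A, B)), " bytes")   # essentially 0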

Edit 1 – I tried to reduce the allocations by using the in-place multiplication provided by LinearAlgebra. But the code seems to become slower (?), even though the allocations decrease as expected. Here are the results:

anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc$ julia serial.jl 
Number of processes = 1
GKS: Possible loss of precision in routine SET_WINDOW
The rate constant is 42009.18966679349
 ───────────────────────────────────────────────────────────────────────────
                                   Time                    Allocations      
                          ───────────────────────   ────────────────────────
     Tot / % measured:         66.2s /  84.9%           1.51GiB /  27.8%    

 Section          ncalls     time    %tot     avg     alloc    %tot      avg
 ───────────────────────────────────────────────────────────────────────────
 GT calculation        1    56.2s  100.0%   56.2s    429MiB  100.0%   429MiB
 ───────────────────────────────────────────────────────────────────────────
anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc$ julia 2serial.jl 
Number of processes = 1
GKS: Possible loss of precision in routine SET_WINDOW
The rate constant is 42009.18966679349
 ───────────────────────────────────────────────────────────────────────────
                                   Time                    Allocations      
                          ───────────────────────   ────────────────────────
     Tot / % measured:         70.5s /  84.6%           1.58GiB /  25.1%    

 Section          ncalls     time    %tot     avg     alloc    %tot      avg
 ───────────────────────────────────────────────────────────────────────────
 GT calculation        1    59.6s  100.0%   59.6s    407MiB  100.0%   407MiB
 ───────────────────────────────────────────────────────────────────────────

The second one uses in-place multiplication and reuses memory in the part that you highlighted. Here is the modified GDSO function

function GDSO(
    t::Float64,
    b::Float64,
    Ξ”::Float64,
    sysize::Int64,
    Ξ©f::Diagonal{Float64},
    Ξ©i::Diagonal{Float64},
    J::Matrix{Float64},
    D::Matrix{Float64},
    ρ::Diagonal{Float64},
    Si::Diagonal{ComplexF64},
    Sf::Diagonal{ComplexF64},
    Bi::Diagonal{ComplexF64},
    Bf::Diagonal{ComplexF64},
    JpBi::Matrix{ComplexF64},
    JpSi::Matrix{ComplexF64},
    BB::Matrix{ComplexF64},
    AA::Matrix{ComplexF64},
    W::Matrix{ComplexF64},
    Wapp::Matrix{ComplexF64},
    Wapp_i1::Matrix{ComplexF64}
)
    t = t * 4.1341373335e16                     # converting time to atomic units
    tp = -im * b - t
    for i = 1:sysize
        Sf[i, i] = Ξ©f[i, i] / sin(Ξ©f[i, i] * t)
        Si[i, i] = Ξ©i[i, i] / sin(Ξ©i[i, i] * tp)
        Bf[i, i] = Ξ©f[i, i] / tan(Ξ©f[i, i] * t)
        Bi[i, i] = Ξ©i[i, i] / tan(Ξ©i[i, i] * tp)
    end
    # BB = (Bf + J' * Bi * J)
    # AA = (Sf + J' * Si * J)
    mul!(JpBi,J',Bi)
    mul!(BB,JpBi,J)
    BB .+= Bf
    mul!(JpSi,J',Si)
    mul!(AA,JpSi,J)
    AA .+= Sf

    # W = [BB -AA
    #     -AA BB] 
    W[1:sysize,1:sysize] .= BB
    W[1:sysize,sysize+1:2*sysize] .= -AA
    W[sysize+1:2*sysize,1:sysize] .= -AA
    W[sysize+1:2*sysize,sysize+1:2*sysize] .= BB

    #We wont calculate the 2Nx2N determinant directly.
    # Wapp = BB * (BB - AA * (BB \ AA))
    BBinvAA = BB \ AA
    mul!(Wapp_i1,-AA,BBinvAA)
    Wapp_i1 .+= BB
    mul!(Wapp,BB,Wapp_i1)


    V = [J' * (Bi-Si) * D
        J' * (Bi-Si) * D]
    F1 = sqrt(det(Si * Sf * (Wapp \ ρ^2)))
    F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * (Bi-Si) * D))
    g = F1 * exp(F2)
    Ξ” = Ξ” * 0.0367493  # in atomic units
    g = g * exp(im * Ξ” * t)
    Ξ· = 2 * 4.55633e-6  # 10 cm^-1 to hatree energy
    g = g * exp(-Ξ· * t^2) 
    return g[1]
end

Please note - these benchmarks have been run with a small dd = 21 on my local machine in a serial fashion.

But as far as I understand, it does not allocate new matrices on every loop iteration. I think the matrices are allocated only once for all runs, which is exactly what one should do. You can see everything the Fortran code uses at the very top of the file:

       REAL*8,DIMENSION(159,159)::RJ,RJT,OMEGAS,OMEGAT,BS,AS,AT1,TKK
       REAL*8,DIMENSION(159,1)::WS,WT,D,NAC
       REAL*8,DIMENSION(1,159)::DT,NAC_T
       COMPLEX*16,DIMENSION(159,1)::H1,H2,D_COMPLEX,V1
       COMPLEX*16,DIMENSION(1,159)::DT_COMPLEX,V3
       COMPLEX*16,DIMENSION(159,159)::A,B,AT,BT,BB,B_INV,AA,B2, &
               AA_INV,E2,E1,AB_INV,G11,G12,G21,G22,RJ_COMPLEX, &
               RJT_COMPLEX,X1,X2,X3,X4,X5,Y1,BS_COMPLEX, &
               OMEGAT_COMPLEX,AS_COMPLEX, &
               E,TKK_COMPLEX,BT2,AT2
       COMPLEX*16,DIMENSION(318,318)::AK,AK_INV,G,GK_INV
       COMPLEX*16,DIMENSION(318,1)::F,H,AKINV_F
       COMPLEX*16,DIMENSION(1,318)::FT,AKINV_F_TRANS,AKINV_T_G, &
               H_TRANS,V2
       REAL*8::TMIN,TMAX,HH,CM_AU,S_AU,T,SOC1,AMU_AU,CONST1,AU_HZ, &
               BETA,KB,KBT,TEMP,EV_AU,PF,AMP,DET_I,DET_R, &
               DELE1,NUM1R,NUM2R,NUM1I,NUM2I,NUMR,NUMI, &
               TRACE_GK_INV_REAL,TRACE_GK_INV_IMAG,K_RISC_DSO_REAL,
               CONST2,SOC2,DELE2,THETA,T1
       COMPLEX*16::DET,TRACE_GK_INV,SV,RISC_DSO,RISC_SV,SV2, &
                   NUM,TRACE,K_RISC_DSO,K_RISC_SV,SV1
       COMPLEX*16,DIMENSION(120)::SV1
       COMPLEX*16,DIMENSION(1,1)::NUM1,NUM2,NUM3,NUM4
       INTEGER::I,J,K,M,K1,N,N1,M1,INFO,M2,NP,IERROR,MYID,NP1
       INTEGER,DIMENSION(318)::IPIV
       COMPLEX*16,DIMENSION(318)::WORK

Also you repeat these calculations inside GDSO and GSV:

    t = t * 4.1341373335e16                     # converting time to atomic units
    tp = -im * b - t
    for i = 1:sysize
        Sf[i, i] = Ξ©f[i, i] / sin(Ξ©f[i, i] * t)
        Si[i, i] = Ξ©i[i, i] / sin(Ξ©i[i, i] * tp)
        Bf[i, i] = Ξ©f[i, i] / tan(Ξ©f[i, i] * t)
        Bi[i, i] = Ξ©i[i, i] / tan(Ξ©i[i, i] * tp)
    end
    BB = (Bf + J' * Bi * J)
    AA = (Sf + J' * Si * J)

    W = [BB -AA
        -AA BB] #We wont calculate the 2Nx2N determinant directly.
        # U = Bi - Si
        V = [J' * (Bi-Si) * D
            J' * (Bi-Si) * D]
#    Wapp = BB * (BB - AA * (BB \ AA)) # this is unique to GDSO

In this code block you should also avoid repeated calculations. You did fix some of them in GDSO, but 1) there are more (see V) and 2) you didn't fix them in GSV.

Well, as you can see, you only removed 22MiB out of 429MiB, which is not a really huge amount…
Why it is slower I don't know; maybe you made it more inefficient in the process?
Note that this mul!(Wapp_i1,-AA,BBinvAA) still allocates a new matrix for -AA. You need to do mul!(Wapp_i1,AA,BBinvAA,-1,0) instead.
These lines you didn't touch, but they also allocate a lot:

    V = [J' * (Bi-Si) * D
        J' * (Bi-Si) * D]
    F1 = sqrt(det(Si * Sf * (Wapp \ ρ^2)))
    F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * (Bi-Si) * D))

And GSV you didn't touch either? Then this benchmark might also tell you that GSV is where the code spends most of its time.

Yes, I shall make the same improvements in the other places and report back. However, all I did so far was change some multiplications into mul!, thereby reusing memory. If anything, this should increase the speed, right? Where would this "inefficiency" that I supposedly created come from?

Yes, in principle avoiding as much allocation as possible should speed it up. The "inefficiency" I referred to was the mul!(Wapp_i1,-AA,BBinvAA).
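For reference, a small self-contained check of that 5-argument form (toy sizes; the names just mirror the ones in GDSO): mul!(C, A, B, alpha, beta) computes C .= alpha*A*B + beta*C in place, so the -AA temporary is never formed.

using LinearAlgebra

AA      = rand(ComplexF64, 5, 5)
BBinvAA = rand(ComplexF64, 5, 5)
Wapp_i1 = rand(ComplexF64, 5, 5)
mul!(Wapp_i1, AA, BBinvAA, -1, 0)       # overwrites Wapp_i1 with -AA*BBinvAA, no temporary
@assert isapprox(Wapp_i1, -AA * BBinvAA)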

Also, I am not sure the TimerOutputs measure anything meaningful here. Could you, as a further test, just comment out the @distributed loop and run this instead:

# in calc
# ...
#    @timeit to "GT Calculation" x = @distributed vcat for ichunk in 1:nchunks
#        xrange,chunknum = xrange_chunknum[ichunk]
#        worker_function(xrange, t, b, Ξ”, sysize, Ξ©f, Ξ©i, J, D, P, T, Hso)
#    end
    xrange = 1:dd   # when running serially, just process all time points in this one process
    @timeit to "GT Calculation" x = worker_function(xrange, t, b, Ξ”, sysize, Ξ©f, Ξ©i, J, D, P, T, Hso)
# ... rest of code

This just makes sure that we get the correct allocation amounts (the work inside @distributed runs on the worker processes, so the TimerOutput on the main process would not see those allocations).

The "small" benchmarks that I just reported were all run using the serial version (without any type of parallelization) so that we can focus on things inherent to the structure of the code, so I do think the results are meaningful. I did make the change using the 5-parameter mul! as you suggested, and it decreased the allocations a bit more, as expected. It still seemed to have little effect on the speed at this scale:

anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc$ julia 2serial.jl 
Number of processes = 1
GKS: Possible loss of precision in routine SET_WINDOW
The rate constant is 42009.18966679349
 ───────────────────────────────────────────────────────────────────────────
                                   Time                    Allocations      
                          ───────────────────────   ────────────────────────
     Tot / % measured:         66.0s /  83.2%           1.65GiB /  23.9%    

 Section          ncalls     time    %tot     avg     alloc    %tot      avg
 ───────────────────────────────────────────────────────────────────────────
 GT calculation        1    54.9s  100.0%   54.9s    405MiB  100.0%   405MiB
 ───────────────────────────────────────────────────────────────────────────

However, on trying to move ahead with using mul! everywhere, I have encountered a peculiar issue: replacing the V matrix initialization with the following, which should in principle allocate less memory,

    # V = [J' * (Bi-Si) * D
    #      J' * (Bi-Si) * D]
    BimSi = Bi - Si
    mul!(V_int1,BimSi,D)
    mul!(V_int2,J',V_int1)
    V[1:sysize,1] .= V_int2
    V[sysize+1:2*sysize,1] .= V_int2

now instead increases the allocations (!?). The new allocation numbers are given below:

anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc$ julia 2serial.jl 
Number of processes = 1
The rate constant is 42009.18966679341
 ───────────────────────────────────────────────────────────────────────────
                                   Time                    Allocations      
                          ───────────────────────   ────────────────────────
     Tot / % measured:         67.8s /  83.8%           1.70GiB /  24.8%    

 Section          ncalls     time    %tot     avg     alloc    %tot      avg
 ───────────────────────────────────────────────────────────────────────────
 GT calculation        1    56.8s  100.0%   56.8s    432MiB  100.0%   432MiB
 ───────────────────────────────────────────────────────────────────────────

That is very, very weird indeed… and I don't understand why, to be honest. The replacement you show should allocate only the BimSi matrix. Stuff like this can happen if you use global variables, but I didn't see any; in particular, sysize should be local. But maybe double-check that everything you use is indeed a local variable.
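As a tiny illustration of the global-variable point (every name below is made up, none of it is from your code): a non-const global is type-unstable, so a function that reads it allocates, while the const version does not.

n_nonconst = 1_000                         # non-const global
const n_const = 1_000

f_bad()  = sum(i -> i + 1, 1:n_nonconst)   # reads the non-const global
f_good() = sum(i -> i + 1, 1:n_const)      # reads the const global
f_bad(); f_good()                          # compile first
println(@allocated(f_bad()), " vs ", @allocated(f_good()), " bytes")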

However I have 2 comments:

  • V apparently has size (2*sysize, 1), but this is a Matlab-ism AFAIK. In Julia you can have proper vectors, so it would be better to make V have size (2*sysize,) instead.
  • You can save on some of those intermediate arrays by multiplying directly into the correct memory, e.g. you can remove V_int2:

mul!(view(V, 1:sysize), J', V_int1)
V[sysize+1:end] .= @view V[1:sysize]

The mul! is incompatible with the data-type that view(..) outputs.

About the "weird" thing: I did check that sysize is initially defined as a const global variable, but it is then passed as a parameter to calc(..), so I guess that makes it local. Below is the current working version of the code (I also removed some allocations from GSV by reusing variables that are already set up by GDSO).

using MKL
using DelimitedFiles
using LinearAlgebra
using LaTeXStrings
using TimerOutputs
using Plots
using Plots.PlotMeasures
using FFTW
using SharedArrays
using TensorOperations
BLAS.set_num_threads(1)
const to = TimerOutput()
println("Number of processes = 1")
open("ISC-SERIAL.TXT", "a") do f
    write(f, "\n\n\n====================================================\n\n\n")
end

@views function GDSO(
    t::Float64,
    b::Float64,
    Ξ”::Float64,
    sysize::Int64,
    Ξ©f::Diagonal{Float64},
    Ξ©i::Diagonal{Float64},
    J::Matrix{Float64},
    D::Matrix{Float64},
    ρ::Diagonal{Float64},
    Si::Diagonal{ComplexF64},
    Sf::Diagonal{ComplexF64},
    Bi::Diagonal{ComplexF64},
    Bf::Diagonal{ComplexF64},
    JpBi::Matrix{ComplexF64},
    JpSi::Matrix{ComplexF64},
    BB::Matrix{ComplexF64},
    AA::Matrix{ComplexF64},
    W::Matrix{ComplexF64},
    Wapp::Matrix{ComplexF64},
    Wapp_i1::Matrix{ComplexF64},
    V::Matrix{ComplexF64},
    V_int1::Matrix{ComplexF64},
    V_int2::Matrix{ComplexF64}
    
)
    t = t * 4.1341373335e16                     # converting time to atomic units
    tp = -im * b - t
    for i = 1:sysize
        Sf[i, i] = Ξ©f[i, i] / sin(Ξ©f[i, i] * t)
        Si[i, i] = Ξ©i[i, i] / sin(Ξ©i[i, i] * tp)
        Bf[i, i] = Ξ©f[i, i] / tan(Ξ©f[i, i] * t)
        Bi[i, i] = Ξ©i[i, i] / tan(Ξ©i[i, i] * tp)
    end
    # BB = (Bf + J' * Bi * J)
    # AA = (Sf + J' * Si * J)
    mul!(JpBi,J',Bi)
    mul!(BB,JpBi,J)
    BB .+= Bf
    mul!(JpSi,J',Si)
    mul!(AA,JpSi,J)
    AA .+= Sf

    # W = [BB -AA
    #     -AA BB] 
    W[1:sysize,1:sysize] .= BB
    W[1:sysize,sysize+1:2*sysize] .= -AA
    W[sysize+1:2*sysize,1:sysize] .= -AA
    W[sysize+1:2*sysize,sysize+1:2*sysize] .= BB

    #We wont calculate the 2Nx2N determinant directly.
    # Wapp = BB * (BB - AA * (BB \ AA))
    BBinvAA = BB \ AA
    mul!(Wapp_i1,AA,BBinvAA,-1,0)
    Wapp_i1 .+= BB
    mul!(Wapp,BB,Wapp_i1)

    # V = [J' * (Bi-Si) * D
    #      J' * (Bi-Si) * D]
    BimSi = Bi - Si
    mul!(V_int1,BimSi,D)
    mul!(V_int2,J',V_int1)
    V[1:sysize,1] .= V_int2
    V[sysize+1:2*sysize,1] .= V_int2

    F1 = sqrt(det(Si * Sf * (Wapp \ ρ^2)))
    F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * (Bi-Si) * D))
    g = F1 * exp(F2)
    Ξ” = Ξ” * 0.0367493  # in atomic units
    g = g * exp(im * Ξ” * t)
    Ξ· = 2 * 4.55633e-6  # 10 cm^-1 to hatree energy
    g = g * exp(-Ξ· * t^2) 
    return g[1]
end

@views function GSV(
    t::Float64,
    b::Float64,
    sysize::Int64,
    Ξ©f::Diagonal{Float64},
    Ξ©i::Diagonal{Float64},
    J::Matrix{Float64},
    D::Matrix{Float64},
    Si::Diagonal{ComplexF64},
    Sf::Diagonal{ComplexF64},
    Bi::Diagonal{ComplexF64},
    Bf::Diagonal{ComplexF64},
    gdsopart::ComplexF64,
    T::Matrix{Float64},       ## Product if NAC and SO 
    Hso::Float64,
    X::Matrix{ComplexF64},
    Y::Matrix{ComplexF64},
    W::Matrix{ComplexF64},
    V::Matrix{ComplexF64}
)

    ################################## Preliminary Part , Common for both Frank Condon and Herzberg Teller ##############################
    t = t * 4.1341373335e16                     # converting time to atomic units
    tp = -im * b - t
    # for i = 1:sysize
    #     Sf[i, i] = Ξ©f[i, i] / sin(Ξ©f[i, i] * t)
    #     Si[i, i] = Ξ©i[i, i] / sin(Ξ©i[i, i] * tp)
    #     Bf[i, i] = Ξ©f[i, i] / tan(Ξ©f[i, i] * t)
    #     Bi[i, i] = Ξ©i[i, i] / tan(Ξ©i[i, i] * tp)
    # end
    # W = [(Bf+J'*Bi*J) -(Sf + J' * Si * J)
    #     -(Sf + J' * Si * J) (Bf+J'*Bi*J)]
    # # U = Bi - Si
    # V = [J' * (Bi-Si) * D
    #     J' * (Bi-Si) * D]




    ################# Final Calculation and returning value ##################
    s = zero(ComplexF64)
    MJSJ = J' * Si * J
    MJBJ = J' * Bi * J
    MJUD = J' * (Bi-Si) * D
    Win = inv(W)
    WinV = Win * V
    XWin = similar(Win)
    for m in 1:sysize
        for mp in 1:sysize
            ########## Defining the X and Y Matrices ##################################
            X[m, 1:sysize] .= (MJSJ)[mp, :] .* (-Bf[m, m])
            X[m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (Bf[m, m])
            X[sysize+m, 1:sysize] .= (MJSJ)[mp, :] .* (Sf[m, m])
            X[sysize+m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (-Sf[m, m])

            Y[m, 1] = Bf[m, m] * (MJUD)[mp, 1]
            Y[m+sysize, 1] = -Sf[m, m] * (MJUD)[mp, 1]

            g = im * (tr(mul!(XWin, X, Win))  + (transpose(WinV)*X*WinV)[1] - (transpose(Y)*WinV)[1])
            s = s + g * gdsopart * T[m, mp]

        end
        X[m, 1:sysize] .= 0
        X[m, sysize+1:2*sysize] .= 0
        X[sysize+m, 1:sysize] .= 0
        X[sysize+m, sysize+1:2*sysize] .= 0
        Y[m, 1] = 0
        Y[m+sysize, 1] = 0
    end
    s = s + abs(Hso)^2 * gdsopart   #### LATEST MODIFICATION
    return s
end

const hso_s1t3 = 1.49 * 0.0000046  ## SOC between S1 and T3 = 0.48 cm-1, converting to atomic units
const Delta = 0.11 # in eV
const Delta_s1t3 = 0.091 * 0.037 # Difference between S1 and T3 = 0.091 eV in atomic units
const dd = 22                            # Change this to modify number of points
const Temp = 300
kT_temp = Temp * 1.380649e-23  # Boltzmann constant times temperature in Joules
const kT = kT_temp * 2.29371227840e17         # in atomic units
const b = 1 / kT
const omegaf = Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n'))) * 4.55633e-6   # Final state  frequencies in atomic units
const omegai = Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n'))) * 4.55633e-6  # Initial state  frequencies in atomic units
const sysize = size(omegai)[1]
const P = @. 2 * sinh(b * omegai / 2)
T_temp = readdlm("nac.txt", '\t', Float64, '\n') # NAC matrix
const T = (T_temp * T_temp')* (hso_s1t3^2/Delta_s1t3^2)
const D = readdlm("d.txt", Float64) # Displacement Matrix
const J = readdlm("j.txt", Float64) # Duchinsky Matrix
const t = collect(range(-5e-12, stop=5e-12, length=dd))
t[div(dd + 1, 2)] = 10e-25

function calc(t, b, Ξ”, sysize, Ξ©f, Ξ©i, J, D, P)
    # @sync tells Julia to wait for any tasks started inside it
    open("ISC-SERIAL.TXT", "w") do f
        write(f, "STARTING LOOP WITH NUMBER OF CORES = 1 \n====================================================\n")
    end
    x = zeros(ComplexF64, dd)
    Sf = Diagonal(zeros(ComplexF64, sysize))
    Si = Diagonal(zeros(ComplexF64, sysize))
    Bf = Diagonal(zeros(ComplexF64, sysize))
    Bi = Diagonal(zeros(ComplexF64, sysize))
    X = zeros(ComplexF64, 2 * sysize, 2 * sysize)
    Y = zeros(ComplexF64, 2 * sysize, 1)
    BB = zeros(ComplexF64, sysize, sysize)
    AA = zeros(ComplexF64, sysize, sysize)
    JpBi = zeros(ComplexF64, sysize, sysize)
    JpSi = zeros(ComplexF64, sysize, sysize)
    W = zeros(ComplexF64, 2 * sysize, 2 * sysize)
    Wapp = zeros(ComplexF64, sysize, sysize)
    Wapp_i1 = zeros(ComplexF64, sysize, sysize)
    V =  zeros(ComplexF64, 2 * sysize,1)
    V_int1 = zeros(ComplexF64, sysize,1)
    V_int2 = zeros(ComplexF64, sysize,1)
    Hso = 0.48 * 0.0000046  ## SOC between S1 and T2 = 0.48 cm-1, converting to atomic units
    @timeit to "GT calculation" for i in 1:dd
        gdsopart = GDSO(t[i], b, Ξ”, sysize, Ξ©f, Ξ©i, J, D, P, Si, Sf, Bi, Bf,JpBi,JpSi,BB,AA,W,Wapp,Wapp_i1,V,V_int1,V_int2)
        gtotal = GSV(t[i], b, sysize, Ξ©f, Ξ©i, J, D, Si, Sf, Bi, Bf,gdsopart, T,Hso,X,Y,W,V)
        x[i] = gtotal
        open("ISC-SERIAL.TXT", "a") do f
            write(f, "ITERATION $i COMPLETED BY PROCESS 1 \n")
        end
    end # loop
    open("ISC-SERIAL.TXT", "a") do f
        write(f, "LOOP COMPLETE \n====================================================")
    end
    yr = real.(x)
    yi = imag.(x)
    p = plot(t, yr, linewidth=2, color="blue", title="Ξ” = $Ξ”", label="Real part", grid=false)
    p = plot!(t, yi, linewidth=2, color="red", label="Imaginary part", size=(1000, 800), xlims=(-30e-15, 30e-15))
    savefig(p, "gt.svg")
    begin # This part does FFTW and finds final rate constant

        fs = abs(t[1] - t[2])^-1
        signal = yr
        p = plan_fft(signal)
        F = fftshift(p * signal)
        freqs = fftshift(fftfreq(length(t), fs))  # X axis has frequencies in s^-1
        #Covert freqs to eV
        freqs = freqs * 4.13558e-15
        F = F * 6.57e15
        time_domain = plot(t, signal, title=L"G(t)", legend=false, color="orange", xlabel=L"\mathrm{Time}\ (s)", ylabel=L"\mathrm{A.u}")
        freq_domain = plot(freqs, abs.(F), title=L"\mathrm{Fourier\ transformed\ g(\bar{\nu}))}", legend=false, xlabel=L"\mathrm{Energy\ (eV)}", ylabel=L"\mathrm{k_{IC}\ (s^{-1})}", xlims=(0, 4))
        p = plot(time_domain, freq_domain, layout=(2, 1), size=(1000, 800), right_margin=10mm)
        savefig(p, "gt-fft.svg")
        ee = Ξ”# * 8065.73  #Converting eV to cm^-1
        index = findmin(abs.(freqs .- ee))[2]  # Find the index of freq closest to ee
        freqs[index]
        println("The rate constant is ", abs.(F)[index])
        rc = abs.(F)[index]
    end
#################################################This part writes the output to a file#######################################################
    begin 
        open("ISC-SERIAL.TXT", "a") do f
            write(f, "-------------------------\n")
            write(f, "NUMBER OF CORES = 1\n")
            write(f, "The value of Ξ” is $Ξ”\n")
            write(f, "Number of time points = $dd\n")
            write(f, "The rate constant calculated from Julia FFTW = $rc\n")
            write(f, "-------------------------\n")
            write(f, "The values of t,Re(g(t)),Im(g(t)) are \n\n")
            for i in 1:dd
                write(f, "$(t[i])  $(yr[i])  $(yi[i])\n")
            end
        end

        open("GT-VALUES-SERIAL.txt", "w") do f
            for i in 1:dd
                write(f, "$(t[i])  $(yr[i])  $(yi[i])\n")
            end
        end

    end

end


calc(t, b, Delta, sysize, omegaf, omegai, J, D, P)
display(to)

And weirdly enough, even though the allocations have decreased further as expected thanks to the changes to GSV, the code seems to be getting even slower!! Below are the benchmarks:

anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc$ julia 2serial.jl 
Number of processes = 1
GKS: Possible loss of precision in routine SET_WINDOW
The rate constant is 42009.189510825505
 ───────────────────────────────────────────────────────────────────────────
                                   Time                    Allocations      
                          ───────────────────────   ────────────────────────
     Tot / % measured:         73.8s /  82.9%           1.65GiB /  23.5%    

 Section          ncalls     time    %tot     avg     alloc    %tot      avg
 ───────────────────────────────────────────────────────────────────────────
 GT calculation        1    61.2s  100.0%   61.2s    398MiB  100.0%   398MiB
 ───────────────────────────────────────────────────────────────────────────

Would you be interested in using MPI with Julia as well via MPI.jl?


No?

julia> using LinearAlgebra

julia> M = rand(10,10); V=rand(10); X=zeros(20);

julia> mul!(view(X,1:10), M, V)
10-element view(::Vector{Float64}, 1:10) with eltype Float64:
 3.0683134866939032
 2.227376927237792
 2.013449475616838
 1.8554901380575193
 1.606656183847676
 2.014155104694141
 2.882426788235068
 2.300075183314828
 2.2319169643448094
 1.7444395747531267

Something is very, very wrong. The allocations decreased only very little! You started with 412MiB or so and now it's still 398MiB. You should try to profile in more detail. Try littering the GDSO and GSV functions with more @timeit statements, so we can try to locate the source of these allocations.
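A rough, self-contained sketch of what that instrumentation could look like (to_demo, setup_part and loop_part are made-up names; in the real code the sections would go around the corresponding blocks inside GDSO and GSV, using the existing to timer):

using TimerOutputs, LinearAlgebra

const to_demo = TimerOutput()            # separate timer just for this sketch

setup_part(A, B) = inv(A * B)            # stand-in for the W/V/inv(W) setup
loop_part(C)     = sum(abs2, C)          # stand-in for the m, mp double loop

function gsv_like(A, B)
    C = @timeit to_demo "setup" setup_part(A, B)
    s = @timeit to_demo "inner loop" loop_part(C)
    return s
end

gsv_like(rand(50, 50), rand(50, 50))
display(to_demo)                          # reports time and allocations per section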


Yes, I am in the process of doing exactly that. One thing we can be sure of already is that almost all of the allocation is coming from the GSV part, as shown by the extra @timeit annotations I have added on the function calls:

anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc$ julia 2serial.jl 
Number of processes = 1
GKS: Possible loss of precision in routine SET_WINDOW
The rate constant is 42009.18862179286
 ───────────────────────────────────────────────────────────────────────────
                                   Time                    Allocations      
                          ───────────────────────   ────────────────────────
     Tot / % measured:         69.5s /  82.9%           1.61GiB /  22.5%    

 Section          ncalls     time    %tot     avg     alloc    %tot      avg
 ───────────────────────────────────────────────────────────────────────────
 GT calculation        1    57.6s  100.0%   57.6s    370MiB  100.0%   370MiB
   GSV                22    57.6s  100.0%   2.62s    347MiB   93.7%  15.8MiB
   GDSO               22   25.5ms    0.0%  1.16ms   23.3MiB    6.3%  1.06MiB
 ───────────────────────────────────────────────────────────────────────────

Another thing that might be important is that the calculated value changes slightly on reusing the W and V variables. The reused version reports (time, real_part, imaginary_part)

1.0e-24  6.394092795554566e-12  4.2656647577696985e-16

Whereas the older version reports

1.0e-24  6.394092636498152e-12  4.265664651939934e-16

But they should be exactly the same right?

The version of the code that reuses some variables set by GDSO in GSV is given below

using MKL
using DelimitedFiles
using LinearAlgebra
using LaTeXStrings
using TimerOutputs
using Plots
using Plots.PlotMeasures
using FFTW
using SharedArrays
using TensorOperations
BLAS.set_num_threads(1)
const to = TimerOutput()
println("Number of processes = 1")
open("ISC-SERIAL.TXT", "a") do f
    write(f, "\n\n\n====================================================\n\n\n")
end

@views function GDSO(
    t::Float64,
    b::Float64,
    Ξ”::Float64,
    sysize::Int64,
    Ξ©f::Diagonal{Float64},
    Ξ©i::Diagonal{Float64},
    J::Matrix{Float64},
    D::Matrix{Float64},
    ρ::Diagonal{Float64},
    Si::Diagonal{ComplexF64},
    Sf::Diagonal{ComplexF64},
    Bi::Diagonal{ComplexF64},
    Bf::Diagonal{ComplexF64},
    JpBi::Matrix{ComplexF64},
    JpSi::Matrix{ComplexF64},
    BB::Matrix{ComplexF64},
    AA::Matrix{ComplexF64},
    W::Matrix{ComplexF64},
    Wapp::Matrix{ComplexF64},
    Wapp_i1::Matrix{ComplexF64},
    V::Matrix{ComplexF64},
    V_int1::Matrix{ComplexF64},
    V_int2::Matrix{ComplexF64},
    BimSi::Diagonal{ComplexF64}
    
)
    t = t * 4.1341373335e16                     # converting time to atomic units
    tp = -im * b - t
    for i = 1:sysize
        Sf[i, i] = Ξ©f[i, i] / sin(Ξ©f[i, i] * t)
        Si[i, i] = Ξ©i[i, i] / sin(Ξ©i[i, i] * tp)
        Bf[i, i] = Ξ©f[i, i] / tan(Ξ©f[i, i] * t)
        Bi[i, i] = Ξ©i[i, i] / tan(Ξ©i[i, i] * tp)
    end
    # BB = (Bf + J' * Bi * J)
    # AA = (Sf + J' * Si * J)
    mul!(JpBi,J',Bi)
    mul!(BB,JpBi,J)
    BB .+= Bf
    mul!(JpSi,J',Si)
    mul!(AA,JpSi,J)
    AA .+= Sf

    # W = [BB -AA
    #     -AA BB] 
    W[1:sysize,1:sysize] .= BB
    W[1:sysize,sysize+1:2*sysize] .= -AA
    W[sysize+1:2*sysize,1:sysize] .= -AA
    W[sysize+1:2*sysize,sysize+1:2*sysize] .= BB

    #We wont calculate the 2Nx2N determinant directly.
    # Wapp = BB * (BB - AA * (BB \ AA))
    BBinvAA = BB \ AA
    mul!(Wapp_i1,AA,BBinvAA,-1,0)
    Wapp_i1 .+= BB
    mul!(Wapp,BB,Wapp_i1)

    # V = [J' * (Bi-Si) * D
    #      J' * (Bi-Si) * D]
    BimSi .= Bi - Si
    mul!(V_int1,BimSi,D)
    mul!(V_int2,J',V_int1)
    V[1:sysize,1] .= V_int2
    V[sysize+1:2*sysize,1] .= V_int2

    F1 = sqrt(det(Si * Sf * (Wapp \ ρ^2)))
    F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * (Bi-Si) * D))
    g = F1 * exp(F2)
    Ξ” = Ξ” * 0.0367493  # in atomic units
    g = g * exp(im * Ξ” * t)
    Ξ· = 2 * 4.55633e-6  # 10 cm^-1 to hatree energy
    g = g * exp(-Ξ· * t^2) 
    return g[1]
end

@views function GSV(
    t::Float64,
    b::Float64,
    sysize::Int64,
    Ξ©f::Diagonal{Float64},
    Ξ©i::Diagonal{Float64},
    J::Matrix{Float64},
    D::Matrix{Float64},
    Si::Diagonal{ComplexF64},
    Sf::Diagonal{ComplexF64},
    Bi::Diagonal{ComplexF64},
    Bf::Diagonal{ComplexF64},
    gdsopart::ComplexF64,
    T::Matrix{Float64},       ## Product if NAC and SO 
    Hso::Float64,
    X::Matrix{ComplexF64},
    Y::Matrix{ComplexF64},
    W::Matrix{ComplexF64},
    V::Matrix{ComplexF64}
)

    ################################## Preliminary Part , Common for both Frank Condon and Herzberg Teller ##############################
    t = t * 4.1341373335e16                     # converting time to atomic units
    # tp = -im * b - t
    # for i = 1:sysize
    #     Sf[i, i] = Ξ©f[i, i] / sin(Ξ©f[i, i] * t)
    #     Si[i, i] = Ξ©i[i, i] / sin(Ξ©i[i, i] * tp)
    #     Bf[i, i] = Ξ©f[i, i] / tan(Ξ©f[i, i] * t)
    #     Bi[i, i] = Ξ©i[i, i] / tan(Ξ©i[i, i] * tp)
    # end
    # W = [(Bf+J'*Bi*J) -(Sf + J' * Si * J)
    #     -(Sf + J' * Si * J) (Bf+J'*Bi*J)]
    # # U = Bi - Si
    # V = [J' * (Bi-Si) * D
    #     J' * (Bi-Si) * D]




    ################# Final Calculation and returning value ##################
    s = zero(ComplexF64)
    MJSJ = J' * Si * J
    MJBJ = J' * Bi * J
    MJUD = J' * (Bi-Si) * D
    Win = inv(W)
    WinV = Win * V
    XWin = similar(Win)
    for m in 1:sysize
        for mp in 1:sysize
            ########## Defining the X and Y Matrices ##################################
            X[m, 1:sysize] .= (MJSJ)[mp, :] .* (-Bf[m, m])
            X[m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (Bf[m, m])
            X[sysize+m, 1:sysize] .= (MJSJ)[mp, :] .* (Sf[m, m])
            X[sysize+m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (-Sf[m, m])

            Y[m, 1] = Bf[m, m] * (MJUD)[mp, 1]
            Y[m+sysize, 1] = -Sf[m, m] * (MJUD)[mp, 1]

            g = im * (tr(mul!(XWin, X, Win))  + (transpose(WinV)*X*WinV)[1] - (transpose(Y)*WinV)[1])
            s = s + g * gdsopart * T[m, mp]

        end
        X[m, 1:sysize] .= 0
        X[m, sysize+1:2*sysize] .= 0
        X[sysize+m, 1:sysize] .= 0
        X[sysize+m, sysize+1:2*sysize] .= 0
        Y[m, 1] = 0
        Y[m+sysize, 1] = 0
    end
    s = s + abs(Hso)^2 * gdsopart   #### LATEST MODIFICATION
    return s
end

const hso_s1t3 = 1.49 * 0.0000046  ## SOC between S1 and T3 = 0.48 cm-1, converting to atomic units
const Delta = 0.11 # in eV
const Delta_s1t3 = 0.091 * 0.037 # Difference between S1 and T3 = 0.091 eV in atomic units
const dd = 22                            # Change this to modify number of points
const Temp = 300
kT_temp = Temp * 1.380649e-23  # Boltzmann constant times temperature in Joules
const kT = kT_temp * 2.29371227840e17         # in atomic units
const b = 1 / kT
const omegaf = Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n'))) * 4.55633e-6   # Final state  frequencies in atomic units
const omegai = Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n'))) * 4.55633e-6  # Initial state  frequencies in atomic units
const sysize = size(omegai)[1]
const P = @. 2 * sinh(b * omegai / 2)
T_temp = readdlm("nac.txt", '\t', Float64, '\n') # NAC matrix
const T = (T_temp * T_temp')* (hso_s1t3^2/Delta_s1t3^2)
const D = readdlm("d.txt", Float64) # Displacement Matrix
const J = readdlm("j.txt", Float64) # Duchinsky Matrix
const t = collect(range(-5e-12, stop=5e-12, length=dd))
t[div(dd + 1, 2)] = 10e-25

function calc(t, b, Ξ”, sysize, Ξ©f, Ξ©i, J, D, P)
    # @sync tells Julia to wait for any tasks started inside it
    open("ISC-SERIAL.TXT", "w") do f
        write(f, "STARTING LOOP WITH NUMBER OF CORES = 1 \n====================================================\n")
    end
    x = zeros(ComplexF64, dd)
    Sf = Diagonal(zeros(ComplexF64, sysize))
    Si = Diagonal(zeros(ComplexF64, sysize))
    Bf = Diagonal(zeros(ComplexF64, sysize))
    Bi = Diagonal(zeros(ComplexF64, sysize))
    X = zeros(ComplexF64, 2 * sysize, 2 * sysize)
    Y = zeros(ComplexF64, 2 * sysize, 1)
    BB = zeros(ComplexF64, sysize, sysize)
    AA = zeros(ComplexF64, sysize, sysize)
    JpBi = zeros(ComplexF64, sysize, sysize)
    JpSi = zeros(ComplexF64, sysize, sysize)
    W = zeros(ComplexF64, 2 * sysize, 2 * sysize)
    Wapp = zeros(ComplexF64, sysize, sysize)
    Wapp_i1 = zeros(ComplexF64, sysize, sysize)
    V =  zeros(ComplexF64, 2 * sysize,1)
    V_int1 = zeros(ComplexF64, sysize,1)
    V_int2 = zeros(ComplexF64, sysize,1)
    BimSi = Diagonal(zeros(ComplexF64, sysize, sysize))
    Hso = 0.48 * 0.0000046  ## SOC between S1 and T2 = 0.48 cm-1, converting to atomic units
    @timeit to "GT calculation" for i in 1:dd
        @timeit to "GDSO" gdsopart = GDSO(t[i], b, Ξ”, sysize, Ξ©f, Ξ©i, J, D, P, Si, Sf, Bi, Bf,JpBi,JpSi,BB,AA,W,Wapp,Wapp_i1,V,V_int1,V_int2,BimSi)
        @timeit to "GSV" gtotal = GSV(t[i], b, sysize, Ξ©f, Ξ©i, J, D, Si, Sf, Bi, Bf,gdsopart, T,Hso,X,Y,W,V)
        x[i] = gtotal
        open("ISC-SERIAL.TXT", "a") do f
            write(f, "ITERATION $i COMPLETED BY PROCESS 1 \n")
        end
    end # loop
    open("ISC-SERIAL.TXT", "a") do f
        write(f, "LOOP COMPLETE \n====================================================")
    end
    yr = real.(x)
    yi = imag.(x)
    p = plot(t, yr, linewidth=2, color="blue", title="Ξ” = $Ξ”", label="Real part", grid=false)
    p = plot!(t, yi, linewidth=2, color="red", label="Imaginary part", size=(1000, 800), xlims=(-30e-15, 30e-15))
    savefig(p, "gt.svg")
    begin # This part does FFTW and finds final rate constant

        fs = abs(t[1] - t[2])^-1
        signal = yr
        p = plan_fft(signal)
        F = fftshift(p * signal)
        freqs = fftshift(fftfreq(length(t), fs))  # X axis has frequencies in s^-1
        #Covert freqs to eV
        freqs = freqs * 4.13558e-15
        F = F * 6.57e15
        time_domain = plot(t, signal, title=L"G(t)", legend=false, color="orange", xlabel=L"\mathrm{Time}\ (s)", ylabel=L"\mathrm{A.u}")
        freq_domain = plot(freqs, abs.(F), title=L"\mathrm{Fourier\ transformed\ g(\bar{\nu}))}", legend=false, xlabel=L"\mathrm{Energy\ (eV)}", ylabel=L"\mathrm{k_{IC}\ (s^{-1})}", xlims=(0, 4))
        p = plot(time_domain, freq_domain, layout=(2, 1), size=(1000, 800), right_margin=10mm)
        savefig(p, "gt-fft.svg")
        ee = Ξ”# * 8065.73  #Converting eV to cm^-1
        index = findmin(abs.(freqs .- ee))[2]  # Find the index of freq closest to ee
        freqs[index]
        println("The rate constant is ", abs.(F)[index])
        rc = abs.(F)[index]
    end
#################################################This part writes the output to a file#######################################################
    begin 
        open("ISC-SERIAL.TXT", "a") do f
            write(f, "-------------------------\n")
            write(f, "NUMBER OF CORES = 1\n")
            write(f, "The value of Ξ” is $Ξ”\n")
            write(f, "Number of time points = $dd\n")
            write(f, "The rate constant calculated from Julia FFTW = $rc\n")
            write(f, "-------------------------\n")
            write(f, "The values of t,Re(g(t)),Im(g(t)) are \n\n")
            for i in 1:dd
                write(f, "$(t[i])  $(yr[i])  $(yi[i])\n")
            end
        end

        open("GT-VALUES-SERIAL.txt", "w") do f
            for i in 1:dd
                write(f, "$(t[i])  $(yr[i])  $(yi[i])\n")
            end
        end

    end

end


calc(t, b, Delta, sysize, omegaf, omegai, J, D, P)
display(to)

And about the view(..) suggestion:

On modifying my code as follows, so as to use the view(..) approach,

    # V = [J' * (Bi-Si) * D
    #      J' * (Bi-Si) * D]
    BimSi .= Bi - Si
    mul!(V_int1,BimSi,D)
    mul!(view(V, 1:sysize),J',V_int1)
    V[sysize+1:end] .=  V[1:sysize]
    # mul!(V_int2,J',V_int1)
    # V[1:sysize,1] .= V_int2
    # V[sysize+1:2*sysize,1] .= V_int2

I seem to be getting the following error

ERROR: LoadError: MethodError: no method matching mul!(::SubArray{ComplexF64, 1, Vector{ComplexF64}, Tuple{UnitRange{Int64}}, true}, ::Adjoint{Float64, Matrix{Float64}}, ::Matrix{ComplexF64}, ::Bool, ::Bool)

Closest candidates are:
  mul!(::AbstractVecOrMat, ::UniformScaling, ::AbstractVecOrMat, ::Number, ::Number)
   @ LinearAlgebra ~/.julia/juliaup/julia-1.10.4+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/uniformscaling.jl:284
  mul!(::AbstractVector, ::AbstractVecOrMat, ::AbstractVector, ::Number, ::Number)
   @ LinearAlgebra ~/.julia/juliaup/julia-1.10.4+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/matmul.jl:66
  mul!(::AbstractMatrix, ::AbstractVecOrMat, ::AbstractVecOrMat, ::Number, ::Number)
   @ LinearAlgebra ~/.julia/juliaup/julia-1.10.4+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/matmul.jl:263
  ...

It seems that your V_int1 is a matrix, and then this error could mean that you tried to multiply two matrices and store the result in a vector.
A quick fix would be to use view(V, 1:sysize, 1:1), but it would be better to just use vectors for things that are vectors, i.e. make V_int1 a vector instead.

If it is nothing like that, then it is possible that there is a missing dispatch in LinearAlgebra.jl which happens from time to time especially with complicated combinations of wrappers, views and the like.
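A quick check of both options with toy sizes (all names below are illustrative, not taken from the actual code):

using LinearAlgebra

sysize = 10
J      = rand(sysize, sysize)

V_mat  = zeros(ComplexF64, 2sysize, 1)
V_int1 = rand(ComplexF64, sysize, 1)               # Matrix, as in the current code
mul!(view(V_mat, 1:sysize, 1:1), J', V_int1)       # quick fix: matrix-shaped view destination

V_vec     = zeros(ComplexF64, 2sysize)
V_int1vec = rand(ComplexF64, sysize)               # nicer: plain vectors for vector quantities
mul!(view(V_vec, 1:sysize), J', V_int1vec)
V_vec[sysize+1:end] .= @view V_vec[1:sysize]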

Not necessarily. It might be that you changed the order of floating-point operations by restructuring the code. That can very well lead to tiny changes in the output due to rounding.

Btw, this exchanges the order of multiplication of the matrices, I think. So that could be a place where the rounding goes differently and causes your final results to deviate ever so slightly.
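A tiny example of that reordering effect:

a, b, c = 0.1, 0.2, 0.3
println((a + b) + c == a + (b + c))           # false
println(abs((a + b) + c - (a + (b + c))))     # ~1e-16, i.e. machine-epsilon sized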

This causes another problem, as BimSi is a Diagonal data type. In the REPL you can see:

julia> V_int1 = rand(Float64,10)
10-element Vector{Float64}:
 0.41512070416026536
 0.4405220835063167
 0.25054297797334435
 0.7442274181037184
 0.5705993211669493
 0.29762828607905023
 0.9684752525026395
 0.08052415976379501
 0.49427664484057166
 0.20143404991616232

julia> BimSi = Diagonal(ones(Float64,10,10))
ERROR: UndefVarError: `Diagonal` not defined
Stacktrace:
 [1] top-level scope
   @ REPL[2]:1

julia> using LinearAlgebra

julia> BimSi = Diagonal(ones(Float64,10,10))
10×10 Diagonal{Float64, Vector{Float64}}:
 1.0   ⋅    ⋅    ⋅    ⋅    ⋅    ⋅    ⋅    ⋅    ⋅
  ⋅   1.0   ⋅    ⋅    ⋅    ⋅    ⋅    ⋅    ⋅    ⋅
  ⋅    ⋅   1.0   ⋅    ⋅    ⋅    ⋅    ⋅    ⋅    ⋅
  ⋅    ⋅    ⋅   1.0   ⋅    ⋅    ⋅    ⋅    ⋅    ⋅
  ⋅    ⋅    ⋅    ⋅   1.0   ⋅    ⋅    ⋅    ⋅    ⋅
  ⋅    ⋅    ⋅    ⋅    ⋅   1.0   ⋅    ⋅    ⋅    ⋅
  ⋅    ⋅    ⋅    ⋅    ⋅    ⋅   1.0   ⋅    ⋅    ⋅
  ⋅    ⋅    ⋅    ⋅    ⋅    ⋅    ⋅   1.0   ⋅    ⋅
  ⋅    ⋅    ⋅    ⋅    ⋅    ⋅    ⋅    ⋅   1.0   ⋅
  ⋅    ⋅    ⋅    ⋅    ⋅    ⋅    ⋅    ⋅    ⋅   1.0

julia> D = rand(Float64,10,1)
10×1 Matrix{Float64}:
 0.7193882161972547
 0.4714200397295284
 0.1365427136917623
 0.8935252225472667
 0.9779577907398268
 0.5907581975550894
 0.31000619896170867
 0.03765584126275556
 0.11379442907643522
 0.9612948902973085

julia> mul!(V_int1,BimSi,D)
ERROR: MethodError: no method matching mul!(::Vector{Float64}, ::Diagonal{Float64, Vector{Float64}}, ::Matrix{Float64}, ::Bool, ::Bool)

Closest candidates are:
  mul!(::AbstractVecOrMat, ::UniformScaling, ::AbstractVecOrMat, ::Number, ::Number)
   @ LinearAlgebra ~/.julia/juliaup/julia-1.10.4+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/uniformscaling.jl:284
  mul!(::AbstractVector, ::Union{Bidiagonal, Diagonal, SymTridiagonal, Tridiagonal}, ::AbstractVector, ::Number, ::Number)
   @ LinearAlgebra ~/.julia/juliaup/julia-1.10.4+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/bidiag.jl:425
  mul!(::AbstractVector, ::AbstractVecOrMat, ::AbstractVector, ::Number, ::Number)
   @ LinearAlgebra ~/.julia/juliaup/julia-1.10.4+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/matmul.jl:66
  ...

And as for your comment on the rounding:

Although this is true, that was not the point I was making. The output changes slightly after I make GSV reuse the W and V matrices from GDSO; simply reusing those matrices should not change anything compared to the version that does not reuse them. Because if using mul! caused the change, we would have noticed it the moment we used mul! to define W and V in GDSO, right? But instead we only notice it when GSV is made to reuse those variables instead of redefining them.

The answer to that is likely yes. MPI provides lots of capabilities that Distributed.jl currently doesn't take advantage of. Instead of using Distributed.jl directly, using Dagger.jl might help.

Generally speaking, the question I would have for you is: are you using the same parallelization strategy with MPI and with Distributed? MPI often encourages a spatial decomposition that lends itself to scaling (a rough MPI.jl sketch of the round-robin split the Fortran code uses is given after this reply).

While that is true for multi-threading (and less so than it used to be, due to the parallel GC), allocations have much less of an impact in distributed settings and are likely a red herring here.
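To make the MPI.jl suggestion above a bit more concrete, here is a rough sketch (assuming MPI.jl and an mpiexec launch; compute_point is a made-up stand-in for the GDSO + GSV work at one time index) of the same round-robin split over time points that the Fortran code uses:

using MPI

MPI.Init()
comm   = MPI.COMM_WORLD
rank   = MPI.Comm_rank(comm)                  # 0-based, like MYID in the Fortran code
nprocs = MPI.Comm_size(comm)

dd = 41                                       # toy number of time points
compute_point(i) = ComplexF64(i, 0)           # stand-in for the real per-point kernel

x = zeros(ComplexF64, dd)
for i in (rank + 1):nprocs:dd                 # same round-robin pattern as DO K=NP1,N,NP
    x[i] = compute_point(i)
end
x = MPI.Allreduce(x, +, comm)                 # each index was filled by exactly one rank
rank == 0 && println("collected ", count(!iszero, x), " of ", dd, " points")
MPI.Finalize()

Whether this actually scales better than the @distributed version would of course have to be measured; the point is only that the work division can be made identical to the Fortran code's.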


D is a matrix as you initialized it with

julia> D = rand(Float64,10,1)
10×1 Matrix{Float64}: # it says Matrix because it has 2 dimensions!

Just make it a proper vector:

julia> D = rand(Float64,10)
10-element Vector{Float64}:

then the mul! works perfectly fine.

I see. Well, I don't see why that should make a difference; the way you computed W and V in GSV and GDSO seems identical to me.
However, you did change the computation of W and V to use mul! in GDSO (which you claim did not change the results) and then transferred these possibly slightly different W and V to GSV. So I think you just slightly changed the order of floating-point operations, which had a small effect on the results (of the order of machine epsilon). I don't think you need to worry.


Yes, that worked! The results are again a bit different (after the 5th decimal place), but you said that is not something we need to worry about. The allocations also decreased further, as expected:

anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc$ julia 2serial.jl 
Number of processes = 1
GKS: Possible loss of precision in routine SET_WINDOW
The rate constant is 42009.188313308856
 ───────────────────────────────────────────────────────────────────────────
                                   Time                    Allocations      
                          ───────────────────────   ────────────────────────
     Tot / % measured:         67.5s /  89.0%           1.26GiB /  28.0%    

 Section          ncalls     time    %tot     avg     alloc    %tot      avg
 ───────────────────────────────────────────────────────────────────────────
 GT calculation        1    60.1s  100.0%   60.1s    361MiB  100.0%   361MiB
   GSV                22    60.0s  100.0%   2.73s    338MiB   93.6%  15.3MiB
   GDSO               22   27.4ms    0.0%  1.25ms   22.9MiB    6.4%  1.04MiB
 ───────────────────────────────────────────────────────────────────────────

But the code seems to be getting slower and slower as the allocations decrease. I have no idea why at this point.

I will try to get an implementation that uses Dagger.jl instead; however, I am completely unfamiliar with the package and thus do not know how much code change that would warrant.

Also, the official Dagger.jl page reports that they have some "thread safety issues" when using distributed computing.

This confuses me as well. However, we are still at ~90% of the original allocations, so I don't think we have gotten to the bottom of it yet. I think we need more data, so I'd suggest that you profile more closely by putting a lot of @timeit statements inside GSV, so we can see what takes the bulk of the time and then work on optimizing that. It might turn out that the allocations are not causing the slow speed after all (especially in a multi-process setting, as pointed out by @vchuravy).