# How to improve the scaling of Julia code aimed at multi-node parallelization?

I have been trying to write a version of my Julia code that can be run in a parallelized manner using multiple nodes at once. After posting this thread I finally have a working version of the code (as given below)

Code that uses Distributed.jl
using Distributed
@everywhere begin
using MKL
using DelimitedFiles
using LinearAlgebra
using LaTeXStrings
using TimerOutputs
using Plots
using Plots.PlotMeasures
using FFTW
using SharedArrays
using ChunkSplitters
end
const to = TimerOutput()
println("Number of processes = $(Distributed.nworkers())") open("ISC-DISTRIBUTED.TXT", "a") do f write(f, "\n\n\n====================================================\n\n\n") end @everywhere function GDSO( t::Float64, b::Float64, Ξ::Float64, sysize::Int64, Ξ©f::Diagonal{Float64}, Ξ©i::Diagonal{Float64}, J::Matrix{Float64}, D::Matrix{Float64}, Ο::Diagonal{Float64}, Si::Diagonal{ComplexF64}, Sf::Diagonal{ComplexF64}, Bi::Diagonal{ComplexF64}, Bf::Diagonal{ComplexF64} ) t = t * 4.1341373335e16 # converting time to atomic units tp = -im * b - t for i = 1:sysize Sf[i, i] = Ξ©f[i, i] / sin(Ξ©f[i, i] * t) Si[i, i] = Ξ©i[i, i] / sin(Ξ©i[i, i] * tp) Bf[i, i] = Ξ©f[i, i] / tan(Ξ©f[i, i] * t) Bi[i, i] = Ξ©i[i, i] / tan(Ξ©i[i, i] * tp) end BB = (Bf + J' * Bi * J) AA = (Sf + J' * Si * J) W = [BB -AA -AA BB] #We wont calculate the 2Nx2N determinant directly. Wapp = BB * (BB - AA * (BB \ AA)) # U = Bi - Si V = [J' * (Bi-Si) * D J' * (Bi-Si) * D] F1 = sqrt(det(Si * Sf * (Wapp \ Ο^2))) F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * (Bi-Si) * D)) g = F1 * exp(F2) Ξ = Ξ * 0.0367493 # in atomic units g = g * exp(im * Ξ * t) Ξ· = 2 * 4.55633e-6 # 10 cm^-1 to hatree energy g = g * exp(-Ξ· * t^2) return g[1] end @everywhere @views function GSV( t::Float64, b::Float64, sysize::Int64, Ξ©f::Diagonal{Float64}, Ξ©i::Diagonal{Float64}, J::Matrix{Float64}, D::Matrix{Float64}, Si::Diagonal{ComplexF64}, Sf::Diagonal{ComplexF64}, Bi::Diagonal{ComplexF64}, Bf::Diagonal{ComplexF64}, gdsopart::ComplexF64, T::Matrix{Float64}, ## Product if NAC and SO Hso::Float64, X::Matrix{ComplexF64}, Y::Matrix{ComplexF64} ) ################################## Preliminary Part , Common for both Frank Condon and Herzberg Teller ############################## t = t * 4.1341373335e16 # converting time to atomic units tp = -im * b - t for i = 1:sysize Sf[i, i] = Ξ©f[i, i] / sin(Ξ©f[i, i] * t) Si[i, i] = Ξ©i[i, i] / sin(Ξ©i[i, i] * tp) Bf[i, i] = Ξ©f[i, i] / tan(Ξ©f[i, i] * t) Bi[i, i] = Ξ©i[i, i] / tan(Ξ©i[i, i] * tp) end W = [(Bf+J'*Bi*J) -(Sf + J' * Si * J) -(Sf + J' * Si * J) (Bf+J'*Bi*J)] # U = Bi - Si V = [J' * (Bi-Si) * D J' * (Bi-Si) * D] ########## Defining the X and Y Matrices ################################## ################# Final Calculation and returning value ################## s = zero(ComplexF64) MJSJ = J' * Si * J MJBJ = J' * Bi * J MJUD = J' * (Bi-Si) * D Win = inv(W) WinV = Win * V XWin = similar(Win) for m in 1:sysize for mp in 1:sysize X[m, 1:sysize] .= (MJSJ)[mp, :] .* (-Bf[m, m]) X[m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (Bf[m, m]) X[sysize+m, 1:sysize] .= (MJSJ)[mp, :] .* (Sf[m, m]) X[sysize+m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (-Sf[m, m]) Y[m, 1] = Bf[m, m] * (MJUD)[mp, 1] Y[m+sysize, 1] = -Sf[m, m] * (MJUD)[mp, 1] g = im * (tr(mul!(XWin, X, Win)) + (transpose(WinV)*X*WinV)[1] - (transpose(Y)*WinV)[1]) s = s + g * gdsopart * T[m, mp] end X[m, 1:sysize] .= 0 X[m, sysize+1:2*sysize] .= 0 X[sysize+m, 1:sysize] .= 0 X[sysize+m, sysize+1:2*sysize] .= 0 Y[m, 1] = 0 Y[m+sysize, 1] = 0 end s = s + abs(Hso)^2 * gdsopart #### LATEST MODIFICATION return s end @everywhere begin const hso_s1t3 = 1.49 * 0.0000046 ## SOC between S1 and T3 = 0.48 cm-1, converting to atomic units const Delta = 0.11 # in eV const Delta_s1t3 = 0.091 * 0.037 # Difference between S1 and T3 = 0.091 eV in atomic units const dd = 41 # Change this to modify number of points const Temp = 300 kT_temp = Temp * 1.380649e-23 # Boltzmann constant times temperature in Joules const kT = kT_temp * 2.29371227840e17 # in atomic units const b = 1 / kT const omegaf = Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n'))) * 4.55633e-6 # Final state frequencies in atomic units const omegai = Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n'))) * 4.55633e-6 # Initial state frequencies in atomic units const sysize = size(omegai)[1] const P = @. 2 * sinh(b * omegai / 2) T_temp = readdlm("nac.txt", '\t', Float64, '\n') # NAC matrix const T = (T_temp * T_temp')* (hso_s1t3^2/Delta_s1t3^2) const D = readdlm("d.txt", Float64) # Displacement Matrix const J = readdlm("j.txt", Float64) # Duchinsky Matrix const t = collect(range(-5e-12, stop=5e-12, length=dd)) t[div(dd + 1, 2)] = 10e-25 end @everywhere function worker_function(xrange, t, b, Ξ, sysize, Ξ©f, Ξ©i, J, D, P, T, Hso) Sf = Diagonal(zeros(ComplexF64, sysize)) Si = Diagonal(zeros(ComplexF64, sysize)) Bf = Diagonal(zeros(ComplexF64, sysize)) Bi = Diagonal(zeros(ComplexF64, sysize)) X = zeros(ComplexF64, 2 * sysize, 2 * sysize) Y = zeros(ComplexF64, 2 * sysize, 1) x_worker = zeros(ComplexF64, length(xrange)) for (local_xind,main_xind) in enumerate(xrange) gdsopart = GDSO(t[main_xind], b, Ξ, sysize, Ξ©f, Ξ©i, J, D, P, Si, Sf, Bi, Bf) gtotal = GSV(t[main_xind], b, sysize, Ξ©f, Ξ©i, J, D, Si, Sf, Bi, Bf, gdsopart, T, Hso, X, Y) x_worker[local_xind] = gtotal open("ISC-DISTRIBUTED.TXT", "a") do f write(f, "ITERATION$main_xind COMPLETED BY PROCESS $(Distributed.myid()) \n") end end return x_worker end function calc(t, b, Ξ, sysize, Ξ©f, Ξ©i, J, D, P; nchunks=Distributed.nworkers()) open("ISC-DISTRIBUTED.TXT", "w") do f write(f, "STARTING LOOP WITH NUMBER OF CORES =$(Distributed.nworkers()) \n====================================================\n")
end
x = zeros(ComplexF64, dd)
Hso = 0.48 * 0.0000046  ## SOC between S1 and T2 = 0.48 cm-1, converting to atomic units
xrange_chunknum = chunks(range(1, dd), nchunks)
@timeit to "GT Calculation" x = @distributed vcat for ichunk in 1:nchunks
xrange,chunknum = xrange_chunknum[ichunk]
worker_function(xrange, t, b, Ξ, sysize, Ξ©f, Ξ©i, J, D, P, T, Hso)
end
open("ISC-DISTRIBUTED.TXT", "a") do f
write(f, "LOOP COMPLETE \n====================================================")
end
yr = real.(x)
yi = imag.(x)
p = plot(t, yr, linewidth=2, color="blue", title="Ξ = $Ξ", label="Real part", grid=false) p = plot!(t, yi, linewidth=2, color="red", label="Imaginary part", size=(1000, 800), xlims=(-30e-15, 30e-15)) savefig(p, "gt.svg") begin # This part does FFTW and finds final rate constant fs = abs(t[1] - t[2])^-1 signal = yr p = plan_fft(signal) F = fftshift(p * signal) freqs = fftshift(fftfreq(length(t), fs)) # X axis has frequencies in s^-1 #Covert freqs to eV freqs = freqs * 4.13558e-15 F = F * 6.57e15 time_domain = plot(t, signal, title=L"G(t)", legend=false, color="orange", xlabel=L"\mathrm{Time}\ (s)", ylabel=L"\mathrm{A.u}") freq_domain = plot(freqs, abs.(F), title=L"\mathrm{Fourier\ transformed\ g(\bar{\nu}))}", legend=false, xlabel=L"\mathrm{Energy\ (eV)}", ylabel=L"\mathrm{k_{IC}\ (s^{-1})}", xlims=(0, 4)) p = plot(time_domain, freq_domain, layout=(2, 1), size=(1000, 800), right_margin=10mm) savefig(p, "gt-fft.svg") ee = Ξ# * 8065.73 #Converting eV to cm^-1 index = findmin(abs.(freqs .- ee))[2] # Find the index of freq closest to ee freqs[index] println("The rate constant is ", abs.(F)[index]) rc = abs.(F)[index] end #################################################This part writes the output to a file####################################################### begin open("ISC-DISTRIBUTED.TXT", "a") do f write(f, "-------------------------\n") write(f, "NUMBER OF CORES =$(Distributed.nworkers())\n")
write(f, "The value of Ξ is $Ξ\n") write(f, "Number of time points =$dd\n")
write(f, "The rate constant calculated from Julia FFTW = $rc\n") write(f, "-------------------------\n") write(f, "The values of t,Re(g(t)),Im(g(t)) are \n\n") for i in 1:dd write(f, "$(t[i])  $(yr[i])$(yi[i])\n")
end
end

open("GT-VALUES-DISTRIBTUED.txt", "w") do f
for i in 1:dd
write(f, "$(t[i])$(yr[i])  $(yi[i])\n") end end end end calc(t, b, Delta, sysize, omegaf, omegai, J, D, P; nchunks=Distributed.nworkers()) display(to)  The aim is to develop a version of this code that runs faster than the Fortran equivalent. Keeping in mind the fact that the serial version of our code is already faster than the Fortran equivalent (see this thread), now the next thing to do is to make it capable of multi-node parallelization that the Fortran code achieves using MPI. Previously, we have seen that the version of the code that uses Base.Threads outperforms the MPI parallelized Fortran code when ran on a single node. But as you might already be about to point out, that is not a fair comparison since the threaded version cannot achieve multi node parallelization. For this the version using Distributed.jl was developed. But it seems that the scaling right now is pretty bad. |Code Version | Number of Cores | Time Taken | ------------------------------------------------------------------- |Julia [Base.Threads] | 40 | 39hrs | |Julia [Distributed.jl] | 80 [40 + 40 using 2 nodes] | 31hrs | |Fortran [Using MPI] | 80 [40 + 40 using 2 nodes] | <20 hrs |  I am aware that the Benchmarks provided are not ideal, it is not fair to expect the Distributed.jl version of the code (when using 80 total cores) to complete the task in half the amount of time taken by the Base.Threads version (which was using 40 cores). As you can understand, the times are very long and hence not too many tests could be ran on the cluster I am using. Is the bad scaling in question a consequence of some bad code design I have implemented ? Or does it reflect the difference in speeds of MPI and Distributed.jl, do I have to use MPI.jl to get comparable speeds to this? Any advice on how to improve the scaling is greatly appreciated. 2 Likes Likely what holds you back the most are all the allocations. Allocations are notorious for killing scaling in parallelization. See the short snippet below for example: A rough estimate says that you allocate ~18 new matrices. And this is neither the full code nor a careful analysis. I commented about his before but you didnβt want to change the structure too much, so it might be hard to optimize further. Are the parameters realistic in the snippet above? If they are you might need to rethink your parallelization strategy alltogether because there are only 41 (determined by dd) jobs so having 80 workers does not benefit you. The value of dd used was 20001, the snippetβs dd=41 reflects a smaller benchmark that I was using to get a first estimate. And if the problem is the structure of the code β which uses smaller matrices to define large matrices causing more allocations. Shouldnβt the Fortran code also be affected by that ? Quoting the innermost loop of the Fortran code (the link to which I have provided)  NP1=MYID+1 !MYID = PROCESSOR ID, NP1=TEMPORARY PROCESSOR ID DO K=NP1,N,NP DO M=1,N DO I=1,N DO J=1,N IF(I.EQ.K)THEN G11(I,J)=-BS_COMPLEX(K,K)*X2(M,J) G12(I,J)=BS_COMPLEX(K,K)*X4(M,J) G21(I,J)=AS_COMPLEX(K,K)*X2(M,J) G22(I,J)=-AS_COMPLEX(K,K)*X4(M,J) ELSE G11(I,J)=(0.0D0,0.0D0) G12(I,J)=(0.0D0,0.0D0) G21(I,J)=(0.0D0,0.0D0) G22(I,J)=(0.0D0,0.0D0) ENDIF ENDDO IF(I.EQ.K)THEN H1(I,1)=BS_COMPLEX(K,K)*V1(M,1) H2(I,1)=-AS_COMPLEX(K,K)*V1(M,1) ELSE H1(I,1)=(0.0D0,0.0D0) H2(I,1)=(0.0D0,0.0D0) ENDIF ENDDO G(1:N,1:N)=G11 G(1:N,N+1:N1)=G12 G(N+1:N1,1:N)=G21 G(N+1:N1,N+1:N1)=G22 H(1:N,1:N)=H1 H(N+1:N1,1:N)=H2 !TRANSPOSE OF THE H VECTOR DO I=1,N1 H_TRANS(1,I)=H(I,1) ENDDO !MATRIX MULTIPLICATION FOR THE SPIN-VIBRONIC PART !THIS IS FOR THE Tr(GK^(-1))+ [(K^(-1)F)^(T)G(K^(-1)F)]-H^(T)K^(-1)F CALL ZGEMM('N1','N1',N1,N1,N1,(1.0D0,0.0D0),G,N1,AK_INV,N1, & (0.0D0,0.0D0),GK_INV,N1) CALL ZGEMM('N1','N1',N1,1,N1,(1.0D0,0.0D0),AK_INV,N1,F,N1, & (0.0D0,0.0D0),AKINV_F,N1) DO I=1,N1 AKINV_F_TRANS(1,I)=AKINV_F(I,1) !TRANSPOSE OF AKINV_F i.e. K^(-1)F ENDDO CALL ZGEMM('N1','N1',1,N1,N1,(1.0D0,0.0D0),AKINV_F_TRANS,1,G,N1, & (0.0D0,0.0D0),AKINV_T_G,1) CALL ZGEMM('N1','N1',1,1,N1,(1.0D0,0.0D0),AKINV_T_G,1,AKINV_F, & N1,(0.0D0,0.0D0),NUM3,1) CALL ZGEMM('N1','N1',1,1,N1,(1.0D0,0.0D0),H_TRANS,1,AKINV_F,N1, & (0.0D0,0.0D0),NUM4,1) TRACE_GK_INV=(0.0D0,0.0D0) DO I=1,N1 DO J=1,N1 IF(I.EQ.J)THEN TRACE_GK_INV=TRACE_GK_INV+(GK_INV(I,J)) ELSE TRACE_GK_INV=TRACE_GK_INV+(0.0D0,0.0D0) ENDIF ENDDO ENDDO TRACE_GK_INV_REAL=REAL(TRACE_GK_INV) TRACE_GK_INV_IMAG=AIMAG(TRACE_GK_INV) TRACE=COMPLEX(-TRACE_GK_INV_IMAG,TRACE_GK_INV_REAL) !TOTAL SPIN-VIBRONIC PART SV=SV+((TRACE+NUM3(1,1)-NUM4(1,1))*TKK_COMPLEX(K,M)) ENDDO !END LOOP FOR K ENDDO !END LOOP FOR M  This also seems to use things like splicing and using smaller matrices to repeatedly define large matrices. Forgive me if my interpretation of the Fortran code is wrong, I am totally unfamiliar with Fortran. And if the allocations are what is holding me back, canβt I just pass scratch spaces into the function with the same name BB or AA and change the BB = into BB .= ? wouldnβt that now force the program to reuse the memory instead of allocating fresh ? The same problem can also be circumvented by using the @tensor macro right ? Edit 1 β I tried to reduce the allocations by using in-place multiplication provided by LinearAlgebra. But the code seems to become slower (?) The allocations decrease as expected. Here are the results anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc$ julia serial.jl
Number of processes = 1
GKS: Possible loss of precision in routine SET_WINDOW
The rate constant is 42009.18966679349
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Time                    Allocations
βββββββββββββββββββββββ   ββββββββββββββββββββββββ
Tot / % measured:         66.2s /  84.9%           1.51GiB /  27.8%

Section          ncalls     time    %tot     avg     alloc    %tot      avg
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
GT calculation        1    56.2s  100.0%   56.2s    429MiB  100.0%   429MiB
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc$julia 2serial.jl Number of processes = 1 GKS: Possible loss of precision in routine SET_WINDOW The rate constant is 42009.18966679349 βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ Time Allocations βββββββββββββββββββββββ ββββββββββββββββββββββββ Tot / % measured: 70.5s / 84.6% 1.58GiB / 25.1% Section ncalls time %tot avg alloc %tot avg βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ GT calculation 1 59.6s 100.0% 59.6s 407MiB 100.0% 407MiB βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ  The second one uses in-place multiplication and reuses memory in the part that you highlighted. Here is the modified GDSO function function GDSO( t::Float64, b::Float64, Ξ::Float64, sysize::Int64, Ξ©f::Diagonal{Float64}, Ξ©i::Diagonal{Float64}, J::Matrix{Float64}, D::Matrix{Float64}, Ο::Diagonal{Float64}, Si::Diagonal{ComplexF64}, Sf::Diagonal{ComplexF64}, Bi::Diagonal{ComplexF64}, Bf::Diagonal{ComplexF64}, JpBi::Matrix{ComplexF64}, JpSi::Matrix{ComplexF64}, BB::Matrix{ComplexF64}, AA::Matrix{ComplexF64}, W::Matrix{ComplexF64}, Wapp::Matrix{ComplexF64}, Wapp_i1::Matrix{ComplexF64} ) t = t * 4.1341373335e16 # converting time to atomic units tp = -im * b - t for i = 1:sysize Sf[i, i] = Ξ©f[i, i] / sin(Ξ©f[i, i] * t) Si[i, i] = Ξ©i[i, i] / sin(Ξ©i[i, i] * tp) Bf[i, i] = Ξ©f[i, i] / tan(Ξ©f[i, i] * t) Bi[i, i] = Ξ©i[i, i] / tan(Ξ©i[i, i] * tp) end # BB = (Bf + J' * Bi * J) # AA = (Sf + J' * Si * J) mul!(JpBi,J',Bi) mul!(BB,JpBi,J) BB .+= Bf mul!(JpSi,J',Si) mul!(AA,JpSi,J) AA .+= Sf # W = [BB -AA # -AA BB] W[1:sysize,1:sysize] .= BB W[1:sysize,sysize+1:2*sysize] .= -AA W[sysize+1:2*sysize,1:sysize] .= -AA W[sysize+1:2*sysize,sysize+1:2*sysize] .= BB #We wont calculate the 2Nx2N determinant directly. # Wapp = BB * (BB - AA * (BB \ AA)) BBinvAA = BB \ AA mul!(Wapp_i1,-AA,BBinvAA) Wapp_i1 .+= BB mul!(Wapp,BB,Wapp_i1) V = [J' * (Bi-Si) * D J' * (Bi-Si) * D] F1 = sqrt(det(Si * Sf * (Wapp \ Ο^2))) F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * (Bi-Si) * D)) g = F1 * exp(F2) Ξ = Ξ * 0.0367493 # in atomic units g = g * exp(im * Ξ * t) Ξ· = 2 * 4.55633e-6 # 10 cm^-1 to hatree energy g = g * exp(-Ξ· * t^2) return g[1] end  Please note - these benchmarks have been run with a small dd = 21 on my local machine in a serial fashion. But as far as I understand it does not allocate new matrices for every loop. I think the matrices are allocated only once for all runs which is exactly what one should do. You can see everything the Fortran code uses at the very top of the file:  REAL*8,DIMENSION(159,159)::RJ,RJT,OMEGAS,OMEGAT,BS,AS,AT1,TKK REAL*8,DIMENSION(159,1)::WS,WT,D,NAC REAL*8,DIMENSION(1,159)::DT,NAC_T COMPLEX*16,DIMENSION(159,1)::H1,H2,D_COMPLEX,V1 COMPLEX*16,DIMENSION(1,159)::DT_COMPLEX,V3 COMPLEX*16,DIMENSION(159,159)::A,B,AT,BT,BB,B_INV,AA,B2, & AA_INV,E2,E1,AB_INV,G11,G12,G21,G22,RJ_COMPLEX, & RJT_COMPLEX,X1,X2,X3,X4,X5,Y1,BS_COMPLEX, & OMEGAT_COMPLEX,AS_COMPLEX, & E,TKK_COMPLEX,BT2,AT2 COMPLEX*16,DIMENSION(318,318)::AK,AK_INV,G,GK_INV COMPLEX*16,DIMENSION(318,1)::F,H,AKINV_F COMPLEX*16,DIMENSION(1,318)::FT,AKINV_F_TRANS,AKINV_T_G, & H_TRANS,V2 REAL*8::TMIN,TMAX,HH,CM_AU,S_AU,T,SOC1,AMU_AU,CONST1,AU_HZ, & BETA,KB,KBT,TEMP,EV_AU,PF,AMP,DET_I,DET_R, & DELE1,NUM1R,NUM2R,NUM1I,NUM2I,NUMR,NUMI, & TRACE_GK_INV_REAL,TRACE_GK_INV_IMAG,K_RISC_DSO_REAL, CONST2,SOC2,DELE2,THETA,T1 COMPLEX*16::DET,TRACE_GK_INV,SV,RISC_DSO,RISC_SV,SV2, & NUM,TRACE,K_RISC_DSO,K_RISC_SV,SV1 COMPLEX*16,DIMENSION(120)::SV1 COMPLEX*16,DIMENSION(1,1)::NUM1,NUM2,NUM3,NUM4 INTEGER::I,J,K,M,K1,N,N1,M1,INFO,M2,NP,IERROR,MYID,NP1 INTEGER,DIMENSION(318)::IPIV COMPLEX*16,DIMENSION(318)::WORK  Also you repeat these calculations inside GDSO and GSV:  t = t * 4.1341373335e16 # converting time to atomic units tp = -im * b - t for i = 1:sysize Sf[i, i] = Ξ©f[i, i] / sin(Ξ©f[i, i] * t) Si[i, i] = Ξ©i[i, i] / sin(Ξ©i[i, i] * tp) Bf[i, i] = Ξ©f[i, i] / tan(Ξ©f[i, i] * t) Bi[i, i] = Ξ©i[i, i] / tan(Ξ©i[i, i] * tp) end BB = (Bf + J' * Bi * J) AA = (Sf + J' * Si * J) W = [BB -AA -AA BB] #We wont calculate the 2Nx2N determinant directly. # U = Bi - Si V = [J' * (Bi-Si) * D J' * (Bi-Si) * D] # Wapp = BB * (BB - AA * (BB \ AA)) # this is unique to GDSO  In this code block you should also avoid repeated calculation. You did fix some them in GDSO but 1) there are more (see V) and you didnβt fix it in GSV. Well as you can see you did remove 22MiB out of 429MiB which is not a really huge amountβ¦ Why it is slower I donβt know, maybe you made it more inefficient in the process? Note that this mul!(Wapp_i1,-AA,BBinvAA) still allocates a new matrix for -A. You need to do mul!(Wapp_i1,AA,BBinvAA,-1,0). These lines you didnβt touch but they also allocate a lot:  V = [J' * (Bi-Si) * D J' * (Bi-Si) * D] F1 = sqrt(det(Si * Sf * (Wapp \ Ο^2))) F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * (Bi-Si) * D))  And GSV you didnβt touch either? Then this benchmark might also tell you that GSV might be where the code spents more time. Yes I shall do the same improvements in the other places and report the improvement. However, all I did so far was change some multiplications into mul! thereby reusing memory. If anything this should increase the speed right? Where will this βinefficiencyβ that I supposedly created come from? Yes in principle avoiding as much allocation as possible should speed it up. The βinefficiencyβ I referred to was the mul!(Wapp_i1,-AA,BBinvAA). Also I am not sure if the TimerOutputs measure anything meaningful. Could you for further test just comment out the @distributed loop and run this instead: # in calc # ... # @timeit to "GT Calculation" x = @distributed vcat for ichunk in 1:nchunks # xrange,chunknum = xrange_chunknum[ichunk] # worker_function(xrange, t, b, Ξ, sysize, Ξ©f, Ξ©i, J, D, P, T, Hso) # end @timeit to "GT Calculation" x = worker_function(xrange, t, b, Ξ, sysize, Ξ©f, Ξ©i, J, D, P, T, Hso) # ... rest of code  This just makes it sure that we get the correct allocation amounts. The βsmallβ benchmarks that I just reported were all run using the serial version (without any type of parallelization) so that we can focus on things inherent to the structure of the code. So I do think the results are meaningful. I did make the change using the 5 parameter mul! as you suggested and it decreased the allocation a bit more as expected. It still seemed to not have much effect on the speed at this scale anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc$ julia 2serial.jl
Number of processes = 1
GKS: Possible loss of precision in routine SET_WINDOW
The rate constant is 42009.18966679349
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Time                    Allocations
βββββββββββββββββββββββ   ββββββββββββββββββββββββ
Tot / % measured:         66.0s /  83.2%           1.65GiB /  23.9%

Section          ncalls     time    %tot     avg     alloc    %tot      avg
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
GT calculation        1    54.9s  100.0%   54.9s    405MiB  100.0%   405MiB
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ


However, on trying to move ahead with using mul! everywhere, I have encountered this peculiar issue where replacing the V matrix initialization with something that should in principle allocate less memory as below

    # V = [J' * (Bi-Si) * D
#      J' * (Bi-Si) * D]
BimSi = Bi - Si
mul!(V_int1,BimSi,D)
mul!(V_int2,J',V_int1)
V[1:sysize,1] .= V_int2
V[sysize+1:2*sysize,1] .= V_int2


Now instead increases allocation (!?) with the new amount of allocations given below

anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc$julia 2serial.jl Number of processes = 1 The rate constant is 42009.18966679341 βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ Time Allocations βββββββββββββββββββββββ ββββββββββββββββββββββββ Tot / % measured: 67.8s / 83.8% 1.70GiB / 24.8% Section ncalls time %tot avg alloc %tot avg βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ GT calculation 1 56.8s 100.0% 56.8s 432MiB 100.0% 432MiB βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ  That is very very weird indeedβ¦ And I donβt understand why tbh. The replacement you show should allocate only the BimSi matrix. Stuff like this can happen if you use some global variables but I didnβt see any. Especially sysize should be local. But maybe double check that everything you use is indeed a local variable. However I have 2 comments: • V apparently has size (2sysize,1) but this is a matlabism AFAIK. In Julia you can have proper vectors so it would be better to make V have size (2sysize) instead. • You can save on some of those intermediate arrays by directly multiplying into the correct memory e.g. you can remove V_int2 mul!(view(V, 1:sysize)),J',V_int1) V[sysize+1:end] .= @view V[1:sysize]  The mul! is incompatible with the data-type that view(..) outputs. About the βweirdβ thing : I did check that initially sysize is initialized as a const global variable, but then it is passed as a parameter to calc(..) so I guess that makes it local. Below is the current working version of the code (I also removed some allocations from GSV that are already being done by GDSO and reused the variables). using MKL using DelimitedFiles using LinearAlgebra using LaTeXStrings using TimerOutputs using Plots using Plots.PlotMeasures using FFTW using SharedArrays using TensorOperations BLAS.set_num_threads(1) const to = TimerOutput() println("Number of processes = 1") open("ISC-SERIAL.TXT", "a") do f write(f, "\n\n\n====================================================\n\n\n") end @views function GDSO( t::Float64, b::Float64, Ξ::Float64, sysize::Int64, Ξ©f::Diagonal{Float64}, Ξ©i::Diagonal{Float64}, J::Matrix{Float64}, D::Matrix{Float64}, Ο::Diagonal{Float64}, Si::Diagonal{ComplexF64}, Sf::Diagonal{ComplexF64}, Bi::Diagonal{ComplexF64}, Bf::Diagonal{ComplexF64}, JpBi::Matrix{ComplexF64}, JpSi::Matrix{ComplexF64}, BB::Matrix{ComplexF64}, AA::Matrix{ComplexF64}, W::Matrix{ComplexF64}, Wapp::Matrix{ComplexF64}, Wapp_i1::Matrix{ComplexF64}, V::Matrix{ComplexF64}, V_int1::Matrix{ComplexF64}, V_int2::Matrix{ComplexF64} ) t = t * 4.1341373335e16 # converting time to atomic units tp = -im * b - t for i = 1:sysize Sf[i, i] = Ξ©f[i, i] / sin(Ξ©f[i, i] * t) Si[i, i] = Ξ©i[i, i] / sin(Ξ©i[i, i] * tp) Bf[i, i] = Ξ©f[i, i] / tan(Ξ©f[i, i] * t) Bi[i, i] = Ξ©i[i, i] / tan(Ξ©i[i, i] * tp) end # BB = (Bf + J' * Bi * J) # AA = (Sf + J' * Si * J) mul!(JpBi,J',Bi) mul!(BB,JpBi,J) BB .+= Bf mul!(JpSi,J',Si) mul!(AA,JpSi,J) AA .+= Sf # W = [BB -AA # -AA BB] W[1:sysize,1:sysize] .= BB W[1:sysize,sysize+1:2*sysize] .= -AA W[sysize+1:2*sysize,1:sysize] .= -AA W[sysize+1:2*sysize,sysize+1:2*sysize] .= BB #We wont calculate the 2Nx2N determinant directly. # Wapp = BB * (BB - AA * (BB \ AA)) BBinvAA = BB \ AA mul!(Wapp_i1,AA,BBinvAA,-1,0) Wapp_i1 .+= BB mul!(Wapp,BB,Wapp_i1) # V = [J' * (Bi-Si) * D # J' * (Bi-Si) * D] BimSi = Bi - Si mul!(V_int1,BimSi,D) mul!(V_int2,J',V_int1) V[1:sysize,1] .= V_int2 V[sysize+1:2*sysize,1] .= V_int2 F1 = sqrt(det(Si * Sf * (Wapp \ Ο^2))) F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * (Bi-Si) * D)) g = F1 * exp(F2) Ξ = Ξ * 0.0367493 # in atomic units g = g * exp(im * Ξ * t) Ξ· = 2 * 4.55633e-6 # 10 cm^-1 to hatree energy g = g * exp(-Ξ· * t^2) return g[1] end @views function GSV( t::Float64, b::Float64, sysize::Int64, Ξ©f::Diagonal{Float64}, Ξ©i::Diagonal{Float64}, J::Matrix{Float64}, D::Matrix{Float64}, Si::Diagonal{ComplexF64}, Sf::Diagonal{ComplexF64}, Bi::Diagonal{ComplexF64}, Bf::Diagonal{ComplexF64}, gdsopart::ComplexF64, T::Matrix{Float64}, ## Product if NAC and SO Hso::Float64, X::Matrix{ComplexF64}, Y::Matrix{ComplexF64}, W::Matrix{ComplexF64}, V::Matrix{ComplexF64} ) ################################## Preliminary Part , Common for both Frank Condon and Herzberg Teller ############################## t = t * 4.1341373335e16 # converting time to atomic units tp = -im * b - t # for i = 1:sysize # Sf[i, i] = Ξ©f[i, i] / sin(Ξ©f[i, i] * t) # Si[i, i] = Ξ©i[i, i] / sin(Ξ©i[i, i] * tp) # Bf[i, i] = Ξ©f[i, i] / tan(Ξ©f[i, i] * t) # Bi[i, i] = Ξ©i[i, i] / tan(Ξ©i[i, i] * tp) # end # W = [(Bf+J'*Bi*J) -(Sf + J' * Si * J) # -(Sf + J' * Si * J) (Bf+J'*Bi*J)] # # U = Bi - Si # V = [J' * (Bi-Si) * D # J' * (Bi-Si) * D] ################# Final Calculation and returning value ################## s = zero(ComplexF64) MJSJ = J' * Si * J MJBJ = J' * Bi * J MJUD = J' * (Bi-Si) * D Win = inv(W) WinV = Win * V XWin = similar(Win) for m in 1:sysize for mp in 1:sysize ########## Defining the X and Y Matrices ################################## X[m, 1:sysize] .= (MJSJ)[mp, :] .* (-Bf[m, m]) X[m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (Bf[m, m]) X[sysize+m, 1:sysize] .= (MJSJ)[mp, :] .* (Sf[m, m]) X[sysize+m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (-Sf[m, m]) Y[m, 1] = Bf[m, m] * (MJUD)[mp, 1] Y[m+sysize, 1] = -Sf[m, m] * (MJUD)[mp, 1] g = im * (tr(mul!(XWin, X, Win)) + (transpose(WinV)*X*WinV)[1] - (transpose(Y)*WinV)[1]) s = s + g * gdsopart * T[m, mp] end X[m, 1:sysize] .= 0 X[m, sysize+1:2*sysize] .= 0 X[sysize+m, 1:sysize] .= 0 X[sysize+m, sysize+1:2*sysize] .= 0 Y[m, 1] = 0 Y[m+sysize, 1] = 0 end s = s + abs(Hso)^2 * gdsopart #### LATEST MODIFICATION return s end const hso_s1t3 = 1.49 * 0.0000046 ## SOC between S1 and T3 = 0.48 cm-1, converting to atomic units const Delta = 0.11 # in eV const Delta_s1t3 = 0.091 * 0.037 # Difference between S1 and T3 = 0.091 eV in atomic units const dd = 22 # Change this to modify number of points const Temp = 300 kT_temp = Temp * 1.380649e-23 # Boltzmann constant times temperature in Joules const kT = kT_temp * 2.29371227840e17 # in atomic units const b = 1 / kT const omegaf = Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n'))) * 4.55633e-6 # Final state frequencies in atomic units const omegai = Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n'))) * 4.55633e-6 # Initial state frequencies in atomic units const sysize = size(omegai)[1] const P = @. 2 * sinh(b * omegai / 2) T_temp = readdlm("nac.txt", '\t', Float64, '\n') # NAC matrix const T = (T_temp * T_temp')* (hso_s1t3^2/Delta_s1t3^2) const D = readdlm("d.txt", Float64) # Displacement Matrix const J = readdlm("j.txt", Float64) # Duchinsky Matrix const t = collect(range(-5e-12, stop=5e-12, length=dd)) t[div(dd + 1, 2)] = 10e-25 function calc(t, b, Ξ, sysize, Ξ©f, Ξ©i, J, D, P) # @sync tells Julia to wait for any tasks started inside it open("ISC-SERIAL.TXT", "w") do f write(f, "STARTING LOOP WITH NUMBER OF CORES = 1 \n====================================================\n") end x = zeros(ComplexF64, dd) Sf = Diagonal(zeros(ComplexF64, sysize)) Si = Diagonal(zeros(ComplexF64, sysize)) Bf = Diagonal(zeros(ComplexF64, sysize)) Bi = Diagonal(zeros(ComplexF64, sysize)) X = zeros(ComplexF64, 2 * sysize, 2 * sysize) Y = zeros(ComplexF64, 2 * sysize, 1) BB = zeros(ComplexF64, sysize, sysize) AA = zeros(ComplexF64, sysize, sysize) JpBi = zeros(ComplexF64, sysize, sysize) JpSi = zeros(ComplexF64, sysize, sysize) W = zeros(ComplexF64, 2 * sysize, 2 * sysize) Wapp = zeros(ComplexF64, sysize, sysize) Wapp_i1 = zeros(ComplexF64, sysize, sysize) V = zeros(ComplexF64, 2 * sysize,1) V_int1 = zeros(ComplexF64, sysize,1) V_int2 = zeros(ComplexF64, sysize,1) Hso = 0.48 * 0.0000046 ## SOC between S1 and T2 = 0.48 cm-1, converting to atomic units @timeit to "GT calculation" for i in 1:dd gdsopart = GDSO(t[i], b, Ξ, sysize, Ξ©f, Ξ©i, J, D, P, Si, Sf, Bi, Bf,JpBi,JpSi,BB,AA,W,Wapp,Wapp_i1,V,V_int1,V_int2) gtotal = GSV(t[i], b, sysize, Ξ©f, Ξ©i, J, D, Si, Sf, Bi, Bf,gdsopart, T,Hso,X,Y,W,V) x[i] = gtotal open("ISC-SERIAL.TXT", "a") do f write(f, "ITERATION$i COMPLETED BY PROCESS 1 \n")
end
end # loop
open("ISC-SERIAL.TXT", "a") do f
write(f, "LOOP COMPLETE \n====================================================")
end
yr = real.(x)
yi = imag.(x)
p = plot(t, yr, linewidth=2, color="blue", title="Ξ = $Ξ", label="Real part", grid=false) p = plot!(t, yi, linewidth=2, color="red", label="Imaginary part", size=(1000, 800), xlims=(-30e-15, 30e-15)) savefig(p, "gt.svg") begin # This part does FFTW and finds final rate constant fs = abs(t[1] - t[2])^-1 signal = yr p = plan_fft(signal) F = fftshift(p * signal) freqs = fftshift(fftfreq(length(t), fs)) # X axis has frequencies in s^-1 #Covert freqs to eV freqs = freqs * 4.13558e-15 F = F * 6.57e15 time_domain = plot(t, signal, title=L"G(t)", legend=false, color="orange", xlabel=L"\mathrm{Time}\ (s)", ylabel=L"\mathrm{A.u}") freq_domain = plot(freqs, abs.(F), title=L"\mathrm{Fourier\ transformed\ g(\bar{\nu}))}", legend=false, xlabel=L"\mathrm{Energy\ (eV)}", ylabel=L"\mathrm{k_{IC}\ (s^{-1})}", xlims=(0, 4)) p = plot(time_domain, freq_domain, layout=(2, 1), size=(1000, 800), right_margin=10mm) savefig(p, "gt-fft.svg") ee = Ξ# * 8065.73 #Converting eV to cm^-1 index = findmin(abs.(freqs .- ee))[2] # Find the index of freq closest to ee freqs[index] println("The rate constant is ", abs.(F)[index]) rc = abs.(F)[index] end #################################################This part writes the output to a file####################################################### begin open("ISC-SERIAL.TXT", "a") do f write(f, "-------------------------\n") write(f, "NUMBER OF CORES = 1\n") write(f, "The value of Ξ is$Ξ\n")
write(f, "Number of time points = $dd\n") write(f, "The rate constant calculated from Julia FFTW =$rc\n")
write(f, "-------------------------\n")
write(f, "The values of t,Re(g(t)),Im(g(t)) are \n\n")
for i in 1:dd
write(f, "$(t[i])$(yr[i])  $(yi[i])\n") end end open("GT-VALUES-SERIAL.txt", "w") do f for i in 1:dd write(f, "$(t[i])  $(yr[i])$(yi[i])\n")
end
end

end

end

calc(t, b, Delta, sysize, omegaf, omegai, J, D, P)
display(to)



And weirdly enough, as expected the allocations have decreased further thanks to changes to GSV , the code seems to be getting even slower !! , below are the benchmarks

anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc$julia 2serial.jl Number of processes = 1 GKS: Possible loss of precision in routine SET_WINDOW The rate constant is 42009.189510825505 βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ Time Allocations βββββββββββββββββββββββ ββββββββββββββββββββββββ Tot / % measured: 73.8s / 82.9% 1.65GiB / 23.5% Section ncalls time %tot avg alloc %tot avg βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ GT calculation 1 61.2s 100.0% 61.2s 398MiB 100.0% 398MiB βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ  Would you be interested in using MPI with Julia as well via MPI.jl? 1 Like No? julia> using LinearAlgebra julia> M = rand(10,10); V=rand(10); X=zeros(20); julia> mul!(view(X,1:10), M, V) 10-element view(::Vector{Float64}, 1:10) with eltype Float64: 3.0683134866939032 2.227376927237792 2.013449475616838 1.8554901380575193 1.606656183847676 2.014155104694141 2.882426788235068 2.300075183314828 2.2319169643448094 1.7444395747531267  Something is very very wrong. The allocations did decrease only very very little! You started with 412MiB or so and now itβs still 398MiB. You should try to profile more in detail. Try littering the GDSO and GSV functions with more @timeit statements, so we can try and locate the source of these allocations. 1 Like Yes I am in the process of doing the same. One thing we can be sure of is that almost all the allocation is coming from the GSV part as shown by the extra @timeit I have added on the function calls anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc$ julia 2serial.jl
Number of processes = 1
GKS: Possible loss of precision in routine SET_WINDOW
The rate constant is 42009.18862179286
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Time                    Allocations
βββββββββββββββββββββββ   ββββββββββββββββββββββββ
Tot / % measured:         69.5s /  82.9%           1.61GiB /  22.5%

Section          ncalls     time    %tot     avg     alloc    %tot      avg
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
GT calculation        1    57.6s  100.0%   57.6s    370MiB  100.0%   370MiB
GSV                22    57.6s  100.0%   2.62s    347MiB   93.7%  15.8MiB
GDSO               22   25.5ms    0.0%  1.16ms   23.3MiB    6.3%  1.06MiB
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ


Another thing that might be important is that the calculated value changes slightly on reusing the W and V variables. The reused version reports (time, real_part, imaginary_part)

1.0e-24  6.394092795554566e-12  4.2656647577696985e-16


Whereas the older version reports

1.0e-24  6.394092636498152e-12  4.265664651939934e-16


But they should be exactly the same right?

The version of the code that reuses some variables set by GDSO in GSV is given below

using MKL
using DelimitedFiles
using LinearAlgebra
using LaTeXStrings
using TimerOutputs
using Plots
using Plots.PlotMeasures
using FFTW
using SharedArrays
using TensorOperations
const to = TimerOutput()
println("Number of processes = 1")
open("ISC-SERIAL.TXT", "a") do f
write(f, "\n\n\n====================================================\n\n\n")
end

@views function GDSO(
t::Float64,
b::Float64,
Ξ::Float64,
sysize::Int64,
J::Matrix{Float64},
D::Matrix{Float64},
Ο::Diagonal{Float64},
Si::Diagonal{ComplexF64},
Sf::Diagonal{ComplexF64},
Bi::Diagonal{ComplexF64},
Bf::Diagonal{ComplexF64},
JpBi::Matrix{ComplexF64},
JpSi::Matrix{ComplexF64},
BB::Matrix{ComplexF64},
AA::Matrix{ComplexF64},
W::Matrix{ComplexF64},
Wapp::Matrix{ComplexF64},
Wapp_i1::Matrix{ComplexF64},
V::Matrix{ComplexF64},
V_int1::Matrix{ComplexF64},
V_int2::Matrix{ComplexF64},
BimSi::Diagonal{ComplexF64}

)
t = t * 4.1341373335e16                     # converting time to atomic units
tp = -im * b - t
for i = 1:sysize
end
# BB = (Bf + J' * Bi * J)
# AA = (Sf + J' * Si * J)
mul!(JpBi,J',Bi)
mul!(BB,JpBi,J)
BB .+= Bf
mul!(JpSi,J',Si)
mul!(AA,JpSi,J)
AA .+= Sf

# W = [BB -AA
#     -AA BB]
W[1:sysize,1:sysize] .= BB
W[1:sysize,sysize+1:2*sysize] .= -AA
W[sysize+1:2*sysize,1:sysize] .= -AA
W[sysize+1:2*sysize,sysize+1:2*sysize] .= BB

#We wont calculate the 2Nx2N determinant directly.
# Wapp = BB * (BB - AA * (BB \ AA))
BBinvAA = BB \ AA
mul!(Wapp_i1,AA,BBinvAA,-1,0)
Wapp_i1 .+= BB
mul!(Wapp,BB,Wapp_i1)

# V = [J' * (Bi-Si) * D
#      J' * (Bi-Si) * D]
BimSi .= Bi - Si
mul!(V_int1,BimSi,D)
mul!(V_int2,J',V_int1)
V[1:sysize,1] .= V_int2
V[sysize+1:2*sysize,1] .= V_int2

F1 = sqrt(det(Si * Sf * (Wapp \ Ο^2)))
F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * (Bi-Si) * D))
g = F1 * exp(F2)
Ξ = Ξ * 0.0367493  # in atomic units
g = g * exp(im * Ξ * t)
Ξ· = 2 * 4.55633e-6  # 10 cm^-1 to hatree energy
g = g * exp(-Ξ· * t^2)
return g[1]
end

@views function GSV(
t::Float64,
b::Float64,
sysize::Int64,
J::Matrix{Float64},
D::Matrix{Float64},
Si::Diagonal{ComplexF64},
Sf::Diagonal{ComplexF64},
Bi::Diagonal{ComplexF64},
Bf::Diagonal{ComplexF64},
gdsopart::ComplexF64,
T::Matrix{Float64},       ## Product if NAC and SO
Hso::Float64,
X::Matrix{ComplexF64},
Y::Matrix{ComplexF64},
W::Matrix{ComplexF64},
V::Matrix{ComplexF64}
)

################################## Preliminary Part , Common for both Frank Condon and Herzberg Teller ##############################
t = t * 4.1341373335e16                     # converting time to atomic units
# tp = -im * b - t
# for i = 1:sysize
# end
# W = [(Bf+J'*Bi*J) -(Sf + J' * Si * J)
#     -(Sf + J' * Si * J) (Bf+J'*Bi*J)]
# # U = Bi - Si
# V = [J' * (Bi-Si) * D
#     J' * (Bi-Si) * D]

################# Final Calculation and returning value ##################
s = zero(ComplexF64)
MJSJ = J' * Si * J
MJBJ = J' * Bi * J
MJUD = J' * (Bi-Si) * D
Win = inv(W)
WinV = Win * V
XWin = similar(Win)
for m in 1:sysize
for mp in 1:sysize
########## Defining the X and Y Matrices ##################################
X[m, 1:sysize] .= (MJSJ)[mp, :] .* (-Bf[m, m])
X[m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (Bf[m, m])
X[sysize+m, 1:sysize] .= (MJSJ)[mp, :] .* (Sf[m, m])
X[sysize+m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (-Sf[m, m])

Y[m, 1] = Bf[m, m] * (MJUD)[mp, 1]
Y[m+sysize, 1] = -Sf[m, m] * (MJUD)[mp, 1]

g = im * (tr(mul!(XWin, X, Win))  + (transpose(WinV)*X*WinV)[1] - (transpose(Y)*WinV)[1])
s = s + g * gdsopart * T[m, mp]

end
X[m, 1:sysize] .= 0
X[m, sysize+1:2*sysize] .= 0
X[sysize+m, 1:sysize] .= 0
X[sysize+m, sysize+1:2*sysize] .= 0
Y[m, 1] = 0
Y[m+sysize, 1] = 0
end
s = s + abs(Hso)^2 * gdsopart   #### LATEST MODIFICATION
return s
end

const hso_s1t3 = 1.49 * 0.0000046  ## SOC between S1 and T3 = 0.48 cm-1, converting to atomic units
const Delta = 0.11 # in eV
const Delta_s1t3 = 0.091 * 0.037 # Difference between S1 and T3 = 0.091 eV in atomic units
const dd = 22                            # Change this to modify number of points
const Temp = 300
kT_temp = Temp * 1.380649e-23  # Boltzmann constant times temperature in Joules
const kT = kT_temp * 2.29371227840e17         # in atomic units
const b = 1 / kT
const omegaf = Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n'))) * 4.55633e-6   # Final state  frequencies in atomic units
const omegai = Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n'))) * 4.55633e-6  # Initial state  frequencies in atomic units
const sysize = size(omegai)[1]
const P = @. 2 * sinh(b * omegai / 2)
T_temp = readdlm("nac.txt", '\t', Float64, '\n') # NAC matrix
const T = (T_temp * T_temp')* (hso_s1t3^2/Delta_s1t3^2)
const D = readdlm("d.txt", Float64) # Displacement Matrix
const J = readdlm("j.txt", Float64) # Duchinsky Matrix
const t = collect(range(-5e-12, stop=5e-12, length=dd))
t[div(dd + 1, 2)] = 10e-25

# @sync tells Julia to wait for any tasks started inside it
open("ISC-SERIAL.TXT", "w") do f
write(f, "STARTING LOOP WITH NUMBER OF CORES = 1 \n====================================================\n")
end
x = zeros(ComplexF64, dd)
Sf = Diagonal(zeros(ComplexF64, sysize))
Si = Diagonal(zeros(ComplexF64, sysize))
Bf = Diagonal(zeros(ComplexF64, sysize))
Bi = Diagonal(zeros(ComplexF64, sysize))
X = zeros(ComplexF64, 2 * sysize, 2 * sysize)
Y = zeros(ComplexF64, 2 * sysize, 1)
BB = zeros(ComplexF64, sysize, sysize)
AA = zeros(ComplexF64, sysize, sysize)
JpBi = zeros(ComplexF64, sysize, sysize)
JpSi = zeros(ComplexF64, sysize, sysize)
W = zeros(ComplexF64, 2 * sysize, 2 * sysize)
Wapp = zeros(ComplexF64, sysize, sysize)
Wapp_i1 = zeros(ComplexF64, sysize, sysize)
V =  zeros(ComplexF64, 2 * sysize,1)
V_int1 = zeros(ComplexF64, sysize,1)
V_int2 = zeros(ComplexF64, sysize,1)
BimSi = Diagonal(zeros(ComplexF64, sysize, sysize))
Hso = 0.48 * 0.0000046  ## SOC between S1 and T2 = 0.48 cm-1, converting to atomic units
@timeit to "GT calculation" for i in 1:dd
@timeit to "GDSO" gdsopart = GDSO(t[i], b, Ξ, sysize, Ξ©f, Ξ©i, J, D, P, Si, Sf, Bi, Bf,JpBi,JpSi,BB,AA,W,Wapp,Wapp_i1,V,V_int1,V_int2,BimSi)
@timeit to "GSV" gtotal = GSV(t[i], b, sysize, Ξ©f, Ξ©i, J, D, Si, Sf, Bi, Bf,gdsopart, T,Hso,X,Y,W,V)
x[i] = gtotal
open("ISC-SERIAL.TXT", "a") do f
write(f, "ITERATION $i COMPLETED BY PROCESS 1 \n") end end # loop open("ISC-SERIAL.TXT", "a") do f write(f, "LOOP COMPLETE \n====================================================") end yr = real.(x) yi = imag.(x) p = plot(t, yr, linewidth=2, color="blue", title="Ξ =$Ξ", label="Real part", grid=false)
p = plot!(t, yi, linewidth=2, color="red", label="Imaginary part", size=(1000, 800), xlims=(-30e-15, 30e-15))
savefig(p, "gt.svg")
begin # This part does FFTW and finds final rate constant

fs = abs(t[1] - t[2])^-1
signal = yr
p = plan_fft(signal)
F = fftshift(p * signal)
freqs = fftshift(fftfreq(length(t), fs))  # X axis has frequencies in s^-1
#Covert freqs to eV
freqs = freqs * 4.13558e-15
F = F * 6.57e15
time_domain = plot(t, signal, title=L"G(t)", legend=false, color="orange", xlabel=L"\mathrm{Time}\ (s)", ylabel=L"\mathrm{A.u}")
freq_domain = plot(freqs, abs.(F), title=L"\mathrm{Fourier\ transformed\ g(\bar{\nu}))}", legend=false, xlabel=L"\mathrm{Energy\ (eV)}", ylabel=L"\mathrm{k_{IC}\ (s^{-1})}", xlims=(0, 4))
p = plot(time_domain, freq_domain, layout=(2, 1), size=(1000, 800), right_margin=10mm)
savefig(p, "gt-fft.svg")
ee = Ξ# * 8065.73  #Converting eV to cm^-1
index = findmin(abs.(freqs .- ee))[2]  # Find the index of freq closest to ee
freqs[index]
println("The rate constant is ", abs.(F)[index])
rc = abs.(F)[index]
end
#################################################This part writes the output to a file#######################################################
begin
open("ISC-SERIAL.TXT", "a") do f
write(f, "-------------------------\n")
write(f, "NUMBER OF CORES = 1\n")
write(f, "The value of Ξ is $Ξ\n") write(f, "Number of time points =$dd\n")
write(f, "The rate constant calculated from Julia FFTW = $rc\n") write(f, "-------------------------\n") write(f, "The values of t,Re(g(t)),Im(g(t)) are \n\n") for i in 1:dd write(f, "$(t[i])  $(yr[i])$(yi[i])\n")
end
end

open("GT-VALUES-SERIAL.txt", "w") do f
for i in 1:dd
write(f, "$(t[i])$(yr[i])  $(yi[i])\n") end end end end calc(t, b, Delta, sysize, omegaf, omegai, J, D, P) display(to)  And about this On modifying my code as follows, so as to use the view(..)  # V = [J' * (Bi-Si) * D # J' * (Bi-Si) * D] BimSi .= Bi - Si mul!(V_int1,BimSi,D) mul!(view(V, 1:sysize),J',V_int1) V[sysize+1:end] .= V[1:sysize] # mul!(V_int2,J',V_int1) # V[1:sysize,1] .= V_int2 # V[sysize+1:2*sysize,1] .= V_int2  I seem to be getting the following error ERROR: LoadError: MethodError: no method matching mul!(::SubArray{ComplexF64, 1, Vector{ComplexF64}, Tuple{UnitRange{Int64}}, true}, ::Adjoint{Float64, Matrix{Float64}}, ::Matrix{ComplexF64}, ::Bool, ::Bool) Closest candidates are: mul!(::AbstractVecOrMat, ::UniformScaling, ::AbstractVecOrMat, ::Number, ::Number) @ LinearAlgebra ~/.julia/juliaup/julia-1.10.4+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/uniformscaling.jl:284 mul!(::AbstractVector, ::AbstractVecOrMat, ::AbstractVector, ::Number, ::Number) @ LinearAlgebra ~/.julia/juliaup/julia-1.10.4+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/matmul.jl:66 mul!(::AbstractMatrix, ::AbstractVecOrMat, ::AbstractVecOrMat, ::Number, ::Number) @ LinearAlgebra ~/.julia/juliaup/julia-1.10.4+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/matmul.jl:263 ...  It seems that your V_int1 is a matrix and then this error could mean that you tried to multiply to matrices and store the result in a vector. A quick fix would be to use view(V, 1:sysize, 1:1) but better would be to just use vectors for things that are vectors, i.e. make V_int1 a vector instead. If it is nothing like that, then it is possible that there is a missing dispatch in LinearAlgebra.jl which happens from time to time especially with complicated combinations of wrappers, views and the like. Not necessarily. It might that you change up the order of floating point operation by restructering the code. That can very well lead to tiny changes in the output due to rounding. Btw, this exchanges the order of multiplication of the matrices I think. So that could be a source where rounding could go differently and cause your final results to deviate ever so slightly. This causes another problem as BimSi is a Diagonal data-type. In the REPL you can see julia> V_int1 = rand(Float64,10) 10-element Vector{Float64}: 0.41512070416026536 0.4405220835063167 0.25054297797334435 0.7442274181037184 0.5705993211669493 0.29762828607905023 0.9684752525026395 0.08052415976379501 0.49427664484057166 0.20143404991616232 julia> BimSi = Diagonal(ones(Float64,10,10)) ERROR: UndefVarError: Diagonal not defined Stacktrace: [1] top-level scope @ REPL[2]:1 julia> using LinearAlgebra julia> BimSi = Diagonal(ones(Float64,10,10)) 10Γ10 Diagonal{Float64, Vector{Float64}}: 1.0 β β β β β β β β β β 1.0 β β β β β β β β β β 1.0 β β β β β β β β β β 1.0 β β β β β β β β β β 1.0 β β β β β β β β β β 1.0 β β β β β β β β β β 1.0 β β β β β β β β β β 1.0 β β β β β β β β β β 1.0 β β β β β β β β β β 1.0 julia> D = rand(Float64,10,1) 10Γ1 Matrix{Float64}: 0.7193882161972547 0.4714200397295284 0.1365427136917623 0.8935252225472667 0.9779577907398268 0.5907581975550894 0.31000619896170867 0.03765584126275556 0.11379442907643522 0.9612948902973085 julia> mul!(V_int1,BimSi,D) ERROR: MethodError: no method matching mul!(::Vector{Float64}, ::Diagonal{Float64, Vector{Float64}}, ::Matrix{Float64}, ::Bool, ::Bool) Closest candidates are: mul!(::AbstractVecOrMat, ::UniformScaling, ::AbstractVecOrMat, ::Number, ::Number) @ LinearAlgebra ~/.julia/juliaup/julia-1.10.4+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/uniformscaling.jl:284 mul!(::AbstractVector, ::Union{Bidiagonal, Diagonal, SymTridiagonal, Tridiagonal}, ::AbstractVector, ::Number, ::Number) @ LinearAlgebra ~/.julia/juliaup/julia-1.10.4+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/bidiag.jl:425 mul!(::AbstractVector, ::AbstractVecOrMat, ::AbstractVector, ::Number, ::Number) @ LinearAlgebra ~/.julia/juliaup/julia-1.10.4+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/matmul.jl:66 ...  And as for your comment on Although this is true, that was not the point I was making. The output is changing slightly after I make GSV reuse the W and V matrices from GDSO simply making it reuse the matrices should not change anything from the version that does not reuse. Because β if using mul! caused the change, we would have noticed the change the moment we used mul! to define W and V in GDSO right ? But instead we only notice it when GSV is made to reuse those variables instead of redefining them. The answer to that is likely yes. MPI provides looks of capabilities that Distributed.jl currently doesnβt take advantage off. Instead of straight up using Distributed.jl using Dagger.jl might help. Generally speaking the question that I would have for you is: Are you using the same parallelization strategy between MPI and Distributed. MPI often encourages a spatial-decomposition that lends itself to scaling. While that is true for.muli-threading (and less so than it used to be, due to the parallel GC). Allocations have much less of an impact in distributed settings and are likely a red herring here. 1 Like D is a matrix as you initialized it with julia> D = rand(Float64,10,1) 10Γ1 Matrix{Float64}: # it says Matrix because it has 2 dimensions!  Just make it a proper vector: julia> D = rand(Float64,10) 10-element Vector{Float64}:  then the mul! works perfectly fine. I see. Well I donβt see why that should make change. The way you computed W and V in GSV and GDSO seems identical to me. However you did change the computation of W and V to use mul! in GDSO (which you claim did not change results) and then transferred these possibly slightly different W and V to GSV. So I think you just slightly changed the order of floating point operations which had a slight effect on the results (of order machine epsilon). I donβt think you need to worry. 1 Like Yes that worked! The results are a bit different (after 5th decimal place) as before, but you said that is not something that we need to worry about. The allocations also decreased further as expected anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc$ julia 2serial.jl
Number of processes = 1
GKS: Possible loss of precision in routine SET_WINDOW
The rate constant is 42009.188313308856
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Time                    Allocations
βββββββββββββββββββββββ   ββββββββββββββββββββββββ
Tot / % measured:         67.5s /  89.0%           1.26GiB /  28.0%

Section          ncalls     time    %tot     avg     alloc    %tot      avg
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
GT calculation        1    60.1s  100.0%   60.1s    361MiB  100.0%   361MiB
GSV                22    60.0s  100.0%   2.73s    338MiB   93.6%  15.3MiB
GDSO               22   27.4ms    0.0%  1.25ms   22.9MiB    6.4%  1.04MiB
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ


But the speed seems to be getting slower and slower on decreasing allocations. I have no idea why at this point.

I will try to get an implementation that uses Dagger.jl instead, however I am completely unfamiliar with the package and thus I do not know how much code-change would that warrant.

Also the official Dagger.jl page reports that they have some βThread safety issuesβ when using Distributed computing.

This confuses me as well. However, we still are at 90% of the original allocations, so I donβt think we got to the bottom yet. I think we need more data, so Iβd suggest that you profile more closely by putting a lot of @timeit statements inside of GSV so we can see what takes the bulk of the time and then work on optimizing that. It might turnout that the allocations are not causing the slow speed after all (especially in a multiprocess setting as pointed out by @vchuravy)