This question is a continuation / generalization of my previous question. My end goal is to write a parallelized code where the main time taking part is Matrix multiplication and inversion. As per the advice of @ufechner7 @DNF @nilshg @abraemer and many other helpful community members, I have been able to optimize my code to the extent that when it is being run on a single thread, it outperforms the Fortran equivalent of the code by quite a significant margin ! [The Fortran code took about 149
seconds whereas my code took 47
seconds on using dd
= 11, which is essentially the number of points on the toplevel loop]. The latest version of my code is given below :
println("Number of threads = $(Threads.nthreads())")
open("THREADPARALLELOUTPUTRV6.TXT", "a") do f
write(f, "\n\n\n====================================================\n\n\n")
end
begin
using MKL
using DelimitedFiles
using LinearAlgebra
using LaTeXStrings
using TimerOutputs
BLAS.set_num_threads(1)
using Plots
using Plots.PlotMeasures
using FFTW
using Base.Threads
using SharedArrays
using ChunkSplitters
end
function GFC(t::Float64,b::Float64,Δ::Float64,sysize::Int64,Ωf::Diagonal{Float64},Ωi::Diagonal{Float64},J::Matrix{Float64},D::Matrix{Float64},ρ::Diagonal{Float64},Si::Diagonal{ComplexF64},Sf::Diagonal{ComplexF64},Bi::Diagonal{ComplexF64},Bf::Diagonal{ComplexF64})
t = t* 4.1341373335*10^16 # converting time to atomic units
tp =  im*b  t
for i = 1:sysize
Sf[i,i]=Ωf[i,i]/sin(Ωf[i,i]*t)
Si[i,i]=Ωi[i,i]/sin(Ωi[i,i]*tp)
Bf[i,i]=Ωf[i,i]/tan(Ωf[i,i]*t)
Bi[i,i]=Ωi[i,i]/tan(Ωi[i,i]*tp)
end
BB = (Bf+J'*Bi*J)
AA = (Sf+J'*Si*J)
W = [BB AA
AA BB] #We wont calculate the 2Nx2N determinant directly.
# Wapp = BB*(BB AA*(BB^1)*AA )
Wapp = BB*(BB AA*(BB\AA) )
U = BiSi
V = [J'*U*D
J'*U*D]
# F1 = sqrt(det(Si*Sf*ρ^2*Wapp^1))
# F1 = sqrt(det(Si*Sf*ρ^2)/det(Wapp))
F1 = sqrt(det(Si*Sf* (Wapp\ρ^2)))
F2 = ((im/2)*(transpose(V)* (W\V))) + ((im)*(transpose(D)*U*D))
g = F1*exp(F2)
Δ = Δ * 0.0367493 # in atomic units
g = g * exp(im * Δ * t)
η = 2 * 4.55633 * 10^6 # 10 cm^1 to hatree energy
#g = g * exp(η * abs(t)) # Adding the damping factor Lorrentzian
g = g * exp(η * t^2) # Adding the damping factor Gaussian
# println("t = $t was evaluated by process $(myid())")
# println("GFC at t = $t is $g")
# println("F1 at t = $t = $F1")
return g[1]
end
function GHT(t::Float64,b::Float64,sysize::Int64,Ωf::Diagonal{Float64},Ωi::Diagonal{Float64},J::Matrix{Float64},D::Matrix{Float64},Si::Diagonal{ComplexF64},Sf::Diagonal{ComplexF64},Bi::Diagonal{ComplexF64},Bf::Diagonal{ComplexF64},X11::Matrix{ComplexF64},X12::Matrix{ComplexF64},X21::Matrix{ComplexF64},X22::Matrix{ComplexF64},Y1::Matrix{ComplexF64},Y2::Matrix{ComplexF64},gfcpart::ComplexF64,T::Matrix{Float64})
################################## Preliminary Part , Common for both Frank Condon and Herzberg Teller ##############################
t = t* 4.1341373335*10^16 # converting time to atomic units
tp =  im*b  t
for i = 1:sysize
Sf[i,i]=Ωf[i,i]/sin(Ωf[i,i]*t)
Si[i,i]=Ωi[i,i]/sin(Ωi[i,i]*tp)
Bf[i,i]=Ωf[i,i]/tan(Ωf[i,i]*t)
Bi[i,i]=Ωi[i,i]/tan(Ωi[i,i]*tp)
end
W = [(Bf+J'*Bi*J) (Sf+J'*Si*J)
(Sf+J'*Si*J) (Bf+J'*Bi*J)]
U = BiSi
V = [J'*U*D
J'*U*D]
########## Defining the X and Y Matrices ##################################
################# Final Calculation and returning value ##################
s=0
MJSJ = J'*Si*J
MJBJ = J'*Bi*J
MJUD = J'*U*D
Win = inv(W)
WinV = Win*V
for m in range(1,sysize)
for mp in range(1,sysize)
X11[m,:] .= (MJSJ)[mp,:] .* (Bf[m,m])
X12[m,:] .= (MJBJ)[mp,:] .* (Bf[m,m])
X21[m,:] .= (MJSJ)[mp,:] .* (Sf[m,m])
X22[m,:] .= (MJBJ)[mp,:] .* (Sf[m,m])
X = [X11 X12
X21 X22]
Y1[m,:] = Bf[m,m]*(MJUD)[mp,:]
Y2[m,:] = Sf[m,m]*(MJUD)[mp,:]
Y = [Y1
Y2]
# g = im*tr(W\X)+ (transpose(W\V)*X*(W\V))[1]  (transpose(Y)*(W\V))[1]
# g = im*tr(Win*X)+ (transpose(Win*V)*X*(Win*V))[1]  (transpose(Y)*(Win*V))[1]
g = im*tr(Win*X)+ (transpose(WinV)*X*(WinV))[1]  (transpose(Y)*(WinV))[1]
s = s+g[1]*gfcpart*T[m,mp]
end
for i in 1:sysize
X11[m,i]=0
X12[m,i]=0
X21[m,i]=0
X22[m,i]=0
Y1[m,1]=0
Y2[m,1]=0
end
end
return s[1]
end
begin ## Define all constant variables
const Delta::Float64 = 2.02 # in eV
const dd ::Int64 = 11 # Change this to modify number of points
const Temp::Int64=300
kT = Temp * 1.380649 * 10^23 # Boltzmann constant times temperature in Joules
kT= kT * 2.29371227840*10^17 # in atomic units
const b ::Float64= 1/kT
const omegaf ::Diagonal{Float64}= Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n')))*4.55633*10^6 # Final state frequencies in atomic units
const omegai ::Diagonal{Float64}= Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n')))*4.55633*10^6 # Initial state frequencies in atomic units
const sysize ::Int64 = size(omegai)[1]
const P = @. 2*sinh(b*omegai/2)
T ::Matrix{Float64}= readdlm("nac.txt", '\t', Float64, '\n'); # NAC matrix
T = T*T';
const D ::Matrix{Float64}= readdlm("d.txt", Float64); # Displacement Matrix
const J ::Matrix{Float64}= readdlm("j.txt", Float64); # Duchinsky Matrix
t ::Vector{Float64}= collect(range(5*10^12,stop=5*10^12,length=dd))
t[div(dd+1,2)] = 10e25
end
function calc(t,b,Δ,sysize,Ωf,Ωi,J,D,P)
nchunks = Threads.nthreads() # define number of chunks e.g. equal to number of threads
# @sync tells Julia to wait for any tasks started inside it
open("THREADPARALLELOUTPUTRV6.TXT", "a") do f
write(f, "STARTING LOOP WITH NUMBER OF THREADS = $(Threads.nthreads()) \n====================================================\n")
end
x = zeros(ComplexF64,dd)
@time @sync for (xrange, ichunk) in chunks(range(1,dd), nchunks)
# xrange are the indices this chunk contains, ichunk is just the index of the chunk, which we don't need in this scenario
Threads.@spawn begin # this line tells Julia to start a new thread with the contents of the begin block
Sf ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
Si ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
Bf ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
Bi ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
X11 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,sysize)
X12 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,sysize)
X21 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,sysize)
X22 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,sysize)
Y1 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,1)
Y2 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,1)
for i in xrange
gfcpart = GFC(t[i],b,Δ,sysize,Ωf,Ωi,J,D,P,Si,Sf,Bi,Bf)
gtotal = GHT(t[i],b,sysize,Ωf,Ωi,J,D,Si,Sf,Bi,Bf,X11,X12,X21,X22,Y1,Y2,gfcpart,T)
x[i] = gtotal
open("THREADPARALLELOUTPUTRV6.TXT", "a") do f
write(f, "ITERATION $i COMPLETED BY THREAD $(Threads.threadid()) \n")
end
end # loop
end # thread work load
end # loop over chunks, Julia will wait until all task are completed
open("THREADPARALLELOUTPUTRV6.TXT", "a") do f
write(f, "LOOP COMPLETE \n====================================================")
end
yr = real.(x)
yi = imag.(x)
p = plot(t,yr,linewidth = 2,color = "blue",title = "Δ = $Δ",label = "Real part",grid=false)
p =plot!(t,yi,linewidth = 2,color = "red",label = "Imaginary part",size=(1000,800),xlims=(30*10^15,30*10^15))
savefig(p,"gt.svg")
begin # This part does FFTW and finds final rate constant
fs = abs(t[1]t[2])^1
signal = yr
p = plan_fft(signal)
F = fftshift(p*signal)
freqs = fftshift(fftfreq(length(t), fs)) # X axis has frequencies in s^1
#Covert freqs to eV
freqs = freqs * 4.13558 * 10^15
F = F * 6.57 * 10^15
time_domain = plot(t, signal, title = L"G(t)",legend=false,color="orange",xlabel=L"\mathrm{Time}\ (s)",ylabel=L"\mathrm{A.u}")
freq_domain = plot(freqs, abs.(F), title = L"\mathrm{Fourier\ transformed\ g(\bar{\nu}))}",legend=false,xlabel=L"\mathrm{Energy\ (eV)}",ylabel=L"\mathrm{k_{IC}\ (s^{1})}",xlims=(0,4))
p = plot(time_domain, freq_domain, layout = (2,1), size = (1000, 800),right_margin = 10mm)
savefig(p,"gtfft.svg")
ee = Δ# * 8065.73 #Converting eV to cm^1
index = findmin(abs.(freqs . ee))[2] # Find the index of freq closest to ee
freqs[index]
println("The rate constant is ",abs.(F)[index])
rc = abs.(F)[index]
end
begin # This part writes the output to a file
open("THREADPARALLELOUTPUTRV6.TXT", "a") do f
write(f, "\n")
write(f,"NUMBER OF CORES = $(Threads.nthreads())\n")
write(f,"The value of Δ is $Δ\n")
write(f,"Number of time points = $dd\n")
write(f, "The rate constant is $rc\n")
write(f,"\n")
write(f,"The values of t,Re(g(t)),Im(g(t)) are \n\n")
for i in range(1,dd)
write(f,"$(t[i]) $(yr[i]) $(yi[i])\n")
end
end
end
end
timeneed = @elapsed calc(t,b,Delta,sysize,omegaf,omegai,J,D,P)
open("THREADPARALLELOUTPUTRV6.TXT", "a") do f
write(f, "\n\n\n===============================================================\nThe time elapsed for main loop [INCLUDING GC + C + RC]= $timeneed seconds \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
end
In case anybody wants to run the code on their own computers, the data files are given here.
Current Problems

As stated above, my code runs faster than the fortran code on my computer ( Intel i5 , 8th Generation Processor). But when I submit the same code for running on our supercomputer (Intel Xeon) serially, the Fortran code becomes faster, with it’s time taken becoming
49
seconds from149
seconds. Whereas my code becomes slower ! taking52
seconds instead of47
seconds. As my end goal is to run the code on such supercomputers in a parallelized fashion, I think it is of paramount importance that we sort out this discrepency in the serial code performance. 
With increasing the number of processors, my code does not speed up as much as the Fortran code. The Fortran code uses MPI parallelization, whereas my Julia code relies on the features of
Base.Threads
. When I say serial performance, I mean just running this code withjulia t 1 xxx.jl
command such that it uses only one thread. To give an example of the current scaling scenario , when on my personal PC I use10
threads and~1000
points I end up with a time of690
seconds instead of an ideally projected381
seconds. I know ideal scaling is never possible but is the scaling I am getting expected ? I also realize that the comparison I am making is not completely fair since I am not using MPI.jl that I should , as the Fortran code uses it.
Any help / suggestions / pointers that could help me solve these problems, i.e prevent my code from getting slowed down on the supercomputer and making it scale better with increasing the number of threads would be greatly appreciated.