I corrected the mistake and rewrote the code accordingly:
println("Number of threads = $(Threads.nthreads())")
open("THREAD-PARALLEL-OUTPUT-RV6.TXT", "a") do f
write(f, "\n\n\n====================================================\n\n\n")
end
begin
using LoopVectorization
using MKL
using DelimitedFiles
using LinearAlgebra
using LaTeXStrings
using TimerOutputs
BLAS.set_num_threads(1) # single-threaded BLAS, so MKL's own threading does not oversubscribe the Julia threads
using Plots
using Plots.PlotMeasures
using FFTW
using Base.Threads
using SharedArrays
using ChunkSplitters
end
function GFC(t::Float64,b::Float64,Δ::Float64,sysize::Int64,Ωf::Diagonal{Float64},Ωi::Diagonal{Float64},J::Matrix{Float64},D::Matrix{Float64},ρ::Diagonal{Float64},Si::Diagonal{ComplexF64},Sf::Diagonal{ComplexF64},Bi::Diagonal{ComplexF64},Bf::Diagonal{ComplexF64})
t = t * 4.1341373335e16 # converting time from seconds to atomic units
tp = - im*b - t
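# Fill the diagonal Ω/sin(Ωt) and Ω/tan(Ωt) factors, evaluated at t for the final state and at tp for the initial state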
for i = 1:sysize
Sf[i,i]=Ωf[i,i]/sin(Ωf[i,i]*t)
Si[i,i]=Ωi[i,i]/sin(Ωi[i,i]*tp)
Bf[i,i]=Ωf[i,i]/tan(Ωf[i,i]*t)
Bi[i,i]=Ωi[i,i]/tan(Ωi[i,i]*tp)
end
BB = (Bf+J'*Bi*J)
AA = (Sf+J'*Si*J)
W = [BB -AA
-AA BB] # We won't calculate the 2Nx2N determinant directly.
# Wapp = BB*(BB- AA*(BB^-1)*AA )
Wapp = BB*(BB- AA*(BB\AA) )
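# A note on the identity used above (assuming BB is invertible): for the
# symmetric 2x2 block matrix W = [BB -AA; -AA BB], the Schur-complement
# factorization gives det(W) = det(BB)*det(BB - AA*(BB\AA)) = det(Wapp),
# so the NxN matrix Wapp carries the same determinant as the 2Nx2N matrix W.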
U = Bi-Si
V = [J'*U*D
J'*U*D]
# F1 = sqrt(det(Si*Sf*ρ^2*Wapp^-1))
# F1 = sqrt(det(Si*Sf*ρ^2)/det(Wapp))
F1 = sqrt(det(Si*Sf* (Wapp\ρ^2)))
F2 = (-(im/2)*(transpose(V)* (W\V))) + ((im)*(transpose(D)*U*D))
g = F1*exp(F2)
Δ = Δ * 0.0367493 # converting Δ from eV to atomic units
g = g * exp(im * Δ * t)
η = 2 * 4.55633e-6 # damping parameter, cm^-1 converted to hartree
#g = g * exp(-η * abs(t)) # Lorentzian damping factor
g = g * exp(-η * t^2) # Gaussian damping factor
# println("t = $t was evaluated by process $(myid())")
# println("GFC at t = $t is $g")
# println("F1 at t = $t = $F1")
return g[1]
end
function GHT(t::Float64,b::Float64,sysize::Int64,Ωf::Diagonal{Float64},Ωi::Diagonal{Float64},J::Matrix{Float64},D::Matrix{Float64},Si::Diagonal{ComplexF64},Sf::Diagonal{ComplexF64},Bi::Diagonal{ComplexF64},Bf::Diagonal{ComplexF64},X11::Matrix{ComplexF64},X12::Matrix{ComplexF64},X21::Matrix{ComplexF64},X22::Matrix{ComplexF64},Y1::Matrix{ComplexF64},Y2::Matrix{ComplexF64},gfcpart::ComplexF64,T::Matrix{Float64})
################################## Preliminary part, common to both Franck-Condon and Herzberg-Teller ##############################
t = t * 4.1341373335e16 # converting time from seconds to atomic units
tp = - im*b - t
for i = 1:sysize
Sf[i,i]=Ωf[i,i]/sin(Ωf[i,i]*t)
Si[i,i]=Ωi[i,i]/sin(Ωi[i,i]*tp)
Bf[i,i]=Ωf[i,i]/tan(Ωf[i,i]*t)
Bi[i,i]=Ωi[i,i]/tan(Ωi[i,i]*tp)
end
W = [(Bf+J'*Bi*J) -(Sf+J'*Si*J)
-(Sf+J'*Si*J) (Bf+J'*Bi*J)]
U = Bi-Si
V = [J'*U*D
J'*U*D]
########## Defining the X and Y Matrices ##################################
################# Final Calculation and returning value ##################
s = zero(ComplexF64) # type-stable accumulator for the HT sum
MJSJ = J'*Si*J
MJBJ = J'*Bi*J
MJUD = J'*U*D
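# These three products do not depend on (m, mp), so they are computed once before the double loop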
for m in 1:sysize
for mp in 1:sysize
# @. X11 = 0.0
# @. X12 = 0.0
# @. X21 = 0.0
# @. X22 = 0.0
X11[m,:]=(MJSJ)[mp,:]*(-Bf[m,m])
X12[m,:]=(MJBJ)[mp,:]*(Bf[m,m])
X21[m,:]=(MJSJ)[mp,:]*(Sf[m,m])
X22[m,:]=(MJBJ)[mp,:]*(-Sf[m,m])
X = [X11 X12
X21 X22]
# @. Y1 = 0.0
# @. Y2 = 0.0
Y1[m,:] = Bf[m,m]*(MJUD)[mp,:]
Y2[m,:] = -Sf[m,m]*(MJUD)[mp,:]
Y = [Y1
Y2]
g = im*tr(W\X)+ (transpose(W\V)*X*(W\V))[1] - (transpose(Y)*(W\V))[1]
s = s+g[1]*gfcpart*T[m,mp]
end
for i in 1:sysize # zero out row m of the scratch matrices before the next m iteration
X11[m,i]=0
X12[m,i]=0
X21[m,i]=0
X22[m,i]=0
end
Y1[m,1]=0 # Y1 and Y2 only have row m populated, so one reset per m suffices
Y2[m,1]=0
end
return s
end
begin ## Define all constant variables
const Delta::Float64 = 2.02 # in eV
const dd ::Int64 = 16 # Change this to modify number of points
const Temp::Int64=300
kT = Temp * 1.380649e-23 # Boltzmann constant times temperature, in joules
kT = kT * 2.29371227840e17 # converted to atomic units
const b ::Float64= 1/kT
const omegaf ::Diagonal{Float64}= Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n')))*4.55633e-6 # Final-state frequencies in atomic units
const omegai ::Diagonal{Float64}= Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n')))*4.55633e-6 # Initial-state frequencies in atomic units
const sysize ::Int64 = size(omegai)[1]
const P = @. 2*sinh(b*omegai/2)
T ::Matrix{Float64}= readdlm("nac.txt", '\t', Float64, '\n'); # NAC matrix
T = T*T'; # symmetric product whose entries T[m,mp] weight the HT sum
const D ::Matrix{Float64}= readdlm("d.txt", Float64); # Displacement Matrix
const J ::Matrix{Float64}= readdlm("j.txt", Float64); # Duchinsky Matrix
t ::Vector{Float64}= collect(range(-5e-12, stop=5e-12, length=dd))
t[div(dd+1,2)] = 10e-25 # nudge the midpoint away from t = 0, where Ω/sin(Ωt) is singular
end
function calc(t,b,Δ,sysize,Ωf,Ωi,J,D,P)
nchunks = Threads.nthreads() # define number of chunks e.g. equal to number of threads
# @sync tells Julia to wait for any tasks started inside it
open("THREAD-PARALLEL-OUTPUT-RV6.TXT", "a") do f
write(f, "STARTING LOOP WITH NUMBER OF THREADS = $(Threads.nthreads()) \n====================================================\n")
end
x = zeros(ComplexF64,dd)
@time @sync for (xrange, ichunk) in chunks(1:dd, nchunks)
# xrange are the indices this chunk contains, ichunk is just the index of the chunk, which we don't need in this scenario
Threads.@spawn begin # this line tells Julia to start a new thread with the contents of the begin block
Sf ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
Si ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
Bf ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
Bi ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
X11 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,sysize)
X12 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,sysize)
X21 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,sysize)
X22 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,sysize)
Y1 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,1)
Y2 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,1)
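# The scratch arrays above are allocated per task, so no two threads share temporaries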
for i in xrange
gfcpart = GFC(t[i],b,Δ,sysize,Ωf,Ωi,J,D,P,Si,Sf,Bi,Bf)
gtotal = GHT(t[i],b,sysize,Ωf,Ωi,J,D,Si,Sf,Bi,Bf,X11,X12,X21,X22,Y1,Y2,gfcpart,T)
x[i] = gtotal
open("THREAD-PARALLEL-OUTPUT-RV6.TXT", "a") do f
write(f, "ITERATION $i COMPLETED BY THREAD $(Threads.threadid()) \n")
end
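# Multiple tasks append to this file concurrently, so log lines from different threads may interleave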
end # loop
end # thread work load
end # loop over chunks, Julia will wait until all task are completed
open("THREAD-PARALLEL-OUTPUT-RV6.TXT", "a") do f
write(f, "LOOP COMPLETE \n====================================================")
end
yr = real.(x)
yi = imag.(x)
p = plot(t,yr,linewidth = 2,color = "blue",title = "Δ = $Δ",label = "Real part",grid=false)
p = plot!(t,yi,linewidth = 2,color = "red",label = "Imaginary part",size=(1000,800),xlims=(-30e-15,30e-15))
savefig(p,"gt.svg")
begin # This part does FFTW and finds final rate constant
fs = abs(t[1]-t[2])^-1
signal = yr
p = plan_fft(signal)
F = fftshift(p*signal)
freqs = fftshift(fftfreq(length(t), fs)) # X axis has frequencies in s^-1
# Convert freqs from s^-1 to eV
freqs = freqs * 4.13558e-15
F = F * 6.57e15
time_domain = plot(t, signal, title = L"G(t)",legend=false,color="orange",xlabel=L"\mathrm{Time}\ (s)",ylabel=L"\mathrm{A.u}")
freq_domain = plot(freqs, abs.(F), title = L"\mathrm{Fourier\ transformed\ } g(\bar{\nu})",legend=false,xlabel=L"\mathrm{Energy\ (eV)}",ylabel=L"\mathrm{k_{IC}\ (s^{-1})}",xlims=(0,4))
p = plot(time_domain, freq_domain, layout = (2,1), size = (1000, 800),right_margin = 10mm)
savefig(p,"gt-fft.svg")
ee = Δ # * 8065.73 would convert eV to cm^-1
index = findmin(abs.(freqs .- ee))[2] # index of the frequency closest to ee
freqs[index]
println("The rate constant is ",abs.(F)[index])
rc = abs.(F)[index]
end
begin # This part writes the output to a file
open("THREAD-PARALLEL-OUTPUT-RV6.TXT", "a") do f
write(f, "-------------------------\n")
write(f,"NUMBER OF CORES = $(Threads.nthreads())\n")
write(f,"The value of Δ is $Δ\n")
write(f,"Number of time points = $dd\n")
write(f, "The rate constant is $rc\n")
write(f,"-------------------------\n")
write(f,"The values of t,Re(g(t)),Im(g(t)) are \n\n")
for i in range(1,dd)
write(f,"$(t[i]) $(yr[i]) $(yi[i])\n")
end
end
end
end
timeneed = @elapsed calc(t,b,Delta,sysize,omegaf,omegai,J,D,P)
open("THREAD-PARALLEL-OUTPUT-RV6.TXT", "a") do f
write(f, "\n\n\n===============================================================\nThe time elapsed for main loop [INCLUDING GC + C + RC] = $timeneed seconds\n\n\n")
end
While running with 1 thread, it took 174 s with dd = 16. When I bumped the number of threads up to 16, the total time for the expensive loop dropped to 42 seconds!
I know that we can never achieve perfect scaling, i.e. an n-fold speed-up from n threads. Although this scaling is twice as good as what I got with Distributed.jl, it is still far from ideal: I am getting a ~4.17× speed-up using 16× the threads. Is this kind of scaling expected from Base.Threads? Or is there something still to be done to improve it?
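For concreteness, here is the same observation expressed as parallel efficiency, a minimal sketch using only the timings quoted above:

t1, tn, n = 174.0, 42.0, 16 # serial time, 16-thread time, thread count (from the runs above)
speedup = t1 / tn # ≈ 4.1
efficiency = speedup / n # ≈ 0.26, i.e. roughly a quarter of ideal scaling
println("speedup ≈ $(round(speedup, digits=2)), efficiency ≈ $(round(efficiency, digits=2))")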
I would also like to mention another peculiar observation: when I ran the serial code on my laptop (before the optimizations I posted about today), it took about 170 seconds. The same code on the supercomputer, which uses Intel Xeon processors, took 303 seconds. I know that single-core performance is not always a supercomputer's selling point, but when I ran our Fortran code on the same supercomputer with the same number of points, it took 59 seconds, whereas on my PC the Fortran code took 148 seconds. In conclusion, running on the supercomputer sped up the Fortran code and slowed down my Julia code on the same machine. This seems very counterintuitive.