This post can be thought of as a successor to an old series of posts of mine, where the goal was to improve the performance of, and eventually parallelize, a code consisting primarily of matrix operations, with the end goal of running faster than the equivalent Fortran code. We were more or less successful in that endeavor. Following the advice of the community members, I used Base.Threads
to achieve thread-level parallelism. Below is the code I am currently using:
using MKL
using DelimitedFiles
using LinearAlgebra
using LaTeXStrings
using TimerOutputs
using Plots
using Plots.PlotMeasures
using FFTW
using Base.Threads
using SharedArrays
using ChunkSplitters
BLAS.set_num_threads(1)
println("Number of threads = $(Threads.nthreads())")
open("ISC-CODE-VERSION-A1-THREADED.TXT", "a") do f
write(f, "\n\n\n====================================================\n\n\n")
end
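# GDSO evaluates the gdsopart factor of g(t) at a single time point: it fills the
# diagonal matrices Si, Sf, Bi, Bf, builds W from the Duschinsky matrix J and the
# displacement D, and returns the determinant prefactor times the exponential, with
# the energy-gap phase exp(im*Δ*t) and a Gaussian damping factor applied.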
function GDSO(
t::Float64,
b::Float64,
Δ::Float64,
sysize::Int64,
Ωf::Diagonal{Float64},
Ωi::Diagonal{Float64},
J::Matrix{Float64},
D::Matrix{Float64},
ρ::Diagonal{Float64},
Si::Diagonal{ComplexF64},
Sf::Diagonal{ComplexF64},
Bi::Diagonal{ComplexF64},
Bf::Diagonal{ComplexF64}
)
t = t * 4.1341373335e16 # converting time to atomic units
tp = -im * b - t
for i = 1:sysize
Sf[i, i] = Ωf[i, i] / sin(Ωf[i, i] * t)
Si[i, i] = Ωi[i, i] / sin(Ωi[i, i] * tp)
Bf[i, i] = Ωf[i, i] / tan(Ωf[i, i] * t)
Bi[i, i] = Ωi[i, i] / tan(Ωi[i, i] * tp)
end
BB = (Bf + J' * Bi * J)
AA = (Sf + J' * Si * J)
W = [BB -AA
-AA BB] # We won't calculate the 2N x 2N determinant directly.
# Wapp = BB*(BB- AA*(BB^-1)*AA )
Wapp = BB * (BB - AA * (BB \ AA))
U = Bi - Si
V = [J' * U * D
J' * U * D]
# F1 = sqrt(det(Si*Sf*ρ^2*Wapp^-1))
# F1 = sqrt(det(Si*Sf*ρ^2)/det(Wapp))
F1 = sqrt(det(Si * Sf * (Wapp \ ρ^2)))
F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * U * D))
g = F1 * exp(F2)
Δ = Δ * 0.0367493 # in atomic units
g = g * exp(im * Δ * t)
η = 2 * 4.55633e-6 # 10 cm^-1 to hartree energy
#g = g * exp(-η * abs(t)) # Lorentzian damping factor
g = g * exp(-η * t^2) # Gaussian damping factor
# println("t = $t was evaluated by process $(myid())")
return g[1]
end
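# GSV repeats the preliminary setup of GDSO and then accumulates, in the nested
# (m, mp) loop, the Herzberg-Teller contribution weighted by T[m, mp]; the
# Franck-Condon term abs(Hso)^2 * gdsopart is added at the end. For large sysize
# this nested loop dominates the runtime.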
@views function GSV(
t::Float64,
b::Float64,
sysize::Int64,
Ωf::Diagonal{Float64},
Ωi::Diagonal{Float64},
J::Matrix{Float64},
D::Matrix{Float64},
Si::Diagonal{ComplexF64},
Sf::Diagonal{ComplexF64},
Bi::Diagonal{ComplexF64},
Bf::Diagonal{ComplexF64},
X11::Matrix{ComplexF64},
X12::Matrix{ComplexF64},
X21::Matrix{ComplexF64},
X22::Matrix{ComplexF64},
Y1::Matrix{ComplexF64},
Y2::Matrix{ComplexF64},
gdsopart::ComplexF64,
T::Matrix{Float64}, ## Product of NAC and SO
Hso::Float64
)
################################## Preliminary Part, Common for both Franck-Condon and Herzberg-Teller ##############################
t = t * 4.1341373335e16 # converting time to atomic units
tp = -im * b - t
for i = 1:sysize
Sf[i, i] = Ωf[i, i] / sin(Ωf[i, i] * t)
Si[i, i] = Ωi[i, i] / sin(Ωi[i, i] * tp)
Bf[i, i] = Ωf[i, i] / tan(Ωf[i, i] * t)
Bi[i, i] = Ωi[i, i] / tan(Ωi[i, i] * tp)
end
W = [(Bf+J'*Bi*J) -(Sf + J' * Si * J)
-(Sf + J' * Si * J) (Bf+J'*Bi*J)]
U = Bi - Si
V = [J' * U * D
J' * U * D]
########## Defining the X and Y Matrices ##################################
################# Final Calculation and returning value ##################
s = zero(ComplexF64)
MJSJ = J' * Si * J
MJBJ = J' * Bi * J
MJUD = J' * U * D
Win = inv(W)
WinV = Win * V
X = zeros(ComplexF64, 2 * sysize, 2 * sysize)
Y = zeros(ComplexF64, 2 * sysize, 1)
XWin = similar(Win)
for m in 1:sysize
for mp in 1:sysize
X11[m, :] .= (MJSJ)[mp, :] .* (-Bf[m, m])
X12[m, :] .= (MJBJ)[mp, :] .* (Bf[m, m])
X21[m, :] .= (MJSJ)[mp, :] .* (Sf[m, m])
X22[m, :] .= (MJBJ)[mp, :] .* (-Sf[m, m])
# X = [X11 X12
# X21 X22]
X[1:sysize, 1:sysize] .= X11
X[1:sysize, sysize+1:2*sysize] .= X12
X[sysize+1:2*sysize, 1:sysize] .= X21
X[sysize+1:2*sysize, sysize+1:2*sysize] .= X22
Y1[m, :] .= Bf[m, m] * (MJUD)[mp, :]
Y2[m, :] .= -Sf[m, m] * (MJUD)[mp, :]
Y[1:sysize, 1] .= Y1
Y[sysize+1:2*sysize, 1] .= Y2
g = im * (tr(mul!(XWin, X, Win)) + (transpose(WinV)*X*WinV)[1] - (transpose(Y)*WinV)[1])
s = s + g * gdsopart * T[m, mp]
end
X11[m, 1:sysize] .= 0
X12[m, 1:sysize] .= 0
X21[m, 1:sysize] .= 0
X22[m, 1:sysize] .= 0
Y1[m, 1] = 0
Y2[m, 1] = 0
end
s = s + abs(Hso)^2 * gdsopart #### LATEST MODIFICATION
return s
end
const hso_s1t3 = 1.49 * 0.0000046 ## SOC between S1 and T3 = 1.49 cm-1, converted to atomic units
const Delta = 0.11 # in eV
const Delta_s1t3 = 0.091 * 0.037 # Difference between S1 and T3 = 0.091 eV in atomic units
const dd = 11 # Change this to modify number of points
const Temp = 300
kT_temp = Temp * 1.380649e-23 # Boltzmann constant times temperature in Joules
const kT = kT_temp * 2.29371227840e17 # in atomic units
const b = 1 / kT
const omegaf = Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n'))) * 4.55633e-6 # Final state frequencies in atomic units
const omegai = Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n'))) * 4.55633e-6 # Initial state frequencies in atomic units
const sysize = size(omegai)[1]
const P = @. 2 * sinh(b * omegai / 2)
T_temp = readdlm("nac.txt", '\t', Float64, '\n') # NAC matrix
const T = (T_temp * T_temp')* (hso_s1t3^2/Delta_s1t3^2)
const D = readdlm("d.txt", Float64) # Displacement Matrix
const J = readdlm("j.txt", Float64) # Duchinsky Matrix
const t = collect(range(-5e-12, stop=5e-12, length=dd))
t[div(dd + 1, 2)] = 10e-25 # shift the midpoint away from exactly t = 0 to avoid the 1/sin(Ω*t) singularity in GDSO and GSV
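# calc evaluates g(t) on the full time grid: the grid is split into nchunks chunks
# with ChunkSplitters, one task per chunk is spawned with its own set of workspace
# matrices, the real part of the result is Fourier transformed with FFTW to obtain
# the rate constant, and everything is plotted and written to the output files.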
function calc(t, b, Δ, sysize, Ωf, Ωi, J, D, P; nchunks=Threads.nthreads())
# @sync tells Julia to wait for any tasks started inside it
open("ISC-CODE-VERSION-A1-THREADED.TXT", "a") do f
write(f, "STARTING LOOP WITH NUMBER OF THREADS = $(Threads.nthreads()) \n====================================================\n")
end
x = zeros(ComplexF64, dd)
@time @sync for (xrange, ichunk) in chunks(range(1, dd), nchunks)
# xrange holds the indices this chunk contains; ichunk is just the index of the chunk, which we don't need here
Threads.@spawn begin # this line tells Julia to start a new task that runs the contents of the begin block on one of the threads
Sf = Diagonal(zeros(ComplexF64, sysize))
Si = Diagonal(zeros(ComplexF64, sysize))
Bf = Diagonal(zeros(ComplexF64, sysize))
Bi = Diagonal(zeros(ComplexF64, sysize))
X11 = zeros(ComplexF64, sysize, sysize)
X12 = zeros(ComplexF64, sysize, sysize)
X21 = zeros(ComplexF64, sysize, sysize)
X22 = zeros(ComplexF64, sysize, sysize)
Y1 = zeros(ComplexF64, sysize, 1)
Y2 = zeros(ComplexF64, sysize, 1)
Hso = 0.48 * 0.0000046 ## SOC between S1 and T2 = 0.48 cm-1, converting to atomic units
for i in xrange
gdsopart = GDSO(t[i], b, Δ, sysize, Ωf, Ωi, J, D, P, Si, Sf, Bi, Bf)
gtotal = GSV(t[i], b, sysize, Ωf, Ωi, J, D, Si, Sf, Bi, Bf, X11, X12, X21, X22, Y1, Y2, gdsopart, T,Hso)
x[i] = gtotal
open("ISC-CODE-VERSION-A1-THREADED.TXT", "a") do f
write(f, "ITERATION $i COMPLETED BY THREAD $(Threads.threadid()) \n")
end
end # loop
end # thread work load
end # loop over chunks, Julia will wait until all tasks are completed
open("ISC-CODE-VERSION-A1-THREADED.TXT", "a") do f
write(f, "LOOP COMPLETE \n====================================================")
end
yr = real.(x)
yi = imag.(x)
p = plot(t, yr, linewidth=2, color="blue", title="Δ = $Δ", label="Real part", grid=false)
p = plot!(t, yi, linewidth=2, color="red", label="Imaginary part", size=(1000, 800), xlims=(-30e-15, 30e-15))
savefig(p, "gt.svg")
begin # This part does FFTW and finds final rate constant
fs = abs(t[1] - t[2])^-1
signal = yr
p = plan_fft(signal)
F = fftshift(p * signal)
freqs = fftshift(fftfreq(length(t), fs)) # X axis has frequencies in s^-1
# Convert freqs to eV
freqs = freqs * 4.13558e-15
F = F * 6.57e15
time_domain = plot(t, signal, title=L"G(t)", legend=false, color="orange", xlabel=L"\mathrm{Time}\ (s)", ylabel=L"\mathrm{A.u}")
freq_domain = plot(freqs, abs.(F), title=L"\mathrm{Fourier\ transformed\ g(\bar{\nu})}", legend=false, xlabel=L"\mathrm{Energy\ (eV)}", ylabel=L"\mathrm{k_{IC}\ (s^{-1})}", xlims=(0, 4))
p = plot(time_domain, freq_domain, layout=(2, 1), size=(1000, 800), right_margin=10mm)
savefig(p, "gt-fft.svg")
ee = Δ # * 8065.73 # Converting eV to cm^-1
index = findmin(abs.(freqs .- ee))[2] # Find the index of freq closest to ee
freqs[index]
println("The rate constant is ", abs.(F)[index])
rc = abs.(F)[index]
end
#################################################This part writes the output to a file#######################################################
begin
open("ISC-CODE-VERSION-A1-THREADED.TXT", "a") do f
write(f, "-------------------------\n")
write(f, "NUMBER OF CORES = $(Threads.nthreads())\n")
write(f, "The value of Δ is $Δ\n")
write(f, "Number of time points = $dd\n")
write(f, "The rate constant calculated from Julia FFTW = $rc\n")
write(f, "-------------------------\n")
write(f, "The values of t,Re(g(t)),Im(g(t)) are \n\n")
for i in 1:dd
write(f, "$(t[i]) $(yr[i]) $(yi[i])\n")
end
end
open("GT-VALUES-A1.txt", "w") do f
for i in 1:dd
write(f, "$(t[i]) $(yr[i]) $(yi[i])\n")
end
end
end
end
timeneed = @elapsed calc(t, b, Delta, sysize, omegaf, omegai, J, D, P; nchunks=Threads.nthreads())
open("ISC-CODE-VERSION-A1-THREADED.TXT", "a") do f
write(f, "\n\n\n===============================================================\nThe time elapsed for main loop [INCLUDING GC + C + RC]= $timeneed seconds \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
end
The above code served its purpose, but now I am faced with the challenge of tackling systems where the variable sysize
becomes very large, causing the nested loop within the function GSV
to become extremely time consuming, to the point where I would benefit from being able to use multiple CPUs, each with 40 cores. To the best of my knowledge, the Distributed
package can be used to achieve this. In this code, as you can see, variables like X11, X12, etc. are used as workspaces, so I have given each thread its own copy of them and told each thread to work on only a part of the array t, using ChunkSplitters.jl.

Now, how do I convert this code into one that uses Distributed.jl
instead, so that it can be scaled up to multiple CPUs? Which variables do I need to declare with @everywhere
? Are there better alternatives to using Distributed.jl
? And most importantly, how do I rewrite the part that uses the Threads.@spawn
macro and thereby supplies one copy of the workspace variables to each thread, in the Distributed.jl
regime? I am a complete novice when it comes to parallel computing and programming in general, so any help is highly appreciated! If anyone wants to run the current working code (attached above) on their system, you may use the sample data files given in this Google Drive link.
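For concreteness, here is a very rough sketch of the structure I imagine a Distributed.jl version could have after reading the documentation. I am not at all sure it is correct or efficient; the helper function g_at_time and the file setup.jl (which I assume would hold the setup code above, i.e. the using statements, the constants, the readdlm calls and the definitions of GDSO and GSV, but not the call to calc) are just placeholders of mine.

using Distributed
addprocs(4) # or start Julia with `julia -p 4`; a cluster manager would be needed to span several machines

# Every worker is a separate process with its own memory, so the functions GDSO and GSV,
# the constants and the readdlm data all have to exist on every worker. Here I assume
# that setup code lives in a hypothetical file setup.jl and that the data files are
# readable from every worker.
@everywhere include("setup.jl")

# One call evaluates g(t) at a single time point. Each call allocates its own workspace
# matrices, so the processes never have to share them.
@everywhere function g_at_time(ti)
    Sf = Diagonal(zeros(ComplexF64, sysize))
    Si = Diagonal(zeros(ComplexF64, sysize))
    Bf = Diagonal(zeros(ComplexF64, sysize))
    Bi = Diagonal(zeros(ComplexF64, sysize))
    X11 = zeros(ComplexF64, sysize, sysize)
    X12 = zeros(ComplexF64, sysize, sysize)
    X21 = zeros(ComplexF64, sysize, sysize)
    X22 = zeros(ComplexF64, sysize, sysize)
    Y1 = zeros(ComplexF64, sysize, 1)
    Y2 = zeros(ComplexF64, sysize, 1)
    Hso = 0.48 * 0.0000046 # same SOC value as in the threaded code
    gdsopart = GDSO(ti, b, Delta, sysize, omegaf, omegai, J, D, P, Si, Sf, Bi, Bf)
    return GSV(ti, b, sysize, omegaf, omegai, J, D, Si, Sf, Bi, Bf,
               X11, X12, X21, X22, Y1, Y2, gdsopart, T, Hso)
end

# pmap hands the time points out to the workers and collects the results in order,
# which would replace the chunked Threads.@spawn loop inside calc.
x = pmap(g_at_time, t)

My main doubts are whether this is the right way to give every process its own copy of the workspace variables, and whether it would be better to send whole chunks of t to each worker (as ChunkSplitters does for threads) instead of letting pmap distribute the time points one by one.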