I have been trying to write a version of my Julia code that can be run in a parallelized manner using multiple nodes at once. After posting this thread I finally have a working version of the code (as given below)

## Code that uses Distributed.jl

```
using Distributed
@everywhere begin
using MKL
using DelimitedFiles
using LinearAlgebra
using LaTeXStrings
using TimerOutputs
using Plots
using Plots.PlotMeasures
using FFTW
using SharedArrays
using ChunkSplitters
BLAS.set_num_threads(1)
end
const to = TimerOutput()
println("Number of processes = $(Distributed.nworkers())")
open("ISC-DISTRIBUTED.TXT", "a") do f
write(f, "\n\n\n====================================================\n\n\n")
end
@everywhere function GDSO(
t::Float64,
b::Float64,
Ξ::Float64,
sysize::Int64,
Ξ©f::Diagonal{Float64},
Ξ©i::Diagonal{Float64},
J::Matrix{Float64},
D::Matrix{Float64},
Ο::Diagonal{Float64},
Si::Diagonal{ComplexF64},
Sf::Diagonal{ComplexF64},
Bi::Diagonal{ComplexF64},
Bf::Diagonal{ComplexF64}
)
t = t * 4.1341373335e16 # converting time to atomic units
tp = -im * b - t
for i = 1:sysize
Sf[i, i] = Ξ©f[i, i] / sin(Ξ©f[i, i] * t)
Si[i, i] = Ξ©i[i, i] / sin(Ξ©i[i, i] * tp)
Bf[i, i] = Ξ©f[i, i] / tan(Ξ©f[i, i] * t)
Bi[i, i] = Ξ©i[i, i] / tan(Ξ©i[i, i] * tp)
end
BB = (Bf + J' * Bi * J)
AA = (Sf + J' * Si * J)
W = [BB -AA
-AA BB] #We wont calculate the 2Nx2N determinant directly.
Wapp = BB * (BB - AA * (BB \ AA))
# U = Bi - Si
V = [J' * (Bi-Si) * D
J' * (Bi-Si) * D]
F1 = sqrt(det(Si * Sf * (Wapp \ Ο^2)))
F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * (Bi-Si) * D))
g = F1 * exp(F2)
Ξ = Ξ * 0.0367493 # in atomic units
g = g * exp(im * Ξ * t)
Ξ· = 2 * 4.55633e-6 # 10 cm^-1 to hatree energy
g = g * exp(-Ξ· * t^2)
return g[1]
end
@everywhere @views function GSV(
t::Float64,
b::Float64,
sysize::Int64,
Ξ©f::Diagonal{Float64},
Ξ©i::Diagonal{Float64},
J::Matrix{Float64},
D::Matrix{Float64},
Si::Diagonal{ComplexF64},
Sf::Diagonal{ComplexF64},
Bi::Diagonal{ComplexF64},
Bf::Diagonal{ComplexF64},
gdsopart::ComplexF64,
T::Matrix{Float64}, ## Product if NAC and SO
Hso::Float64,
X::Matrix{ComplexF64},
Y::Matrix{ComplexF64}
)
################################## Preliminary Part , Common for both Frank Condon and Herzberg Teller ##############################
t = t * 4.1341373335e16 # converting time to atomic units
tp = -im * b - t
for i = 1:sysize
Sf[i, i] = Ξ©f[i, i] / sin(Ξ©f[i, i] * t)
Si[i, i] = Ξ©i[i, i] / sin(Ξ©i[i, i] * tp)
Bf[i, i] = Ξ©f[i, i] / tan(Ξ©f[i, i] * t)
Bi[i, i] = Ξ©i[i, i] / tan(Ξ©i[i, i] * tp)
end
W = [(Bf+J'*Bi*J) -(Sf + J' * Si * J)
-(Sf + J' * Si * J) (Bf+J'*Bi*J)]
# U = Bi - Si
V = [J' * (Bi-Si) * D
J' * (Bi-Si) * D]
########## Defining the X and Y Matrices ##################################
################# Final Calculation and returning value ##################
s = zero(ComplexF64)
MJSJ = J' * Si * J
MJBJ = J' * Bi * J
MJUD = J' * (Bi-Si) * D
Win = inv(W)
WinV = Win * V
XWin = similar(Win)
for m in 1:sysize
for mp in 1:sysize
X[m, 1:sysize] .= (MJSJ)[mp, :] .* (-Bf[m, m])
X[m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (Bf[m, m])
X[sysize+m, 1:sysize] .= (MJSJ)[mp, :] .* (Sf[m, m])
X[sysize+m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (-Sf[m, m])
Y[m, 1] = Bf[m, m] * (MJUD)[mp, 1]
Y[m+sysize, 1] = -Sf[m, m] * (MJUD)[mp, 1]
g = im * (tr(mul!(XWin, X, Win)) + (transpose(WinV)*X*WinV)[1] - (transpose(Y)*WinV)[1])
s = s + g * gdsopart * T[m, mp]
end
X[m, 1:sysize] .= 0
X[m, sysize+1:2*sysize] .= 0
X[sysize+m, 1:sysize] .= 0
X[sysize+m, sysize+1:2*sysize] .= 0
Y[m, 1] = 0
Y[m+sysize, 1] = 0
end
s = s + abs(Hso)^2 * gdsopart #### LATEST MODIFICATION
return s
end
@everywhere begin
const hso_s1t3 = 1.49 * 0.0000046 ## SOC between S1 and T3 = 0.48 cm-1, converting to atomic units
const Delta = 0.11 # in eV
const Delta_s1t3 = 0.091 * 0.037 # Difference between S1 and T3 = 0.091 eV in atomic units
const dd = 41 # Change this to modify number of points
const Temp = 300
kT_temp = Temp * 1.380649e-23 # Boltzmann constant times temperature in Joules
const kT = kT_temp * 2.29371227840e17 # in atomic units
const b = 1 / kT
const omegaf = Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n'))) * 4.55633e-6 # Final state frequencies in atomic units
const omegai = Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n'))) * 4.55633e-6 # Initial state frequencies in atomic units
const sysize = size(omegai)[1]
const P = @. 2 * sinh(b * omegai / 2)
T_temp = readdlm("nac.txt", '\t', Float64, '\n') # NAC matrix
const T = (T_temp * T_temp')* (hso_s1t3^2/Delta_s1t3^2)
const D = readdlm("d.txt", Float64) # Displacement Matrix
const J = readdlm("j.txt", Float64) # Duchinsky Matrix
const t = collect(range(-5e-12, stop=5e-12, length=dd))
t[div(dd + 1, 2)] = 10e-25
end
@everywhere function worker_function(xrange, t, b, Ξ, sysize, Ξ©f, Ξ©i, J, D, P, T, Hso)
Sf = Diagonal(zeros(ComplexF64, sysize))
Si = Diagonal(zeros(ComplexF64, sysize))
Bf = Diagonal(zeros(ComplexF64, sysize))
Bi = Diagonal(zeros(ComplexF64, sysize))
X = zeros(ComplexF64, 2 * sysize, 2 * sysize)
Y = zeros(ComplexF64, 2 * sysize, 1)
x_worker = zeros(ComplexF64, length(xrange))
for (local_xind,main_xind) in enumerate(xrange)
gdsopart = GDSO(t[main_xind], b, Ξ, sysize, Ξ©f, Ξ©i, J, D, P, Si, Sf, Bi, Bf)
gtotal = GSV(t[main_xind], b, sysize, Ξ©f, Ξ©i, J, D, Si, Sf, Bi, Bf, gdsopart, T, Hso, X, Y)
x_worker[local_xind] = gtotal
open("ISC-DISTRIBUTED.TXT", "a") do f
write(f, "ITERATION $main_xind COMPLETED BY PROCESS $(Distributed.myid()) \n")
end
end
return x_worker
end
function calc(t, b, Ξ, sysize, Ξ©f, Ξ©i, J, D, P; nchunks=Distributed.nworkers())
open("ISC-DISTRIBUTED.TXT", "w") do f
write(f, "STARTING LOOP WITH NUMBER OF CORES = $(Distributed.nworkers()) \n====================================================\n")
end
x = zeros(ComplexF64, dd)
Hso = 0.48 * 0.0000046 ## SOC between S1 and T2 = 0.48 cm-1, converting to atomic units
xrange_chunknum = chunks(range(1, dd), nchunks)
@timeit to "GT Calculation" x = @distributed vcat for ichunk in 1:nchunks
xrange,chunknum = xrange_chunknum[ichunk]
worker_function(xrange, t, b, Ξ, sysize, Ξ©f, Ξ©i, J, D, P, T, Hso)
end
open("ISC-DISTRIBUTED.TXT", "a") do f
write(f, "LOOP COMPLETE \n====================================================")
end
yr = real.(x)
yi = imag.(x)
p = plot(t, yr, linewidth=2, color="blue", title="Ξ = $Ξ", label="Real part", grid=false)
p = plot!(t, yi, linewidth=2, color="red", label="Imaginary part", size=(1000, 800), xlims=(-30e-15, 30e-15))
savefig(p, "gt.svg")
begin # This part does FFTW and finds final rate constant
fs = abs(t[1] - t[2])^-1
signal = yr
p = plan_fft(signal)
F = fftshift(p * signal)
freqs = fftshift(fftfreq(length(t), fs)) # X axis has frequencies in s^-1
#Covert freqs to eV
freqs = freqs * 4.13558e-15
F = F * 6.57e15
time_domain = plot(t, signal, title=L"G(t)", legend=false, color="orange", xlabel=L"\mathrm{Time}\ (s)", ylabel=L"\mathrm{A.u}")
freq_domain = plot(freqs, abs.(F), title=L"\mathrm{Fourier\ transformed\ g(\bar{\nu}))}", legend=false, xlabel=L"\mathrm{Energy\ (eV)}", ylabel=L"\mathrm{k_{IC}\ (s^{-1})}", xlims=(0, 4))
p = plot(time_domain, freq_domain, layout=(2, 1), size=(1000, 800), right_margin=10mm)
savefig(p, "gt-fft.svg")
ee = Ξ# * 8065.73 #Converting eV to cm^-1
index = findmin(abs.(freqs .- ee))[2] # Find the index of freq closest to ee
freqs[index]
println("The rate constant is ", abs.(F)[index])
rc = abs.(F)[index]
end
#################################################This part writes the output to a file#######################################################
begin
open("ISC-DISTRIBUTED.TXT", "a") do f
write(f, "-------------------------\n")
write(f, "NUMBER OF CORES = $(Distributed.nworkers())\n")
write(f, "The value of Ξ is $Ξ\n")
write(f, "Number of time points = $dd\n")
write(f, "The rate constant calculated from Julia FFTW = $rc\n")
write(f, "-------------------------\n")
write(f, "The values of t,Re(g(t)),Im(g(t)) are \n\n")
for i in 1:dd
write(f, "$(t[i]) $(yr[i]) $(yi[i])\n")
end
end
open("GT-VALUES-DISTRIBTUED.txt", "w") do f
for i in 1:dd
write(f, "$(t[i]) $(yr[i]) $(yi[i])\n")
end
end
end
end
calc(t, b, Delta, sysize, omegaf, omegai, J, D, P; nchunks=Distributed.nworkers())
display(to)
```

The aim is to develop a version of this code that runs faster than the Fortran equivalent. Keeping in mind the fact that the serial version of our code is already faster than the Fortran equivalent (see this thread), now the next thing to do is to make it capable of multi-node parallelization that the Fortran code achieves using MPI.

Previously, we have seen that the version of the code that uses `Base.Threads`

outperforms the MPI parallelized Fortran code when ran on a single node. But as you might already be about to point out, that is not a fair comparison since the threaded version cannot achieve multi node parallelization. For this the version using Distributed.jl was developed. But it seems that the scaling right now is pretty bad.

```
|Code Version | Number of Cores | Time Taken |
-------------------------------------------------------------------
|Julia [Base.Threads] | 40 | 39hrs |
|Julia [Distributed.jl] | 80 [40 + 40 using 2 nodes] | 31hrs |
|Fortran [Using MPI] | 80 [40 + 40 using 2 nodes] | <20 hrs |
```

I am aware that the Benchmarks provided are not ideal, it is not fair to expect the Distributed.jl version of the code (when using 80 total cores) to complete the task in half the amount of time taken by the `Base.Threads`

version (which was using 40 cores). As you can understand, the times are very long and hence not too many tests could be ran on the cluster I am using. Is the bad scaling in question a consequence of some bad code design I have implemented ? Or does it reflect the difference in speeds of MPI and Distributed.jl, do I have to use MPI.jl to get comparable speeds to this? Any advice on how to improve the scaling is greatly appreciated.