In the meantime I have finished an implementation that uses MPI.jl exclusively, and it seems to produce the correct output.
Using MPI.jl
#
using MPI
MPI.Init()
using MKL
using DelimitedFiles
using LinearAlgebra
using LaTeXStrings
using TimerOutputs
using Plots
using Plots.PlotMeasures
using FFTW
using SharedArrays
using TensorOperations
using ChunkSplitters
BLAS.set_num_threads(1)
const to = TimerOutput()
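# GDSO: builds BB, AA, W, Wapp and V from the initial/final-state frequencies, the Duschinsky
# matrix J and the displacement vector D, and returns the complex scalar g(t) (the `gdsopart`
# consumed by GSV below). All work arrays are preallocated by the caller and overwritten on every call.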
@views function GDSO(
t::Float64,
b::Float64,
Δ::Float64,
sysize::Int64,
Ωf::Diagonal{Float64},
Ωi::Diagonal{Float64},
J::Matrix{Float64},
D::Vector{Float64},
ρ::Diagonal{Float64},
Si::Diagonal{ComplexF64},
Sf::Diagonal{ComplexF64},
Bi::Diagonal{ComplexF64},
Bf::Diagonal{ComplexF64},
JpBi::Matrix{ComplexF64},
JpSi::Matrix{ComplexF64},
BB::Matrix{ComplexF64},
AA::Matrix{ComplexF64},
W::Matrix{ComplexF64},
Wapp::Matrix{ComplexF64},
Wapp_i1::Matrix{ComplexF64},
V::Vector{ComplexF64},
V_int1::Vector{ComplexF64},
V_int2::Matrix{ComplexF64},
BimSi::Diagonal{ComplexF64}
)
t = t * 4.1341373335e16 # converting time to atomic units
tp = -im * b - t
for i = 1:sysize
Sf[i, i] = Ωf[i, i] / sin(Ωf[i, i] * t)
Si[i, i] = Ωi[i, i] / sin(Ωi[i, i] * tp)
Bf[i, i] = Ωf[i, i] / tan(Ωf[i, i] * t)
Bi[i, i] = Ωi[i, i] / tan(Ωi[i, i] * tp)
end
# BB = (Bf + J' * Bi * J)
# AA = (Sf + J' * Si * J)
mul!(JpBi,J',Bi)
mul!(BB,JpBi,J)
BB .+= Bf
mul!(JpSi,J',Si)
mul!(AA,JpSi,J)
AA .+= Sf
# W = [BB -AA
# -AA BB]
W[1:sysize,1:sysize] .= BB
W[1:sysize,sysize+1:2*sysize] .= -AA
W[sysize+1:2*sysize,1:sysize] .= -AA
W[sysize+1:2*sysize,sysize+1:2*sysize] .= BB
# We won't calculate the 2N x 2N determinant directly.
# Wapp = BB * (BB - AA * (BB \ AA))
BBinvAA = BB \ AA
mul!(Wapp_i1,AA,BBinvAA,-1,0)
Wapp_i1 .+= BB
mul!(Wapp,BB,Wapp_i1)
# V = [J' * (Bi-Si) * D
# J' * (Bi-Si) * D]
BimSi .= Bi - Si
mul!(V_int1,BimSi,D)
mul!(view(V, 1:sysize), J', V_int1)
V[sysize+1:end] .= V[1:sysize]
# mul!(V_int2,J',V_int1)
# V[1:sysize,1] .= V_int2
# V[sysize+1:2*sysize,1] .= V_int2
F1 = sqrt(det(Si * Sf * (Wapp \ ρ^2)))
F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * (Bi-Si) * D))
g = F1 * exp(F2)
Δ = Δ * 0.0367493 # in atomic units
g = g * exp(im * Δ * t)
η = 2 * 4.55633e-6 # 10 cm^-1 to hartree energy
g = g * exp(-η * abs(t))
return g[1]
end
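# GSV: loops over all mode pairs (m, mp), fills the X matrix and Y vector for each pair,
# accumulates the correction term weighted by gdsopart and T[m, mp], and finally adds
# abs(Hso)^2 * gdsopart. W and V must already hold the values written by GDSO for the same time point.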
@views function GSV(
t::Float64,
b::Float64,
sysize::Int64,
Ωf::Diagonal{Float64},
Ωi::Diagonal{Float64},
J::Matrix{Float64},
D::Vector{Float64},
Si::Diagonal{ComplexF64},
Sf::Diagonal{ComplexF64},
Bi::Diagonal{ComplexF64},
Bf::Diagonal{ComplexF64},
gdsopart::ComplexF64,
T::Matrix{Float64}, ## Product of NAC and SOC
Hso::Float64,
X::Matrix{ComplexF64},
Y::Vector{ComplexF64},
W::Matrix{ComplexF64},
V::Vector{ComplexF64},
WinV::Vector{ComplexF64},
XWin::Matrix{ComplexF64}
)
################################## Preliminary part, common for both Franck-Condon and Herzberg-Teller ##############################
t = t * 4.1341373335e16 # converting time to atomic units
# tp = -im * b - t
# for i = 1:sysize
# Sf[i, i] = Ωf[i, i] / sin(Ωf[i, i] * t)
# Si[i, i] = Ωi[i, i] / sin(Ωi[i, i] * tp)
# Bf[i, i] = Ωf[i, i] / tan(Ωf[i, i] * t)
# Bi[i, i] = Ωi[i, i] / tan(Ωi[i, i] * tp)
# end
# W = [(Bf+J'*Bi*J) -(Sf + J' * Si * J)
# -(Sf + J' * Si * J) (Bf+J'*Bi*J)]
# # U = Bi - Si
# V = [J' * (Bi-Si) * D
# J' * (Bi-Si) * D]
################# Final Calculation and returning value ##################
@timeit to "Pre Loop Allocations" begin
s = zero(ComplexF64)
MJSJ = J' * Si * J
MJBJ = J' * Bi * J
MJUD = J' * (Bi-Si) * D
Win = inv(W)
mul!(WinV,Win,V)
end
@timeit to "Most Expensive Loop" begin
for m in 1:sysize
for mp in 1:sysize
########## Defining the X and Y Matrices ##################################
@timeit to "X Y Matrices Writing" begin
X[m, 1:sysize] .= (MJSJ)[mp, :] .* (-Bf[m, m])
X[m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (Bf[m, m])
X[sysize+m, 1:sysize] .= (MJSJ)[mp, :] .* (Sf[m, m])
X[sysize+m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (-Sf[m, m])
Y[m, 1] = Bf[m, m] * (MJUD)[mp, 1]
Y[m+sysize, 1] = -Sf[m, m] * (MJUD)[mp, 1]
end
@timeit to "Final Multiplications" begin
# g = im * (tr(mul!(XWin, X, Win)) + (transpose(WinV)*X*WinV)[1] - (transpose(Y)*WinV)[1])
g = im * (tr(mul!(XWin, X, Win)) + dot(WinV, X, WinV) - dot(Y, WinV))
s = s + g * gdsopart * T[m, mp]
end
end
X[m, 1:sysize] .= 0
X[m, sysize+1:2*sysize] .= 0
X[sysize+m, 1:sysize] .= 0
X[sysize+m, sysize+1:2*sysize] .= 0
Y[m, 1] = 0
Y[m+sysize, 1] = 0
end
end
s = s + abs(Hso)^2 * gdsopart #### LATEST MODIFICATION
return s
end
const hso_s1t3 = 1.49 * 0.0000046 ## SOC between S1 and T3 = 1.49 cm-1, converted to atomic units
const Delta = 0.11 # in eV
const Delta_s1t3 = 0.091 * 0.037 # Difference between S1 and T3 = 0.091 eV in atomic units
const dd = 22 # Change this to modify number of points
const Temp = 300
kT_temp = Temp * 1.380649e-23 # Boltzmann constant times temperature in Joules
const kT = kT_temp * 2.29371227840e17 # in atomic units
const b = 1 / kT
const omegaf = Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n'))) * 4.55633e-6 # Final state frequencies in atomic units
const omegai = Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n'))) * 4.55633e-6 # Initial state frequencies in atomic units
const sysize = size(omegai)[1]
const P = @. 2 * sinh(b * omegai / 2)
T_temp = readdlm("nac.txt", '\t', Float64, '\n') # NAC matrix
const T = (T_temp * T_temp')* (hso_s1t3^2/Delta_s1t3^2)
const D = readdlm("d.txt", Float64)[:,1] # Displacement Matrix
const J = readdlm("j.txt", Float64) # Duchinsky Matrix
const t = collect(range(-5e-12, stop=5e-12, length=dd))
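# Replace the central time point with a tiny nonzero value to avoid t = 0, where sin(Ω t) vanishes.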
t[div(dd + 1, 2)] = 10e-25
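# worker_function: allocates all per-rank scratch arrays once, then evaluates GDSO followed by GSV
# for every time index in `xrange` assigned to this MPI rank, logging each completed iteration to
# ISC-MPI.TXT. Returns the vector of g(t) values for this rank's chunk.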
function worker_function(xrange, t, b, Δ, sysize, Ωf, Ωi, J, D, P, T, Hso,mpi_rank,mpi_size)
Sf = Diagonal(zeros(ComplexF64, sysize))
Si = Diagonal(zeros(ComplexF64, sysize))
Bf = Diagonal(zeros(ComplexF64, sysize))
Bi = Diagonal(zeros(ComplexF64, sysize))
X = zeros(ComplexF64, 2 * sysize, 2 * sysize)
Y = zeros(ComplexF64, 2 * sysize)
BB = zeros(ComplexF64, sysize, sysize)
AA = zeros(ComplexF64, sysize, sysize)
JpBi = zeros(ComplexF64, sysize, sysize)
JpSi = zeros(ComplexF64, sysize, sysize)
W = zeros(ComplexF64, 2 * sysize, 2 * sysize)
Wapp = zeros(ComplexF64, sysize, sysize)
Wapp_i1 = zeros(ComplexF64, sysize, sysize)
V = zeros(ComplexF64, 2 * sysize)
V_int1 = zeros(ComplexF64, sysize)
V_int2 = zeros(ComplexF64, sysize,1)
BimSi = Diagonal(zeros(ComplexF64, sysize, sysize))
WinV = zeros(ComplexF64, 2 * sysize)
WinV_transpose = zeros(ComplexF64, 1, 2 * sysize)
XWin = zeros(ComplexF64, 2 * sysize, 2 * sysize)
Hso = 0.48 * 0.0000046 ## SOC between S1 and T2 = 0.48 cm-1, converting to atomic units
x_worker = zeros(ComplexF64, length(xrange))
for (local_xind,main_xind) in enumerate(xrange)
gdsopart = GDSO(t[main_xind], b, Δ, sysize, Ωf, Ωi, J, D, P, Si, Sf, Bi, Bf, JpBi, JpSi, BB, AA, W, Wapp, Wapp_i1, V, V_int1, V_int2, BimSi)
gtotal = GSV(t[main_xind], b, sysize, Ωf, Ωi, J, D, Si, Sf, Bi, Bf, gdsopart, T, Hso, X, Y, W, V, WinV, XWin)
x_worker[local_xind] = gtotal
open("ISC-MPI.TXT", "a") do f
write(f, "ITERATION $main_xind COMPLETED BY PROCESS $(mpi_rank) \n")
end
end
return x_worker
end
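# calc: splits the dd time indices into mpi_size chunks (one per rank), runs worker_function on this
# rank's chunk, gathers the per-rank results on rank 0, and writes the combined t, Re(g(t)), Im(g(t))
# data to ISC-MPI.TXT and GT-MPI.TXT.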
function calc(t, b, Δ, sysize, Ωf, Ωi, J, D, P)
comm = MPI.COMM_WORLD
mpi_rank = MPI.Comm_rank(comm)
mpi_size = MPI.Comm_size(comm)
open("ISC-MPI.TXT", "w") do f
write(f, "STARTING LOOP WITH NUMBER OF CORES = $(mpi_size) \n====================================================\n")
end
Hso = 0.48 * 0.0000046 ## SOC between S1 and T2 = 0.48 cm-1, converting to atomic units
xrange_chunknum = chunks(range(1, dd), mpi_size)
@timeit to "GT Calculation" begin
# for ichunk in 1:nchunks
# xrange,chunknum = xrange_chunknum[ichunk]
# inter_result = worker_function(xrange, t, b, Δ, sysize, Ωf, Ωi, J, D, P, T, Hso,mpi_rank,mpi_size)
# end
xrange,chunknum = xrange_chunknum[mpi_rank+1]
inter_result = worker_function(xrange, t, b, Δ, sysize, Ωf, Ωi, J, D, P, T, Hso,mpi_rank,mpi_size)
MPI.Barrier(comm)
x = MPI.gather(inter_result,comm,root=0)
end
if mpi_rank == 0
x = vcat(x...)
println("LOOP COMPLETE")
open("ISC-MPI.TXT", "a") do f
write(f, "LOOP COMPLETE \n====================================================")
end
yr = real.(x)
yi = imag.(x)
begin
open("ISC-MPI.TXT", "a") do f
write(f, "-------------------------\n")
write(f, "NUMBER OF CORES = $(1)\n")
write(f, "The value of Δ is $Δ\n")
write(f, "Number of time points = $dd\n")
write(f, "Central Value = $(yr[div(dd + 1, 2)])\n")
write(f, "-------------------------\n")
write(f, "The values of t,Re(g(t)),Im(g(t)) are \n\n")
for i in 1:dd
write(f, "$(t[i]) $(yr[i]) $(yi[i])\n")
end
end
open("GT-MPI.TXT", "w") do f
for i in 1:dd
write(f, "$(t[i]) $(yr[i]) $(yi[i])\n")
end
end
end
display(to)
end
end
calc(t, b, Delta, sysize, omegaf, omegai, J, D, P)
MPI.Finalize()
However, the initial benchmarks indicate that it is quite a bit slower than the Distributed.jl implementation. The first output below is from the MPI.jl version and the second from the Distributed.jl version.
anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc/version august 2024$ mpiexec --use-hwthread-cpus -n 11 julia mpi_parallel_timeloop.jl
LOOP COMPLETE
────────────────────────────────────────────────────────────────────────────────
Time Allocations
─────────────────────── ────────────────────────
Tot / % measured: 173s / 90.1% 903MiB / 4.6%
Section ncalls time %tot avg alloc %tot avg
────────────────────────────────────────────────────────────────────────────────
GT Calculation 1 156s 100.0% 156s 41.1MiB 100.0% 41.1MiB
Most Expensive Loop 19 114s 73.1% 6.01s 1.47KiB 0.0% 79.2B
Final Multipli... 107k 114s 72.8% 1.06ms 0.00B 0.0% 0.00B
X Y Matrices W... 107k 336ms 0.2% 3.14μs 0.00B 0.0% 0.00B
Pre Loop Allocat... 19 43.1ms 0.0% 2.27ms 15.9MiB 38.8% 859KiB
────────────────────────────────────────────────────────────────────────────────
anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc/version august 2024$ julia -p 11 2distribute.jl
Number of processes = 11
The rate constant is 41993.588419884196
───────────────────────────────────────────────────────────────────────────
Time Allocations
─────────────────────── ────────────────────────
Tot / % measured: 149s / 95.2% 656MiB / 7.4%
Section ncalls time %tot avg alloc %tot avg
───────────────────────────────────────────────────────────────────────────
GT Calculation 1 142s 100.0% 142s 48.3MiB 100.0% 48.3MiB
───────────────────────────────────────────────────────────────────────────
In the meantime, I will try to get a full dd=20001 benchmark running on my cluster to compare the results, since I have seen that relative speeds can vary considerably between my personal computer and the cluster.
As for this:
How exactly can I trigger the profiler on all processes?
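To make the question concrete, the kind of per-rank profiling I have in mind is roughly the following (a minimal sketch using the standard Profile stdlib and a placeholder workload; the per-rank file names are purely illustrative, and I am not sure this is the intended way to do it):
using MPI, Profile
MPI.Init()
rank = MPI.Comm_rank(MPI.COMM_WORLD)
# Placeholder for the real per-rank workload (worker_function in my actual code).
work() = sum(sum(sin, rand(1_000)) for _ in 1:10_000)
Profile.@profile work()
# Each rank writes its own report so the outputs from different processes do not interleave.
open("profile_rank_$rank.txt", "w") do io
    Profile.print(io)
end
MPI.Finalize()
If there is a cleaner or more standard way to do this across ranks, that is essentially what I am asking.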
EDIT 1 - There seems to be a problem getting MPI.jl to play nicely with my cluster; I have opened another thread for the error I am facing.