How to improve the scaling of Julia code aimed at multi-node parallelization?

In the meantime I have gotten an implementation ready that uses MPI.jl exclusively, and it seems to produce the correct output.

Using MPI.jl



using MPI
MPI.Init()
using MKL
using DelimitedFiles
using LinearAlgebra
using LaTeXStrings
using TimerOutputs
using Plots
using Plots.PlotMeasures
using FFTW
using SharedArrays
using TensorOperations
using ChunkSplitters

BLAS.set_num_threads(1)
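# (BLAS is kept single-threaded above so that the per-rank linear algebra does not
#  oversubscribe cores when one MPI rank is launched per core/hardware thread)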
const to = TimerOutput()


@views function GDSO(
    t::Float64,
    b::Float64,
    Δ::Float64,
    sysize::Int64,
    Ωf::Diagonal{Float64},
    Ωi::Diagonal{Float64},
    J::Matrix{Float64},
    D::Vector{Float64},
    ρ::Diagonal{Float64},
    Si::Diagonal{ComplexF64},
    Sf::Diagonal{ComplexF64},
    Bi::Diagonal{ComplexF64},
    Bf::Diagonal{ComplexF64},
    JpBi::Matrix{ComplexF64},
    JpSi::Matrix{ComplexF64},
    BB::Matrix{ComplexF64},
    AA::Matrix{ComplexF64},
    W::Matrix{ComplexF64},
    Wapp::Matrix{ComplexF64},
    Wapp_i1::Matrix{ComplexF64},
    V::Vector{ComplexF64},
    V_int1::Vector{ComplexF64},
    V_int2::Matrix{ComplexF64},
    BimSi::Diagonal{ComplexF64}
    
)
    t = t * 4.1341373335e16                     # converting time to atomic units
    tp = -im * b - t
    for i = 1:sysize
        Sf[i, i] = Ωf[i, i] / sin(Ωf[i, i] * t)
        Si[i, i] = Ωi[i, i] / sin(Ωi[i, i] * tp)
        Bf[i, i] = Ωf[i, i] / tan(Ωf[i, i] * t)
        Bi[i, i] = Ωi[i, i] / tan(Ωi[i, i] * tp)
    end
    # BB = (Bf + J' * Bi * J)
    # AA = (Sf + J' * Si * J)
    mul!(JpBi,J',Bi)
    mul!(BB,JpBi,J)
    BB .+= Bf
    mul!(JpSi,J',Si)
    mul!(AA,JpSi,J)
    AA .+= Sf

    # W = [BB -AA
    #     -AA BB] 
    W[1:sysize,1:sysize] .= BB
    W[1:sysize,sysize+1:2*sysize] .= -AA
    W[sysize+1:2*sysize,1:sysize] .= -AA
    W[sysize+1:2*sysize,sysize+1:2*sysize] .= BB

    # We won't calculate the 2N x 2N determinant directly.
    # Wapp = BB * (BB - AA * (BB \ AA))
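    # For the block matrix W = [BB -AA; -AA BB] the block-determinant (Schur complement)
    # identity gives det(W) = det(BB) * det(BB - AA * BB^-1 * AA), so det(W) can be read
    # off from the N x N matrix Wapp computed below.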
    BBinvAA = BB \ AA
    mul!(Wapp_i1,AA,BBinvAA,-1,0)
    Wapp_i1 .+= BB
    mul!(Wapp,BB,Wapp_i1)

    # V = [J' * (Bi-Si) * D
    #      J' * (Bi-Si) * D]
    BimSi .= Bi - Si
    mul!(V_int1,BimSi,D)
    mul!(view(V, 1:sysize), J', V_int1)
    V[sysize+1:end] .=  V[1:sysize]
    # mul!(V_int2,J',V_int1)
    # V[1:sysize,1] .= V_int2
    # V[sysize+1:2*sysize,1] .= V_int2

    F1 = sqrt(det(Si * Sf * (Wapp \ ρ^2)))
    F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * (Bi-Si) * D))
    g = F1 * exp(F2)
    Δ = Δ * 0.0367493  # in atomic units
    g = g * exp(im * Δ * t)
    η = 2 * 4.55633e-6  # 10 cm^-1 to hartree energy
    g = g * exp(-η * abs(t))
    return g[1]
end

@views function GSV(
    t::Float64,
    b::Float64,
    sysize::Int64,
    Ωf::Diagonal{Float64},
    Ωi::Diagonal{Float64},
    J::Matrix{Float64},
    D::Vector{Float64},
    Si::Diagonal{ComplexF64},
    Sf::Diagonal{ComplexF64},
    Bi::Diagonal{ComplexF64},
    Bf::Diagonal{ComplexF64},
    gdsopart::ComplexF64,
    T::Matrix{Float64},       ## Product of NAC and SO
    Hso::Float64,
    X::Matrix{ComplexF64},
    Y::Vector{ComplexF64},
    W::Matrix{ComplexF64},
    V::Vector{ComplexF64},
    WinV::Vector{ComplexF64},
    XWin::Matrix{ComplexF64}
)

    ################################## Preliminary Part, common to both Franck-Condon and Herzberg-Teller ##############################
    t = t * 4.1341373335e16                     # converting time to atomic units
    # tp = -im * b - t
    # for i = 1:sysize
    #     Sf[i, i] = Ωf[i, i] / sin(Ωf[i, i] * t)
    #     Si[i, i] = Ωi[i, i] / sin(Ωi[i, i] * tp)
    #     Bf[i, i] = Ωf[i, i] / tan(Ωf[i, i] * t)
    #     Bi[i, i] = Ωi[i, i] / tan(Ωi[i, i] * tp)
    # end
    # W = [(Bf+J'*Bi*J) -(Sf + J' * Si * J)
    #     -(Sf + J' * Si * J) (Bf+J'*Bi*J)]
    # # U = Bi - Si
    # V = [J' * (Bi-Si) * D
    #     J' * (Bi-Si) * D]




    ################# Final Calculation and returning value ##################
    @timeit to "Pre Loop Allocations" begin
    s = zero(ComplexF64)
    MJSJ = J' * Si * J
    MJBJ = J' * Bi * J
    MJUD = J' * (Bi-Si) * D
    Win = inv(W)
    mul!(WinV,Win,V)
    end
    @timeit to "Most Expensive Loop" begin
    for m in 1:sysize
        for mp in 1:sysize
            ########## Defining the X and Y Matrices ##################################
            @timeit to "X Y Matrices Writing" begin
            X[m, 1:sysize] .= (MJSJ)[mp, :] .* (-Bf[m, m])
            X[m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (Bf[m, m])
            X[sysize+m, 1:sysize] .= (MJSJ)[mp, :] .* (Sf[m, m])
            X[sysize+m, sysize+1:2*sysize] .= (MJBJ)[mp, :] .* (-Sf[m, m])

            Y[m, 1] = Bf[m, m] * (MJUD)[mp, 1]
            Y[m+sysize, 1] = -Sf[m, m] * (MJUD)[mp, 1]
            end
            @timeit to "Final Multiplications" begin

            # g = im * (tr(mul!(XWin, X, Win))  + (transpose(WinV)*X*WinV)[1] - (transpose(Y)*WinV)[1])
            g = im * (tr(mul!(XWin, X, Win))  + dot(WinV, X, WinV) - dot(Y, WinV))
            


            s = s + g * gdsopart * T[m, mp]
            end

        end
        X[m, 1:sysize] .= 0
        X[m, sysize+1:2*sysize] .= 0
        X[sysize+m, 1:sysize] .= 0
        X[sysize+m, sysize+1:2*sysize] .= 0
        Y[m, 1] = 0
        Y[m+sysize, 1] = 0
    end
end
    s = s + abs(Hso)^2 * gdsopart   #### LATEST MODIFICATION
    return s
end

const hso_s1t3 = 1.49 * 0.0000046  ## SOC between S1 and T3 in cm^-1, converted to atomic units
const Delta = 0.11 # in eV
const Delta_s1t3 = 0.091 * 0.037 # Difference between S1 and T3 = 0.091 eV in atomic units
const dd = 22                            # Change this to modify number of points
const Temp = 300
kT_temp = Temp * 1.380649e-23  # Boltzmann constant times temperature in Joules
const kT = kT_temp * 2.29371227840e17         # in atomic units
const b = 1 / kT
const omegaf = Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n'))) * 4.55633e-6   # Final state  frequencies in atomic units
const omegai = Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n'))) * 4.55633e-6  # Initial state  frequencies in atomic units
const sysize = size(omegai)[1]
const P = @. 2 * sinh(b * omegai / 2)
T_temp = readdlm("nac.txt", '\t', Float64, '\n') # NAC matrix
const T = (T_temp * T_temp')* (hso_s1t3^2/Delta_s1t3^2)
const D = readdlm("d.txt", Float64)[:,1] # Displacement Matrix
const J = readdlm("j.txt", Float64) # Duchinsky Matrix
const t = collect(range(-5e-12, stop=5e-12, length=dd))
t[div(dd + 1, 2)] = 10e-25
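# (the central grid point is shifted slightly away from t = 0, presumably to avoid
#  the 1/sin(Ω*t) singularities in GDSO at t = 0)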


function worker_function(xrange, t, b, Δ, sysize, Ωf, Ωi, J, D, P, T, Hso,mpi_rank,mpi_size)
    Sf = Diagonal(zeros(ComplexF64, sysize))
    Si = Diagonal(zeros(ComplexF64, sysize))
    Bf = Diagonal(zeros(ComplexF64, sysize))
    Bi = Diagonal(zeros(ComplexF64, sysize))
    X = zeros(ComplexF64, 2 * sysize, 2 * sysize)
    Y = zeros(ComplexF64, 2 * sysize)
    BB = zeros(ComplexF64, sysize, sysize)
    AA = zeros(ComplexF64, sysize, sysize)
    JpBi = zeros(ComplexF64, sysize, sysize)
    JpSi = zeros(ComplexF64, sysize, sysize)
    W = zeros(ComplexF64, 2 * sysize, 2 * sysize)
    Wapp = zeros(ComplexF64, sysize, sysize)
    Wapp_i1 = zeros(ComplexF64, sysize, sysize)
    V =  zeros(ComplexF64, 2 * sysize)
    V_int1 = zeros(ComplexF64, sysize)
    V_int2 = zeros(ComplexF64, sysize,1)
    BimSi = Diagonal(zeros(ComplexF64, sysize, sysize))
    WinV = zeros(ComplexF64, 2 * sysize)
    XWin = zeros(ComplexF64, 2 * sysize, 2 * sysize)
    x_worker = zeros(ComplexF64, length(xrange))
    for (local_xind,main_xind) in enumerate(xrange)
        gdsopart = GDSO(t[main_xind], b, Δ, sysize, Ωf, Ωi, J, D, P, Si, Sf, Bi, Bf, JpBi, JpSi, BB, AA, W, Wapp, Wapp_i1, V, V_int1, V_int2, BimSi)
        gtotal = GSV(t[main_xind], b, sysize, Ωf, Ωi, J, D, Si, Sf, Bi, Bf, gdsopart, T, Hso, X, Y, W, V, WinV, XWin)
        x_worker[local_xind] = gtotal
        open("ISC-MPI.TXT", "a") do f
            write(f, "ITERATION $main_xind COMPLETED BY PROCESS $(mpi_rank) \n")
        end
    end
    return x_worker
end




function calc(t, b, Δ, sysize, Ωf, Ωi, J, D, P)
    comm = MPI.COMM_WORLD
    mpi_rank = MPI.Comm_rank(comm)
    mpi_size = MPI.Comm_size(comm)
    open("ISC-MPI.TXT", "w") do f
        write(f, "STARTING LOOP WITH NUMBER OF CORES = $(mpi_size) \n====================================================\n")
    end

    Hso = 0.48 * 0.0000046  ## SOC between S1 and T2 = 0.48 cm-1, converting to atomic units
    xrange_chunknum = chunks(range(1, dd), mpi_size)
    @timeit to "GT Calculation" begin  
        # for ichunk in 1:nchunks
        #     xrange,chunknum = xrange_chunknum[ichunk]
        #     inter_result = worker_function(xrange, t, b, Δ, sysize, Ωf, Ωi, J, D, P, T, Hso,mpi_rank,mpi_size)
        # end
        xrange,chunknum = xrange_chunknum[mpi_rank+1]
        inter_result = worker_function(xrange, t, b, Δ, sysize, Ωf, Ωi, J, D, P, T, Hso,mpi_rank,mpi_size)
        MPI.Barrier(comm)
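        # MPI.gather serializes each rank's chunk; the root receives a Vector of
        # per-rank results (vcat'ed below), while the other ranks receive nothing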
        x = MPI.gather(inter_result,comm,root=0)

    end


    if mpi_rank == 0
        x = vcat(x...)
        println("LOOP COMPLETE")
        open("ISC-MPI.TXT", "a") do f
            write(f, "LOOP COMPLETE \n====================================================")
        end
        yr = real.(x)
        yi = imag.(x)

        open("ISC-MPI.TXT", "a") do f
            write(f, "-------------------------\n")
            write(f, "NUMBER OF CORES = $(mpi_size)\n")
            write(f, "The value of Δ is $Δ\n")
            write(f, "Number of time points = $dd\n")
            write(f, "Central Value = $(yr[div(dd + 1, 2)])\n")
            write(f, "-------------------------\n")
            write(f, "The values of t,Re(g(t)),Im(g(t)) are \n\n")
            for i in 1:dd
                write(f, "$(t[i])  $(yr[i])  $(yi[i])\n")
            end
        end

        open("GT-MPI.TXT", "w") do f
            for i in 1:dd
                write(f, "$(t[i])  $(yr[i])  $(yi[i])\n")
            end
        end

        display(to)
    end
end


calc(t, b, Delta, sysize, omegaf, omegai, J, D, P)
MPI.Finalize()


However, initial benchmarks indicate that it is quite a bit slower than the Distributed.jl implementation. The first output below is from the MPI.jl implementation, the second from Distributed.jl.

anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc/version august 2024$ mpiexec --use-hwthread-cpus -n 11 julia mpi_parallel_timeloop.jl
LOOP COMPLETE
 ────────────────────────────────────────────────────────────────────────────────
                                        Time                    Allocations      
                               ───────────────────────   ────────────────────────
       Tot / % measured:             173s /  90.1%            903MiB /   4.6%    

 Section               ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────────────────
 GT Calculation             1     156s  100.0%    156s   41.1MiB  100.0%  41.1MiB
   Most Expensive Loop     19     114s   73.1%   6.01s   1.47KiB    0.0%    79.2B
     Final Multipli...   107k     114s   72.8%  1.06ms     0.00B    0.0%    0.00B
     X Y Matrices W...   107k    336ms    0.2%  3.14μs     0.00B    0.0%    0.00B
   Pre Loop Allocat...     19   43.1ms    0.0%  2.27ms   15.9MiB   38.8%   859KiB
 ────────────────────────────────────────────────────────────────────────────────
anik@anik-MSI:~/Desktop/Swapan Sir/generating-function-codes/julia-codes/isc/version august 2024$ julia -p 11 2distribute.jl 
Number of processes = 11
The rate constant is 41993.588419884196
 ───────────────────────────────────────────────────────────────────────────
                                   Time                    Allocations      
                          ───────────────────────   ────────────────────────
     Tot / % measured:          149s /  95.2%            656MiB /   7.4%    

 Section          ncalls     time    %tot     avg     alloc    %tot      avg
 ───────────────────────────────────────────────────────────────────────────
 GT Calculation        1     142s  100.0%    142s   48.3MiB  100.0%  48.3MiB
 ───────────────────────────────────────────────────────────────────────────

In the meantime I will try to get a full dd = 20001 benchmark running on my cluster to compare the results, as I have seen that relative speeds can vary substantially between my personal computer and the cluster.

As for this:

How exactly can I trigger the profiler on all processes?
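
Is it just a matter of wrapping the call in @profile on every rank and having each rank write its own report? A minimal sketch of what I have in mind (the per-rank file names and the warm-up call are my own guesses, untested):

using MPI, Profile
MPI.Init()
rank = MPI.Comm_rank(MPI.COMM_WORLD)

calc(t, b, Delta, sysize, omegaf, omegai, J, D, P)   # warm-up run so compilation does not dominate the profile

Profile.clear()
@profile calc(t, b, Delta, sysize, omegaf, omegai, J, D, P)

# each rank dumps its own flat report so the outputs do not interleave
open("profile_rank$(rank).txt", "w") do io
    Profile.print(io)
end
MPI.Finalize()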

EDIT 1 - There seems to be a problem getting MPI.jl to play nicely with my cluster; I have opened another thread about the error I am facing.