How to convert a thread-parallelized code into a multi-process (distributed) code?

This post can be thought of as a successor to an old series of posts where the motive was to improve the performance of, and eventually parallelize, a code consisting primarily of matrix operations, the end goal being to write code that runs faster than the equivalent Fortran code. We were more or less successful in that endeavor. As per the advice of the community members, I used Base.Threads to attain thread-level parallelization. Below is the current code that I am using:

using MKL
using DelimitedFiles
using LinearAlgebra
using LaTeXStrings
using TimerOutputs
using Plots
using Plots.PlotMeasures
using FFTW
using Base.Threads
using SharedArrays
using ChunkSplitters
BLAS.set_num_threads(1)

println("Number of threads = $(Threads.nthreads())")
open("ISC-CODE-VERSION-A1-THREADED.TXT", "a") do f
    write(f, "\n\n\n====================================================\n\n\n")
end

function GDSO(
    t::Float64,
    b::Float64,
    Δ::Float64,
    sysize::Int64,
    Ωf::Diagonal{Float64},
    Ωi::Diagonal{Float64},
    J::Matrix{Float64},
    D::Matrix{Float64},
    ρ::Diagonal{Float64},
    Si::Diagonal{ComplexF64},
    Sf::Diagonal{ComplexF64},
    Bi::Diagonal{ComplexF64},
    Bf::Diagonal{ComplexF64}
)
    t = t * 4.1341373335e16                     # converting time to atomic units
    tp = -im * b - t
    for i = 1:sysize
        Sf[i, i] = Ωf[i, i] / sin(Ωf[i, i] * t)
        Si[i, i] = Ωi[i, i] / sin(Ωi[i, i] * tp)
        Bf[i, i] = Ωf[i, i] / tan(Ωf[i, i] * t)
        Bi[i, i] = Ωi[i, i] / tan(Ωi[i, i] * tp)
    end
    BB = (Bf + J' * Bi * J)
    AA = (Sf + J' * Si * J)

    W = [BB -AA
        -AA BB] # We won't calculate the 2N x 2N determinant directly.

    # Wapp = BB*(BB-  AA*(BB^-1)*AA  )
    Wapp = BB * (BB - AA * (BB \ AA))
    U = Bi - Si
    V = [J' * U * D
        J' * U * D]
    # F1 = sqrt(det(Si*Sf*ρ^2*Wapp^-1))
    # F1 = sqrt(det(Si*Sf*ρ^2)/det(Wapp))
    F1 = sqrt(det(Si * Sf * (Wapp \ ρ^2)))
    F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * U * D))
    g = F1 * exp(F2)
    Δ = Δ * 0.0367493  # in atomic units
    g = g * exp(im * Δ * t)
    η = 2 * 4.55633e-6  # 10 cm^-1 to hartree energy
    #g = g * exp(-η * abs(t))     # Adding the Lorentzian damping factor
    g = g * exp(-η * t^2)     # Adding the Gaussian damping factor
    # println("t = $t was evaluated by process $(myid())")
    return g[1]
end

@views function GSV(
    t::Float64,
    b::Float64,
    sysize::Int64,
    Ωf::Diagonal{Float64},
    Ωi::Diagonal{Float64},
    J::Matrix{Float64},
    D::Matrix{Float64},
    Si::Diagonal{ComplexF64},
    Sf::Diagonal{ComplexF64},
    Bi::Diagonal{ComplexF64},
    Bf::Diagonal{ComplexF64},
    X11::Matrix{ComplexF64},
    X12::Matrix{ComplexF64},
    X21::Matrix{ComplexF64},
    X22::Matrix{ComplexF64},
    Y1::Matrix{ComplexF64},
    Y2::Matrix{ComplexF64},
    gdsopart::ComplexF64,
    T::Matrix{Float64},       ## Product of NAC and SOC
    Hso::Float64
)

    ################################## Preliminary part, common to both Franck-Condon and Herzberg-Teller ##############################
    t = t * 4.1341373335e16                     # converting time to atomic units
    tp = -im * b - t
    for i = 1:sysize
        Sf[i, i] = Ωf[i, i] / sin(Ωf[i, i] * t)
        Si[i, i] = Ωi[i, i] / sin(Ωi[i, i] * tp)
        Bf[i, i] = Ωf[i, i] / tan(Ωf[i, i] * t)
        Bi[i, i] = Ωi[i, i] / tan(Ωi[i, i] * tp)
    end
    W = [(Bf+J'*Bi*J) -(Sf + J' * Si * J)
        -(Sf + J' * Si * J) (Bf+J'*Bi*J)]
    U = Bi - Si
    V = [J' * U * D
        J' * U * D]

    ########## Defining the X and Y Matrices ##################################


    ################# Final Calculation and returning value ##################
    s = zero(ComplexF64)
    MJSJ = J' * Si * J
    MJBJ = J' * Bi * J
    MJUD = J' * U * D
    Win = inv(W)
    WinV = Win * V
    X = zeros(ComplexF64, 2 * sysize, 2 * sysize)
    Y = zeros(ComplexF64, 2 * sysize, 1)
    XWin = similar(Win)
    for m in 1:sysize
        for mp in 1:sysize
            X11[m, :] .= (MJSJ)[mp, :] .* (-Bf[m, m])
            X12[m, :] .= (MJBJ)[mp, :] .* (Bf[m, m])
            X21[m, :] .= (MJSJ)[mp, :] .* (Sf[m, m])
            X22[m, :] .= (MJBJ)[mp, :] .* (-Sf[m, m])
            #            X = [X11 X12
            #                X21 X22]
            X[1:sysize, 1:sysize] .= X11
            X[1:sysize, sysize+1:2*sysize] .= X12
            X[sysize+1:2*sysize, 1:sysize] .= X21
            X[sysize+1:2*sysize, sysize+1:2*sysize] .= X22

            Y1[m, :] .= Bf[m, m] * (MJUD)[mp, :]
            Y2[m, :] .= -Sf[m, m] * (MJUD)[mp, :]
            Y[1:sysize, 1] .= Y1
            Y[sysize+1:2*sysize, 1] .= Y2
            g = im * (tr(mul!(XWin, X, Win))  + (transpose(WinV)*X*WinV)[1] - (transpose(Y)*WinV)[1])
            s = s + g * gdsopart * T[m, mp] 
        end
        X11[m, 1:sysize] .= 0
        X12[m, 1:sysize] .= 0
        X21[m, 1:sysize] .= 0
        X22[m, 1:sysize] .= 0
        Y1[m, 1] = 0
        Y2[m, 1] = 0
    end
    s = s + abs(Hso)^2 * gdsopart   #### LATEST MODIFICATION
    return s
end

const hso_s1t3 = 1.49 * 0.0000046  ## SOC between S1 and T3 = 1.49 cm-1, converted to atomic units
const Delta = 0.11 # in eV
const Delta_s1t3 = 0.091 * 0.037 # Difference between S1 and T3 = 0.091 eV in atomic units
const dd = 11                              # Change this to modify number of points
const Temp = 300
kT_temp = Temp * 1.380649e-23  # Boltzmann constant times temperature in Joules
const kT = kT_temp * 2.29371227840e17         # in atomic units
const b = 1 / kT
const omegaf = Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n'))) * 4.55633e-6   # Final state  frequencies in atomic units
const omegai = Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n'))) * 4.55633e-6  # Initial state  frequencies in atomic units
const sysize = size(omegai)[1]
const P = @. 2 * sinh(b * omegai / 2)
T_temp = readdlm("nac.txt", '\t', Float64, '\n') # NAC matrix
const T = (T_temp * T_temp')* (hso_s1t3^2/Delta_s1t3^2)
const D = readdlm("d.txt", Float64) # Displacement Matrix
const J = readdlm("j.txt", Float64) # Duschinsky Matrix
const t = collect(range(-5e-12, stop=5e-12, length=dd))
t[div(dd + 1, 2)] = 10e-25

function calc(t, b, Δ, sysize, Ωf, Ωi, J, D, P; nchunks=Threads.nthreads())
    # @sync tells Julia to wait for any tasks started inside it
    open("ISC-CODE-VERSION-A1-THREADED.TXT", "a") do f
        write(f, "STARTING LOOP WITH NUMBER OF THREADS = $(Threads.nthreads()) \n====================================================\n")
    end
    x = zeros(ComplexF64, dd)
    @time @sync for (xrange, ichunk) in chunks(range(1, dd), nchunks)
        # xrange are the indices this chunk contains, ichunk is just the index of the chunk, which we don't need in this scenario
        Threads.@spawn begin # this line tells Julia to start a new thread with the contents of the begin block
            Sf = Diagonal(zeros(ComplexF64, sysize))
            Si = Diagonal(zeros(ComplexF64, sysize))
            Bf = Diagonal(zeros(ComplexF64, sysize))
            Bi = Diagonal(zeros(ComplexF64, sysize))
            X11 = zeros(ComplexF64, sysize, sysize)
            X12 = zeros(ComplexF64, sysize, sysize)
            X21 = zeros(ComplexF64, sysize, sysize)
            X22 = zeros(ComplexF64, sysize, sysize)
            Y1 = zeros(ComplexF64, sysize, 1)
            Y2 = zeros(ComplexF64, sysize, 1)
            Hso = 0.48 * 0.0000046  ## SOC between S1 and T2 = 0.48 cm-1, converting to atomic units
            for i in xrange
                gdsopart = GDSO(t[i], b, Δ, sysize, Ωf, Ωi, J, D, P, Si, Sf, Bi, Bf)
                gtotal = GSV(t[i], b, sysize, Ωf, Ωi, J, D, Si, Sf, Bi, Bf, X11, X12, X21, X22, Y1, Y2, gdsopart, T,Hso)
                x[i] = gtotal
                open("ISC-CODE-VERSION-A1-THREADED.TXT", "a") do f
                    write(f, "ITERATION $i COMPLETED BY THREAD $(Threads.threadid()) \n")
                end
            end # loop
        end # thread work load
    end # loop over chunks, Julia will wait until all task are completed
    open("ISC-CODE-VERSION-A1-THREADED.TXT", "a") do f
        write(f, "LOOP COMPLETE \n====================================================")
    end
    yr = real.(x)
    yi = imag.(x)
    p = plot(t, yr, linewidth=2, color="blue", title="Δ = $Δ", label="Real part", grid=false)
    p = plot!(t, yi, linewidth=2, color="red", label="Imaginary part", size=(1000, 800), xlims=(-30e-15, 30e-15))
    savefig(p, "gt.svg")
    begin # This part does FFTW and finds final rate constant

        fs = abs(t[1] - t[2])^-1
        signal = yr
        p = plan_fft(signal)
        F = fftshift(p * signal)
        freqs = fftshift(fftfreq(length(t), fs))  # X axis has frequencies in s^-1
        # Convert freqs to eV
        freqs = freqs * 4.13558e-15
        F = F * 6.57e15
        time_domain = plot(t, signal, title=L"G(t)", legend=false, color="orange", xlabel=L"\mathrm{Time}\ (s)", ylabel=L"\mathrm{A.u}")
        freq_domain = plot(freqs, abs.(F), title=L"\mathrm{Fourier\ transformed\ g(\bar{\nu}))}", legend=false, xlabel=L"\mathrm{Energy\ (eV)}", ylabel=L"\mathrm{k_{IC}\ (s^{-1})}", xlims=(0, 4))
        p = plot(time_domain, freq_domain, layout=(2, 1), size=(1000, 800), right_margin=10mm)
        savefig(p, "gt-fft.svg")
        ee = Δ# * 8065.73  #Converting eV to cm^-1
        index = findmin(abs.(freqs .- ee))[2]  # Find the index of freq closest to ee
        freqs[index]
        println("The rate constant is ", abs.(F)[index])
        rc = abs.(F)[index]
    end
#################################################This part writes the output to a file#######################################################
    begin 
        open("ISC-CODE-VERSION-A1-THREADED.TXT", "a") do f
            write(f, "-------------------------\n")
            write(f, "NUMBER OF CORES = $(Threads.nthreads())\n")
            write(f, "The value of Δ is $Δ\n")
            write(f, "Number of time points = $dd\n")
            write(f, "The rate constant calculated from Julia FFTW = $rc\n")
            write(f, "-------------------------\n")
            write(f, "The values of t,Re(g(t)),Im(g(t)) are \n\n")
            for i in 1:dd
                write(f, "$(t[i])  $(yr[i])  $(yi[i])\n")
            end
        end

        open("GT-VALUES-A1.txt", "w") do f
            for i in 1:dd
                write(f, "$(t[i])  $(yr[i])  $(yi[i])\n")
            end
        end

    end

end


timeneed = @elapsed calc(t, b, Delta, sysize, omegaf, omegai, J, D, P; nchunks=Threads.nthreads())
open("ISC-CODE-VERSION-A1-THREADED.TXT", "a") do f
   write(f, "\n\n\n===============================================================\nThe time elapsed for main loop [INCLUDING GC + C + RC]=  $timeneed seconds \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
end

The above code served its purpose, but now I am faced with the challenge of tackling systems where the variable sysize becomes very large, causing the nested loop within the function GSV to become extremely time-consuming, to the point where I would now benefit from using multiple CPUs, each with 40 cores. To the best of my knowledge, the Distributed package can be used to achieve this. In this code, as you can see, variables like X11, X12, etc. are used as workspaces, so I have provided each thread with its own copy of them and have told each thread to work on only a part of the array t using ChunkSplitters.jl.

Now, how do I convert this code into one that uses Distributed.jl instead and can thus be scaled up to multiple CPUs? Which variables do I need to declare with @everywhere? Are there better alternatives to Distributed.jl? And most importantly, how do I rewrite the part that uses the Threads.@spawn macro, which supplies one copy of the workspace variables to each thread, in the Distributed.jl setting? I am a complete novice when it comes to parallel computing and programming in general, so any help is highly appreciated! If anyone wants to run the current working code (attached above) on their system, you may use the sample data files given in this Google Drive link.

There are some tutorials online:

https://enccs.github.io/julia-for-hpc/distributed/

Start by understanding those before applying them to your problem.
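
The core pattern those tutorials build up to looks roughly like this (a minimal sketch with a placeholder work function, not adapted to your code):

using Distributed
addprocs(4)                            # local workers; on a cluster these come from the scheduler

@everywhere function slow_work(i)      # the function must be defined on every worker
    # stand-in for an expensive, independent computation
    sum(abs2, sin.((1:10_000) .* i))
end

results = pmap(slow_work, 1:100)       # iterations are farmed out to the workers

Once that pattern is clear, the main design question for your code is what each worker needs to have defined locally (functions, constants, workspaces) versus what gets sent to it per call.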


Thank you for the resources. I tried going through them to the best of my ability. Fortunately, the only parallelization I need is to calculate the functions GSV and GDSO for different parameters independently of each other, so as far as I understand it, the data-movement issue does not arise. The first thing I have done is to remove the Base.Threads implementation entirely, giving the raw serial code below, which runs and produces the expected results:

using Distributed
using MKL
using DelimitedFiles
using LinearAlgebra
using LaTeXStrings
using TimerOutputs
using Plots
using Plots.PlotMeasures
using FFTW
using SharedArrays
using ChunkSplitters
BLAS.set_num_threads(1)

println("Number of processes = $(Distributed.nprocs())")
open("ISC-CODE-VERSION-A1-DISTRIBUTED.TXT", "a") do f
    write(f, "\n\n\n====================================================\n\n\n")
end

function GDSO(
    t::Float64,
    b::Float64,
    Δ::Float64,
    sysize::Int64,
    Ωf::Diagonal{Float64},
    Ωi::Diagonal{Float64},
    J::Matrix{Float64},
    D::Matrix{Float64},
    ρ::Diagonal{Float64},
    Si::Diagonal{ComplexF64},
    Sf::Diagonal{ComplexF64},
    Bi::Diagonal{ComplexF64},
    Bf::Diagonal{ComplexF64}
)
    t = t * 4.1341373335e16                     # converting time to atomic units
    tp = -im * b - t
    for i = 1:sysize
        Sf[i, i] = Ωf[i, i] / sin(Ωf[i, i] * t)
        Si[i, i] = Ωi[i, i] / sin(Ωi[i, i] * tp)
        Bf[i, i] = Ωf[i, i] / tan(Ωf[i, i] * t)
        Bi[i, i] = Ωi[i, i] / tan(Ωi[i, i] * tp)
    end
    BB = (Bf + J' * Bi * J)
    AA = (Sf + J' * Si * J)

    W = [BB -AA
        -AA BB] # We won't calculate the 2N x 2N determinant directly.

    # Wapp = BB*(BB-  AA*(BB^-1)*AA  )
    Wapp = BB * (BB - AA * (BB \ AA))
    U = Bi - Si
    V = [J' * U * D
        J' * U * D]
    # F1 = sqrt(det(Si*Sf*ρ^2*Wapp^-1))
    # F1 = sqrt(det(Si*Sf*ρ^2)/det(Wapp))
    F1 = sqrt(det(Si * Sf * (Wapp \ ρ^2)))
    F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * U * D))
    g = F1 * exp(F2)
    Δ = Δ * 0.0367493  # in atomic units
    g = g * exp(im * Δ * t)
    η = 2 * 4.55633e-6  # 10 cm^-1 to hartree energy
    #g = g * exp(-η * abs(t))     # Adding the Lorentzian damping factor
    g = g * exp(-η * t^2)     # Adding the Gaussian damping factor
    # println("t = $t was evaluated by process $(myid())")
    return g[1]
end

@views function GSV(
    t::Float64,
    b::Float64,
    sysize::Int64,
    Ωf::Diagonal{Float64},
    Ωi::Diagonal{Float64},
    J::Matrix{Float64},
    D::Matrix{Float64},
    Si::Diagonal{ComplexF64},
    Sf::Diagonal{ComplexF64},
    Bi::Diagonal{ComplexF64},
    Bf::Diagonal{ComplexF64},
    X11::Matrix{ComplexF64},
    X12::Matrix{ComplexF64},
    X21::Matrix{ComplexF64},
    X22::Matrix{ComplexF64},
    Y1::Matrix{ComplexF64},
    Y2::Matrix{ComplexF64},
    gdsopart::ComplexF64,
    T::Matrix{Float64},       ## Product of NAC and SOC
    Hso::Float64
)

    ################################## Preliminary part, common to both Franck-Condon and Herzberg-Teller ##############################
    t = t * 4.1341373335e16                     # converting time to atomic units
    tp = -im * b - t
    for i = 1:sysize
        Sf[i, i] = Ωf[i, i] / sin(Ωf[i, i] * t)
        Si[i, i] = Ωi[i, i] / sin(Ωi[i, i] * tp)
        Bf[i, i] = Ωf[i, i] / tan(Ωf[i, i] * t)
        Bi[i, i] = Ωi[i, i] / tan(Ωi[i, i] * tp)
    end
    W = [(Bf+J'*Bi*J) -(Sf + J' * Si * J)
        -(Sf + J' * Si * J) (Bf+J'*Bi*J)]
    U = Bi - Si
    V = [J' * U * D
        J' * U * D]

    ########## Defining the X and Y Matrices ##################################


    ################# Final Calculation and returning value ##################
    s = zero(ComplexF64)
    MJSJ = J' * Si * J
    MJBJ = J' * Bi * J
    MJUD = J' * U * D
    Win = inv(W)
    WinV = Win * V
    X = zeros(ComplexF64, 2 * sysize, 2 * sysize)
    Y = zeros(ComplexF64, 2 * sysize, 1)
    XWin = similar(Win)
    for m in 1:sysize
        for mp in 1:sysize
            X11[m, :] .= (MJSJ)[mp, :] .* (-Bf[m, m])
            X12[m, :] .= (MJBJ)[mp, :] .* (Bf[m, m])
            X21[m, :] .= (MJSJ)[mp, :] .* (Sf[m, m])
            X22[m, :] .= (MJBJ)[mp, :] .* (-Sf[m, m])
            #            X = [X11 X12
            #                X21 X22]
            X[1:sysize, 1:sysize] .= X11
            X[1:sysize, sysize+1:2*sysize] .= X12
            X[sysize+1:2*sysize, 1:sysize] .= X21
            X[sysize+1:2*sysize, sysize+1:2*sysize] .= X22

            Y1[m, :] .= Bf[m, m] * (MJUD)[mp, :]
            Y2[m, :] .= -Sf[m, m] * (MJUD)[mp, :]
            Y[1:sysize, 1] .= Y1
            Y[sysize+1:2*sysize, 1] .= Y2
            g = im * (tr(mul!(XWin, X, Win))  + (transpose(WinV)*X*WinV)[1] - (transpose(Y)*WinV)[1])
            s = s + g * gdsopart * T[m, mp] 
        end
        X11[m, 1:sysize] .= 0
        X12[m, 1:sysize] .= 0
        X21[m, 1:sysize] .= 0
        X22[m, 1:sysize] .= 0
        Y1[m, 1] = 0
        Y2[m, 1] = 0
    end
    s = s + abs(Hso)^2 * gdsopart   #### LATEST MODIFICATION
    return s
end

const hso_s1t3 = 1.49 * 0.0000046  ## SOC between S1 and T3 = 1.49 cm-1, converted to atomic units
const Delta = 0.11 # in eV
const Delta_s1t3 = 0.091 * 0.037 # Difference between S1 and T3 = 0.091 eV in atomic units
const dd = 11                              # Change this to modify number of points
const Temp = 300
kT_temp = Temp * 1.380649e-23  # Boltzmann constant times temperature in Joules
const kT = kT_temp * 2.29371227840e17         # in atomic units
const b = 1 / kT
const omegaf = Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n'))) * 4.55633e-6   # Final state  frequencies in atomic units
const omegai = Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n'))) * 4.55633e-6  # Initial state  frequencies in atomic units
const sysize = size(omegai)[1]
const P = @. 2 * sinh(b * omegai / 2)
T_temp = readdlm("nac.txt", '\t', Float64, '\n') # NAC matrix
const T = (T_temp * T_temp')* (hso_s1t3^2/Delta_s1t3^2)
const D = readdlm("d.txt", Float64) # Displacement Matrix
const J = readdlm("j.txt", Float64) # Duschinsky Matrix
const t = collect(range(-5e-12, stop=5e-12, length=dd))
t[div(dd + 1, 2)] = 10e-25

function calc(t, b, Δ, sysize, Ωf, Ωi, J, D, P)
    open("ISC-CODE-VERSION-A1-DISTRIBUTED.TXT", "a") do f
        write(f, "STARTING LOOP WITH NUMBER OF PROCESSES = $(Distributed.nprocs()) \n====================================================\n")
    end
    x = zeros(ComplexF64, dd)
    Sf = Diagonal(zeros(ComplexF64, sysize))
    Si = Diagonal(zeros(ComplexF64, sysize))
    Bf = Diagonal(zeros(ComplexF64, sysize))
    Bi = Diagonal(zeros(ComplexF64, sysize))
    X11 = zeros(ComplexF64, sysize, sysize)
    X12 = zeros(ComplexF64, sysize, sysize)
    X21 = zeros(ComplexF64, sysize, sysize)
    X22 = zeros(ComplexF64, sysize, sysize)
    Y1 = zeros(ComplexF64, sysize, 1)
    Y2 = zeros(ComplexF64, sysize, 1)
    Hso = 0.48 * 0.0000046  ## SOC between S1 and T2 = 0.48 cm-1, converting to atomic units
    for i in 1:dd
        gdsopart = GDSO(t[i], b, Δ, sysize, Ωf, Ωi, J, D, P, Si, Sf, Bi, Bf)
        gtotal = GSV(t[i], b, sysize, Ωf, Ωi, J, D, Si, Sf, Bi, Bf, X11, X12, X21, X22, Y1, Y2, gdsopart, T,Hso)
        x[i] = gtotal
        open("ISC-CODE-VERSION-A1-DISTRIBUTED.TXT", "a") do f
            write(f, "ITERATION $i COMPLETED BY PROCESS $(Distributed.myid()) \n")
        end
    end
    open("ISC-CODE-VERSION-A1-DISTRIBUTED.TXT", "a") do f
        write(f, "LOOP COMPLETE \n====================================================")
    end
    yr = real.(x)
    yi = imag.(x)
    p = plot(t, yr, linewidth=2, color="blue", title="Δ = $Δ", label="Real part", grid=false)
    p = plot!(t, yi, linewidth=2, color="red", label="Imaginary part", size=(1000, 800), xlims=(-30e-15, 30e-15))
    savefig(p, "gt.svg")
    begin # This part does FFTW and finds final rate constant

        fs = abs(t[1] - t[2])^-1
        signal = yr
        p = plan_fft(signal)
        F = fftshift(p * signal)
        freqs = fftshift(fftfreq(length(t), fs))  # X axis has frequencies in s^-1
        # Convert freqs to eV
        freqs = freqs * 4.13558e-15
        F = F * 6.57e15
        time_domain = plot(t, signal, title=L"G(t)", legend=false, color="orange", xlabel=L"\mathrm{Time}\ (s)", ylabel=L"\mathrm{A.u}")
        freq_domain = plot(freqs, abs.(F), title=L"\mathrm{Fourier\ transformed\ g(\bar{\nu}))}", legend=false, xlabel=L"\mathrm{Energy\ (eV)}", ylabel=L"\mathrm{k_{IC}\ (s^{-1})}", xlims=(0, 4))
        p = plot(time_domain, freq_domain, layout=(2, 1), size=(1000, 800), right_margin=10mm)
        savefig(p, "gt-fft.svg")
        ee = Δ# * 8065.73  #Converting eV to cm^-1
        index = findmin(abs.(freqs .- ee))[2]  # Find the index of freq closest to ee
        freqs[index]
        println("The rate constant is ", abs.(F)[index])
        rc = abs.(F)[index]
    end
#################################################This part writes the output to a file#######################################################
    begin 
        open("ISC-CODE-VERSION-A1-DISTRIBUTED.TXT", "a") do f
            write(f, "-------------------------\n")
            write(f, "NUMBER OF CORES = $(Distributed.nprocs())\n")
            write(f, "The value of Δ is $Δ\n")
            write(f, "Number of time points = $dd\n")
            write(f, "The rate constant calculated from Julia FFTW = $rc\n")
            write(f, "-------------------------\n")
            write(f, "The values of t,Re(g(t)),Im(g(t)) are \n\n")
            for i in 1:dd
                write(f, "$(t[i])  $(yr[i])  $(yi[i])\n")
            end
        end

        open("GT-VALUES-A1.txt", "w") do f
            for i in 1:dd
                write(f, "$(t[i])  $(yr[i])  $(yi[i])\n")
            end
        end

    end

end


timeneed = @elapsed calc(t, b, Delta, sysize, omegaf, omegai, J, D, P)
open("ISC-CODE-VERSION-A1-DISTRIBUTED.TXT", "a") do f
   write(f, "\n\n\n===============================================================\nThe time elapsed for main loop [INCLUDING COMPILATION]=  $timeneed seconds \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
end

Now, as per the tutorials, two of the methods from Distributed.jl seem most relevant here: the @distributed macro and the pmap function. However, it has been stated that

pmap is better when each iteration takes time and @distributed is better when each iteration is very fast

Here I am faced with the former situation, as the computation of GSV is the most time-consuming part. But the pmap function only seems to support calls like

x=pmap(f,range(..))

where f must be a function of a single variable i, and the above line assigns x[i] = f(i) in a distributed way by automatically sending chunks of the range to the different workers (to the best of my interpretation of the tutorials). However, as you can see from the code, the part I need to parallelize requires the computation of

gdsopart = GDSO(t[i], b, Δ, sysize, Ωf, Ωi, J, D, P, Si, Sf, Bi, Bf)
gtotal = GSV(t[i], b, sysize, Ωf, Ωi, J, D, Si, Sf, Bi, Bf, X11, X12, X21, X22, Y1, Y2, gdsopart, T,Hso)
x[i] = gtotal

which requires calling functions that seem incompatible with the signature pmap supports. One thing I thought of proceeds as below:

using Distributed

@everywhere begin
    # Assuming all the required variables and functions are defined
    function compute_gtotal(i)
        gdsopart = GDSO(t[i], b, Δ, sysize, Ωf, Ωi, J, D, P, Si, Sf, Bi, Bf)
        gtotal = GSV(t[i], b, sysize, Ωf, Ωi, J, D, Si, Sf, Bi, Bf, X11, X12, X21, X22, Y1, Y2, gdsopart, T,Hso)
        open("ISC-CODE-VERSION-A1-DISTRIBUTED.TXT", "a") do f
            write(f, "ITERATION $i COMPLETED BY PROCESS $(Distributed.myid()) \n")
        end
        return gtotal
    end
end

x = pmap(compute_gtotal, 1:dd)

where I create a new function compute_gtotal of the appropriate form to be used with pmap, and place the calls to GDSO and GSV, as well as the writing to text files, inside it. But wouldn't this decrease the performance significantly, since it needs access to variables like Ωf etc., which would then be accessed as global variables? (They cannot be passed into compute_gtotal as parameters, since that would again make the function incompatible with pmap.)
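
That said, while writing this I realise pmap also accepts anonymous functions and do-blocks, so the parameters could perhaps be captured in a closure instead of being looked up as globals. A rough, untested sketch of what I mean, placed inside calc where those parameters are in scope (and assuming the packages, GDSO and GSV have already been made available on the workers with @everywhere):

x = pmap(1:dd) do i
    # workspaces allocated per call, so nothing is shared between workers
    Sf = Diagonal(zeros(ComplexF64, sysize))
    Si = Diagonal(zeros(ComplexF64, sysize))
    Bf = Diagonal(zeros(ComplexF64, sysize))
    Bi = Diagonal(zeros(ComplexF64, sysize))
    X11 = zeros(ComplexF64, sysize, sysize)
    X12 = zeros(ComplexF64, sysize, sysize)
    X21 = zeros(ComplexF64, sysize, sysize)
    X22 = zeros(ComplexF64, sysize, sysize)
    Y1 = zeros(ComplexF64, sysize, 1)
    Y2 = zeros(ComplexF64, sysize, 1)
    Hso = 0.48 * 0.0000046   # SOC between S1 and T2 = 0.48 cm-1, in atomic units
    # the locals t, b, Δ, sysize, Ωf, Ωi, J, D and P (the arguments of calc) are
    # captured by the closure and serialized to the worker; globals such as T
    # would still need to be defined @everywhere
    gdsopart = GDSO(t[i], b, Δ, sysize, Ωf, Ωi, J, D, P, Si, Sf, Bi, Bf)
    GSV(t[i], b, sysize, Ωf, Ωi, J, D, Si, Sf, Bi, Bf,
        X11, X12, X21, X22, Y1, Y2, gdsopart, T, Hso)
end

I do not know whether the repeated serialization of J, D, etc. for every task would cost more than the per-call allocations save, so this is only an idea.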

So, what can be done to achieve the required level of parallelization, keeping in mind that the end goal is to run the code on large servers utilizing multiple CPUs? (I understand this might eventually require getting familiar with things like ClusterManagers.jl, but as far as I understand it, that comes after a successful implementation of the Distributed.jl elements in the code.) Thank you for your patience.

Working upon my previous reply, I have tried to implement pmap and have successfully gotten another working version of the code, given below:

using Distributed
@everywhere begin
    using MKL
    using DelimitedFiles
    using LinearAlgebra
    using LaTeXStrings
    using TimerOutputs
    using Plots
    using Plots.PlotMeasures
    using FFTW
    BLAS.set_num_threads(1)
end

println("Number of processes = $(Distributed.nprocs())")
open("ISC-CODE-VERSION-A1-DISTRIBUTED.TXT", "a") do f
    write(f, "\n\n\n====================================================\n\n\n")
end





@everywhere begin
    const hso_s1t3 = 1.49 * 0.0000046  ## SOC between S1 and T3 = 1.49 cm-1, converted to atomic units
    const Delta = 0.11 # in eV
    const Delta_s1t3 = 0.091 * 0.037 # Difference between S1 and T3 = 0.091 eV in atomic units
    const dd = 11                              # Change this to modify number of points
    const Temp = 300
    kT_temp = Temp * 1.380649e-23  # Boltzmann constant times temperature in Joules
    const kT = kT_temp * 2.29371227840e17         # in atomic units
    const b = 1 / kT
    const omegaf = Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n'))) * 4.55633e-6   # Final state  frequencies in atomic units
    const omegai = Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n'))) * 4.55633e-6  # Initial state  frequencies in atomic units
    const sysize = size(omegai)[1]
    const P = @. 2 * sinh(b * omegai / 2)
    T_temp = readdlm("nac.txt", '\t', Float64, '\n') # NAC matrix
    const T = (T_temp * T_temp')* (hso_s1t3^2/Delta_s1t3^2)
    const D = readdlm("d.txt", Float64) # Displacement Matrix
    const J = readdlm("j.txt", Float64) # Duschinsky Matrix
    const t = collect(range(-5e-12, stop=5e-12, length=dd))
    t[div(dd + 1, 2)] = 10e-25
    const Si = Diagonal(zeros(ComplexF64, sysize))
    const Sf = Diagonal(zeros(ComplexF64, sysize))
    const Bf = Diagonal(zeros(ComplexF64, sysize))
    const Bi = Diagonal(zeros(ComplexF64, sysize))
    const X11 = zeros(ComplexF64, sysize, sysize)
    const X12 = zeros(ComplexF64, sysize, sysize)
    const X21 = zeros(ComplexF64, sysize, sysize)
    const X22 = zeros(ComplexF64, sysize, sysize)
    const Y1 = zeros(ComplexF64, sysize, 1)
    const Y2 = zeros(ComplexF64, sysize, 1)
    const Hso = 0.48 * 0.0000046  ## SOC between S1 and T2 = 0.48 cm-1, converting to atomic units

    function GDSO(
        t::Float64,
        b::Float64,
        Delta::Float64,
        sysize::Int64,
        omegaf::Diagonal{Float64},
        omegai::Diagonal{Float64},
        J::Matrix{Float64},
        D::Matrix{Float64},
        ρ::Diagonal{Float64},
        Si::Diagonal{ComplexF64},
        Sf::Diagonal{ComplexF64},
        Bi::Diagonal{ComplexF64},
        Bf::Diagonal{ComplexF64}
    )
        t = t * 4.1341373335e16                     # converting time to atomic units
        tp = -im * b - t
        for i = 1:sysize
            Sf[i, i] = omegaf[i, i] / sin(omegaf[i, i] * t)
            Si[i, i] = omegai[i, i] / sin(omegai[i, i] * tp)
            Bf[i, i] = omegaf[i, i] / tan(omegaf[i, i] * t)
            Bi[i, i] = omegai[i, i] / tan(omegai[i, i] * tp)
        end
        BB = (Bf + J' * Bi * J)
        AA = (Sf + J' * Si * J)
    
        W = [BB -AA
            -AA BB] # We won't calculate the 2N x 2N determinant directly.
    
        # Wapp = BB*(BB-  AA*(BB^-1)*AA  )
        Wapp = BB * (BB - AA * (BB \ AA))
        U = Bi - Si
        V = [J' * U * D
            J' * U * D]
        # F1 = sqrt(det(Si*Sf*ρ^2*Wapp^-1))
        # F1 = sqrt(det(Si*Sf*ρ^2)/det(Wapp))
        F1 = sqrt(det(Si * Sf * (Wapp \ ρ^2)))
        F2 = (-(im / 2) * (transpose(V) * (W \ V))) + ((im) * (transpose(D) * U * D))
        g = F1 * exp(F2)
        Delta = Delta * 0.0367493  # in atomic units
        g = g * exp(im * Delta * t)
        η = 2 * 4.55633e-6  # 10 cm^-1 to hartree energy
        #g = g * exp(-η * abs(t))     # Adding the Lorentzian damping factor
        g = g * exp(-η * t^2)     # Adding the Gaussian damping factor
        # println("t = $t was evaluated by process $(myid())")
        return g[1]
    end
    
    @views function GSV(
        t::Float64,
        b::Float64,
        sysize::Int64,
        omegaf::Diagonal{Float64},
        omegai::Diagonal{Float64},
        J::Matrix{Float64},
        D::Matrix{Float64},
        Si::Diagonal{ComplexF64},
        Sf::Diagonal{ComplexF64},
        Bi::Diagonal{ComplexF64},
        Bf::Diagonal{ComplexF64},
        X11::Matrix{ComplexF64},
        X12::Matrix{ComplexF64},
        X21::Matrix{ComplexF64},
        X22::Matrix{ComplexF64},
        Y1::Matrix{ComplexF64},
        Y2::Matrix{ComplexF64},
        gdsopart::ComplexF64,
        T::Matrix{Float64},       ## Product of NAC and SOC
        Hso::Float64
    )
    
        ################################## Preliminary part, common to both Franck-Condon and Herzberg-Teller ##############################
        t = t * 4.1341373335e16                     # converting time to atomic units
        tp = -im * b - t
        for i = 1:sysize
            Sf[i, i] = omegaf[i, i] / sin(omegaf[i, i] * t)
            Si[i, i] = omegai[i, i] / sin(omegai[i, i] * tp)
            Bf[i, i] = omegaf[i, i] / tan(omegaf[i, i] * t)
            Bi[i, i] = omegai[i, i] / tan(omegai[i, i] * tp)
        end
        W = [(Bf+J'*Bi*J) -(Sf + J' * Si * J)
            -(Sf + J' * Si * J) (Bf+J'*Bi*J)]
        U = Bi - Si
        V = [J' * U * D
            J' * U * D]
    
        ########## Defining the X and Y Matrices ##################################
    
    
        ################# Final Calculation and returning value ##################
        s = zero(ComplexF64)
        MJSJ = J' * Si * J
        MJBJ = J' * Bi * J
        MJUD = J' * U * D
        Win = inv(W)
        WinV = Win * V
        X = zeros(ComplexF64, 2 * sysize, 2 * sysize)
        Y = zeros(ComplexF64, 2 * sysize, 1)
        XWin = similar(Win)
        for m in 1:sysize
            for mp in 1:sysize
                X11[m, :] .= (MJSJ)[mp, :] .* (-Bf[m, m])
                X12[m, :] .= (MJBJ)[mp, :] .* (Bf[m, m])
                X21[m, :] .= (MJSJ)[mp, :] .* (Sf[m, m])
                X22[m, :] .= (MJBJ)[mp, :] .* (-Sf[m, m])
                #            X = [X11 X12
                #                X21 X22]
                X[1:sysize, 1:sysize] .= X11
                X[1:sysize, sysize+1:2*sysize] .= X12
                X[sysize+1:2*sysize, 1:sysize] .= X21
                X[sysize+1:2*sysize, sysize+1:2*sysize] .= X22
    
                Y1[m, :] .= Bf[m, m] * (MJUD)[mp, :]
                Y2[m, :] .= -Sf[m, m] * (MJUD)[mp, :]
                Y[1:sysize, 1] .= Y1
                Y[sysize+1:2*sysize, 1] .= Y2
                g = im * (tr(mul!(XWin, X, Win))  + (transpose(WinV)*X*WinV)[1] - (transpose(Y)*WinV)[1])
                s = s + g * gdsopart * T[m, mp] 
            end
            X11[m, 1:sysize] .= 0
            X12[m, 1:sysize] .= 0
            X21[m, 1:sysize] .= 0
            X22[m, 1:sysize] .= 0
            Y1[m, 1] = 0
            Y2[m, 1] = 0
        end
        s = s + abs(Hso)^2 * gdsopart   #### LATEST MODIFICATION
        return s
    end


    function compute_gtotal(i)
        gdsopart = GDSO(t[i], b, Delta, sysize, omegaf, omegai, J, D, P, Si, Sf, Bi, Bf)
        gtotal = GSV(t[i], b, sysize, omegaf, omegai, J, D, Si, Sf, Bi, Bf, X11, X12, X21, X22, Y1, Y2, gdsopart, T,Hso)
        open("ISC-CODE-VERSION-A1-DISTRIBUTED.TXT", "a") do f
            write(f, "ITERATION $i COMPLETED BY PROCESS $(Distributed.myid()) \n")
        end
        return gtotal
    end
end

function calc(t, b, Delta, sysize, omegaf, omegai, J, D, P)
    open("ISC-CODE-VERSION-A1-DISTRIBUTED.TXT", "a") do f
        write(f, "STARTING LOOP WITH NUMBER OF PROCESSES = $(Distributed.nprocs()) \n====================================================\n")
    end
    x = zeros(ComplexF64, dd)
    @time x = pmap(compute_gtotal, 1:dd)
    open("ISC-CODE-VERSION-A1-DISTRIBUTED.TXT", "a") do f
        write(f, "LOOP COMPLETE \n====================================================")
    end
    yr = real.(x)
    yi = imag.(x)
    p = plot(t, yr, linewidth=2, color="blue", title="Delta = $Delta", label="Real part", grid=false)
    p = plot!(t, yi, linewidth=2, color="red", label="Imaginary part", size=(1000, 800), xlims=(-30e-15, 30e-15))
    savefig(p, "gt.svg")
    begin # This part does FFTW and finds final rate constant

        fs = abs(t[1] - t[2])^-1
        signal = yr
        p = plan_fft(signal)
        F = fftshift(p * signal)
        freqs = fftshift(fftfreq(length(t), fs))  # X axis has frequencies in s^-1
        # Convert freqs to eV
        freqs = freqs * 4.13558e-15
        F = F * 6.57e15
        time_domain = plot(t, signal, title=L"G(t)", legend=false, color="orange", xlabel=L"\mathrm{Time}\ (s)", ylabel=L"\mathrm{A.u}")
        freq_domain = plot(freqs, abs.(F), title=L"\mathrm{Fourier\ transformed\ g(\bar{\nu}))}", legend=false, xlabel=L"\mathrm{Energy\ (eV)}", ylabel=L"\mathrm{k_{IC}\ (s^{-1})}", xlims=(0, 4))
        p = plot(time_domain, freq_domain, layout=(2, 1), size=(1000, 800), right_margin=10mm)
        savefig(p, "gt-fft.svg")
        ee = Delta# * 8065.73  #Converting eV to cm^-1
        index = findmin(abs.(freqs .- ee))[2]  # Find the index of freq closest to ee
        freqs[index]
        println("The rate constant is ", abs.(F)[index])
        rc = abs.(F)[index]
    end
#################################################This part writes the output to a file#######################################################
    begin 
        open("ISC-CODE-VERSION-A1-DISTRIBUTED.TXT", "a") do f
            write(f, "-------------------------\n")
            write(f, "NUMBER OF CORES = $(Distributed.nprocs())\n")
            write(f, "The value of Delta is $Delta\n")
            write(f, "Number of time points = $dd\n")
            write(f, "The rate constant calculated from Julia FFTW = $rc\n")
            write(f, "-------------------------\n")
            write(f, "The values of t,Re(g(t)),Im(g(t)) are \n\n")
            for i in 1:dd
                write(f, "$(t[i])  $(yr[i])  $(yi[i])\n")
            end
        end

        open("GT-VALUES-A1.txt", "w") do f
            for i in 1:dd
                write(f, "$(t[i])  $(yr[i])  $(yi[i])\n")
            end
        end

    end

end


timeneed = @elapsed calc(t, b, Delta, sysize, omegaf, omegai, J, D, P)
open("ISC-CODE-VERSION-A1-DISTRIBUTED.TXT", "a") do f
   write(f, "\n\n\n===============================================================\nThe time elapsed for main loop [INCLUDING COMPILATION]=  $timeneed seconds \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
end

As you can see, I had to declare the common workspaces as const inside the @everywhere block to make sure every process has access to its own copy of them. The code runs and produces the expected output:

Number of processes = 12
 15.994364 seconds (504.77 k allocations: 34.101 MiB, 3.17% compilation time)
GKS: Possible loss of precision in routine SET_WINDOW
The rate constant is 42009.18966679346

This is a bit slower than the version using Base.Threads, which gave:

Number of threads = 11
 10.459924 seconds (8.79 M allocations: 791.078 MiB, 1.51% gc time, 498.83% compilation time)
GKS: Possible loss of precision in routine SET_WINDOW
The rate constant is 42009.18966679346

As you can see, the Distributed.jl version is quite a bit slower; I am guessing this is because the workspace variables are declared in global scope. All that said, I now have two main questions:

  1. How do I improve the performance of the Distributed.jl version to get it closer to (or even faster than) the threaded version, and is that even possible to attain?

  2. Given that I now have a working version of the code that uses Distributed.jl instead of threads (although not an ideally fast one), what should I do to run it on our HPC cluster, which uses

Slurm-22.05.09 (open source) as a workload manager

I have not really worked with such clusters directly before, so I am unsure how to use Slurm together with the Distributed.jl version of my code to attain multi-CPU parallelization. Any help or links to resources that tackle this issue are greatly appreciated!
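
From what I have gathered so far, ClusterManagers.jl provides a SlurmManager that can request the worker processes through Slurm from inside the job, so I imagine the top of the script would change roughly as follows once it is launched from an sbatch allocation (an untested sketch; the resource numbers and the use of SLURM_NTASKS are my own assumptions):

# Submitted e.g. with: sbatch --nodes=2 --ntasks=80 --cpus-per-task=1 job.sh,
# where job.sh simply runs `julia this_script.jl`
using Distributed
using ClusterManagers

# one Julia worker per Slurm task in the current allocation
addprocs(SlurmManager(parse(Int, ENV["SLURM_NTASKS"])))

@everywhere begin
    # ... the same block as above: packages, constants, GDSO, GSV, compute_gtotal
end

# pmap then spreads the iterations over workers on all allocated nodes
# x = pmap(compute_gtotal, 1:dd)

Whether this is the right way to combine Slurm with Distributed.jl is exactly the kind of guidance I am hoping for.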