I tried to incorporate the advice you have on the other thread using a naive port of your model code into my program as follows
println("Number of threads = $(Threads.nthreads())")
begin
using LoopVectorization
using MKL
using DelimitedFiles
using LinearAlgebra
using LaTeXStrings
using TimerOutputs
BLAS.set_num_threads(1)
using Plots
using Plots.PlotMeasures
using FFTW
using Base.Threads
using SharedArrays
using ChunkSplitters
end
function GFC(t::Float64,b::Float64,Δ::Float64,sysize::Int64,Ωf::Diagonal{Float64},Ωi::Diagonal{Float64},J::Matrix{Float64},D::Matrix{Float64},ρ::Diagonal{Float64},Si::Diagonal{ComplexF64},Sf::Diagonal{ComplexF64},Bi::Diagonal{ComplexF64},Bf::Diagonal{ComplexF64})
t = t* 4.1341373335*10^16 # converting time to atomic units
tp = - im*b - t
for i = 1:sysize
Sf[i,i]=Ωf[i,i]/sin(Ωf[i,i]*t)
Si[i,i]=Ωi[i,i]/sin(Ωi[i,i]*tp)
Bf[i,i]=Ωf[i,i]/tan(Ωf[i,i]*t)
Bi[i,i]=Ωi[i,i]/tan(Ωi[i,i]*tp)
end
BB = (Bf+J'*Bi*J)
AA = (Sf+J'*Si*J)
W = [BB -AA
-AA BB] #We wont calculate the 2Nx2N determinant directly.
# Wapp = BB*(BB- AA*(BB^-1)*AA )
Wapp = BB*(BB- AA*(BB\AA) )
U = Bi-Si
V = [J'*U*D
J'*U*D]
# F1 = sqrt(det(Si*Sf*ρ^2*Wapp^-1))
# F1 = sqrt(det(Si*Sf*ρ^2)/det(Wapp))
F1 = sqrt(det(Si*Sf* (Wapp\ρ^2)))
F2 = (-(im/2)*(transpose(V)* (W\V))) + ((im)*(transpose(D)*U*D))
g = F1*exp(F2)
Δ = Δ * 0.0367493 # in atomic units
g = g * exp(im * Δ * t)
η = 2 * 4.55633 * 10^-6 # 10 cm^-1 to hatree energy
#g = g * exp(-η * abs(t)) # Adding the damping factor Lorrentzian
g = g * exp(-η * t^2) # Adding the damping factor Gaussian
# println("t = $t was evaluated by process $(myid())")
# println("GFC at t = $t is $g")
# println("F1 at t = $t = $F1")
return g[1]
end
function GHT(t::Float64,b::Float64,sysize::Int64,Ωf::Diagonal{Float64},Ωi::Diagonal{Float64},J::Matrix{Float64},D::Matrix{Float64},Si::Diagonal{ComplexF64},Sf::Diagonal{ComplexF64},Bi::Diagonal{ComplexF64},Bf::Diagonal{ComplexF64},X11::Matrix{ComplexF64},X12::Matrix{ComplexF64},X21::Matrix{ComplexF64},X22::Matrix{ComplexF64},Y1::Matrix{ComplexF64},Y2::Matrix{ComplexF64},gfcpart::ComplexF64,T::Matrix{Float64})
################################## Preliminary Part , Common for both Frank Condon and Herzberg Teller ##############################
t = t* 4.1341373335*10^16 # converting time to atomic units
tp = - im*b - t
for i = 1:sysize
Sf[i,i]=Ωf[i,i]/sin(Ωf[i,i]*t)
Si[i,i]=Ωi[i,i]/sin(Ωi[i,i]*tp)
Bf[i,i]=Ωf[i,i]/tan(Ωf[i,i]*t)
Bi[i,i]=Ωi[i,i]/tan(Ωi[i,i]*tp)
end
W = [(Bf+J'*Bi*J) -(Sf+J'*Si*J)
-(Sf+J'*Si*J) (Bf+J'*Bi*J)]
U = Bi-Si
V = [J'*U*D
J'*U*D]
########## Defining the X and Y Matrices ##################################
################# Final Calculation and returning value ##################
s=0
MJSJ = J'*Si*J
MJBJ = J'*Bi*J
MJUD = J'*U*D
for m in range(1,sysize)
for mp in range(1,sysize)
# @. X11 = 0.0
# @. X12 = 0.0
# @. X21 = 0.0
# @. X22 = 0.0
X11[m,:]=(MJSJ)[mp,:]*(-Bf[m,m])
X12[m,:]=(MJBJ)[mp,:]*(Bf[m,m])
X21[m,:]=(MJSJ)[mp,:]*(Sf[m,m])
X22[m,:]=(MJBJ)[mp,:]*(-Sf[m,m])
X = [X11 X12
X21 X22]
# @. Y1 = 0.0
# @. Y2 = 0.0
Y1[m,:] = Bf[m,m]*(MJUD)[mp,:]
Y2[m,:] = -Sf[m,m]*(MJUD)[mp,:]
Y = [Y1
Y2]
g = im*tr(W\X)+ (transpose(W\V)*X*(W\V))[1] - (transpose(Y)*(W\V))[1]
s = s+g[1]*gfcpart*T[m,mp]
end
for i in 1:sysize
X11[m,i]=0
X12[m,i]=0
X21[m,i]=0
X22[m,i]=0
Y1[m,1]=0
Y2[m,1]=0
end
end
return s[1]
end
begin ## Define all constant variables
const Delta::Float64 = 2.02 # in eV
const dd::Int64 = 11 # Change this to modify number of points
const Temp::Int64=300
kT = Temp * 1.380649 * 10^-23 # Boltzmann constant times temperature in Joules
kT= kT * 2.29371227840*10^17 # in atomic units
const b ::Float64= 1/kT
const omegaf ::Diagonal{Float64}= Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n')))*4.55633*10^-6 # Final state frequencies in atomic units
const omegai ::Diagonal{Float64}= Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n')))*4.55633*10^-6 # Initial state frequencies in atomic units
const sysize ::Int64 = size(omegai)[1]
const P = @. 2*sinh(b*omegai/2)
T ::Matrix{Float64}= readdlm("nac.txt", '\t', Float64, '\n'); # NAC matrix
T = T*T';
const D ::Matrix{Float64}= readdlm("d.txt", Float64); # Displacement Matrix
const J ::Matrix{Float64}= readdlm("j.txt", Float64); # Duchinsky Matrix
t ::Vector{Float64}= collect(range(-5*10^-12,stop=5*10^-12,length=dd))
t[div(dd+1,2)] = 10e-25
end
function calc(t,b,Δ,sysize,Ωf,Ωi,J,D,P)
nchunks = Threads.nthreads() # define number of chunks e.g. equal to number of threads
# @sync tells Julia to wait for any tasks started inside it
open("CORE-PARALLEL-OUTPUT-RV6.TXT", "a") do f
write(f, "STARTING LOOP WITH NUMBER OF THREADS = $(Threads.nthreads()) \n====================================================\n")
end
@sync for (xrange, ichunk) in chunks(range(1,dd), nchunks)
# xrange are the indices this chunk contains, ichunk is just the index of the chunk, which we don't need in this scenario
Threads.@spawn begin # this line tells Julia to start a new thread with the contents of the begin block
Sf ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
Si ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
Bf ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
Bi ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
X11 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,sysize)
X12 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,sysize)
X21 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,sysize)
X22 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,sysize)
Y1 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,1)
Y2 :: Matrix{ComplexF64} = zeros(ComplexF64,sysize,1)
x = zeros(ComplexF64,dd)
for i in xrange
@time for i in 1:dd
gfcpart = GFC(t[i],b,Δ,sysize,Ωf,Ωi,J,D,P,Si,Sf,Bi,Bf)
gtotal = GHT(t[i],b,sysize,Ωf,Ωi,J,D,Si,Sf,Bi,Bf,X11,X12,X21,X22,Y1,Y2,gfcpart,T)
x[i] = gtotal
open("CORE-PARALLEL-OUTPUT-RV6.TXT", "a") do f
write(f, "ITERATION $i COMPLETED BY PROCESS $(Threads.threadid()) \n")
end
end
end # loop
end # thread work load
end # loop over chunks, Julia will wait until all task are completed
open("CORE-PARALLEL-OUTPUT-RV6.TXT", "a") do f
write(f, "LOOP COMPLETE \n====================================================")
end
yr = real.(x)
yi = imag.(x)
p = plot(t,yr,linewidth = 2,color = "blue",title = "Δ = $Δ",label = "Real part",grid=false)
p =plot!(t,yi,linewidth = 2,color = "red",label = "Imaginary part",size=(1000,800),xlims=(-30*10^-15,30*10^-15))
savefig(p,"gt.svg")
begin # This part does FFTW and finds final rate constant
fs = abs(t[1]-t[2])^-1
signal = yr
p = plan_fft(signal)
F = fftshift(p*signal)
freqs = fftshift(fftfreq(length(t), fs)) # X axis has frequencies in s^-1
#Covert freqs to eV
freqs = freqs * 4.13558 * 10^-15
F = F * 6.57 * 10^15
time_domain = plot(t, signal, title = L"G(t)",legend=false,color="orange",xlabel=L"\mathrm{Time}\ (s)",ylabel=L"\mathrm{A.u}")
freq_domain = plot(freqs, abs.(F), title = L"\mathrm{Fourier\ transformed\ g(\bar{\nu}))}",legend=false,xlabel=L"\mathrm{Energy\ (eV)}",ylabel=L"\mathrm{k_{IC}\ (s^{-1})}",xlims=(0,4))
p = plot(time_domain, freq_domain, layout = (2,1), size = (1000, 800),right_margin = 10mm)
savefig(p,"gt-fft.svg")
ee = Δ# * 8065.73 #Converting eV to cm^-1
index = findmin(abs.(freqs .- ee))[2] # Find the index of freq closest to ee
freqs[index]
println("The rate constant is ",abs.(F)[index])
rc = abs.(F)[index]
end
begin # This part writes the output to a file
open("CORE-PARALLEL-OUTPUT-RV6.TXT", "a") do f
write(f, "-------------------------\n")
write(f,"NUMBER OF CORES = $(Threads.nthreads())\n")
write(f,"The value of Δ is $Δ\n")
write(f,"Number of time points = $dd\n")
write(f, "The rate constant is $rc\n")
write(f,"-------------------------\n")
write(f,"The values of t,Re(g(t)),Im(g(t)) are \n\n")
for i in range(1,dd)
write(f,"$(t[i]) $(yr[i]) $(yi[i])\n")
end
end
end
end
timeneed = @elapsed calc(t,b,Delta,sysize,omegaf,omegai,J,D,P)
open("CORE-PARALLEL-OUTPUT-RV6.TXT", "a") do f
write(f, "\n\n\n===============================================================\nThe time elapsed for main loop [INCLUDING GC + C + RC]= $timeneed seconds \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
end
I tried running it with julia -t 1
but I noticed that it seems to be going on to an infinite loop, the output (directly generated from the above code is as follow)
STARTING LOOP WITH NUMBER OF THREADS = 1
====================================================
ITERATION 1 COMPLETED BY PROCESS 1
ITERATION 2 COMPLETED BY PROCESS 1
ITERATION 3 COMPLETED BY PROCESS 1
ITERATION 4 COMPLETED BY PROCESS 1
ITERATION 5 COMPLETED BY PROCESS 1
ITERATION 6 COMPLETED BY PROCESS 1
ITERATION 7 COMPLETED BY PROCESS 1
ITERATION 8 COMPLETED BY PROCESS 1
ITERATION 9 COMPLETED BY PROCESS 1
ITERATION 10 COMPLETED BY PROCESS 1
ITERATION 11 COMPLETED BY PROCESS 1
ITERATION 1 COMPLETED BY PROCESS 1
ITERATION 2 COMPLETED BY PROCESS 1
ITERATION 3 COMPLETED BY PROCESS 1
ITERATION 4 COMPLETED BY PROCESS 1
ITERATION 5 COMPLETED BY PROCESS 1
After completion of 11
iterations it again starts from the first when I set the number of threads = 1. I also do not understand the purpose (or lack theirof) of the variable ichunk