I am trying to write code that runs 3 nested loops and calls a computationally expensive function inside them. The expense comes mainly from constructing, multiplying, and inverting large (~100x100) matrices. I define the variables that are needed repeatedly in the global scope as follows:
using DelimitedFiles, LinearAlgebra   # readdlm, Diagonal, det, etc.

Temp = 300
kT = Temp * 1.380649e-23            # Boltzmann constant times temperature, in Joules
kT = kT * 2.29371227840e17          # converted to atomic units
b = 1 / kT
Ωf = readdlm("wf.txt", '\t', Float64, '\n')   # final-state frequencies
Ωi = readdlm("wi.txt", '\t', Float64, '\n')   # initial-state frequencies
J = readdlm("j.txt", Float64);      # J matrix
D = readdlm("d.txt", Float64);      # Duschinsky matrix
Ωf = Matrix(Diagonal(vec(Ωf)))
Ωi = Matrix(Diagonal(vec(Ωi)))
sysize = size(Ωi)[1]
Ωf = Ωf .* 4.55633e-6               # converting from cm^-1 to atomic units of energy (Hartree)
Ωi = Ωi .* 4.55633e-6;              # same as above
ρ = zeros(Float64, sysize, sysize)
for i in 1:sysize
    ρ[i, i] = 2 * sinh(b * Ωi[i, i] / 2)
end
T = readdlm("nac.txt", '\t', Float64, '\n');  # NAC matrix
T = T * T';
Sf = zeros(ComplexF64, sysize, sysize)
Si = zeros(ComplexF64, sysize, sysize)
Bf = zeros(ComplexF64, sysize, sysize)
Bi = zeros(ComplexF64, sysize, sysize)
After that I define two of my functions:
function GFC(t::Float64, T::Float64, Δ::Float64, sysize::Int64, Ωf::Matrix{Float64}, Ωi::Matrix{Float64}, J::Matrix{Float64}, D::Matrix{Float64})
    t = t * 4.1341373335e16          # converting time to atomic units
    tp = -im * b - t                  # b, ρ and the preallocated Sf, Si, Bf, Bi are the globals defined above
    for i = 1:sysize
        Sf[i, i] = Ωf[i, i] / sin(Ωf[i, i] * t)
        Si[i, i] = Ωi[i, i] / sin(Ωi[i, i] * tp)
        Bf[i, i] = Ωf[i, i] / tan(Ωf[i, i] * t)
        Bi[i, i] = Ωi[i, i] / tan(Ωi[i, i] * tp)
    end
    BB = (Bf + J' * Bi * J)
    AA = (Sf + J' * Si * J)
    W = [BB -AA
         -AA BB]                      # we won't calculate the 2Nx2N determinant directly
    Wapp = BB * (BB - AA * (BB^-1) * AA)
    U = Bi - Si
    V = [J' * U * D
         J' * U * D]
    F1 = sqrt(det(Si * Sf * ρ^2 * Wapp^-1))
    F2 = (-(im / 2) * (V' * W^-1 * V)) + ((im) * (D' * U * D))
    g = F1 * exp(F2)
    return g
end
and another function, GHT, that has a similar computational cost; I have omitted its body here since it is structurally similar to GFC.
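For reference, this is GHT's signature as I call it in calc() below; the body shown here is only a placeholder, not my actual implementation:

function GHT(t::Float64, T::Float64, Δ::Float64, sysize::Int64, Ωf::Matrix{Float64}, Ωi::Matrix{Float64}, J::Matrix{Float64}, D::Matrix{Float64}, m::Int64, mp::Int64)
    # ... builds matrices analogous to those in GFC (using the same globals) and returns a complex value ...
end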
I am calling both functions inside a loop as follows:
function calc()
    @sync @distributed for i in 1:dd
        s = 0
        if t[i] == 0
            t[i] = 1e-35              # avoid the singularity at t = 0
        end
        gfcpart = GFC(t[i], 300.0, Δ, sysize, Ωf, Ωi, J, D)[1]
        for m in 1:sysize
            for mp in 1:sysize
                s = s + GHT(t[i], 300.0, Δ, sysize, Ωf, Ωi, J, D, m, mp)[1] * gfcpart * T[m, mp]
            end
        end
        x[i] = s
        # x[i] = (GHT(t[i],300.0,Δ,sysize,Ωf,Ωi,J,D,1,30))[1]*GFC(t[i],300.0,Δ,sysize,Ωf,Ωi,J,D)[1]*T[1,1]*det(ρ)
    end
    @sync @distributed for i in 1:dd
        yr[i] = real(x[i])
        yi[i] = imag(x[i])
    end
    p = plot(t, yr, linewidth = 2, color = "blue", title = "Δ = $Δ", label = "Real part", grid = false)
    p = plot!(t, yi, linewidth = 2, color = "red", label = "Imaginary part", size = (1000, 800), xlims = (-30e-15, 30e-15))
    savefig(p, "gt.svg")
end
Here I have tried to distribute the first loop among different processors in the hope of gaining some performance, and I use SharedArrays for variables like x, yr, and yi that are accessed by all the workers. I have checked that the distributed code gives the same results as the serial code, so I know there is no logical error in the parallelization I have implemented.
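For completeness, my worker and shared-array setup looks roughly like this (a sketch; the exact time grid here is an assumption based on the xlims I use in the plot):

using Distributed
addprocs(10)                                  # number of worker processes used in the timing below
@everywhere using SharedArrays, DelimitedFiles, LinearAlgebra
using Plots
# (the globals and the functions GFC/GHT above are likewise defined on all workers)

dd = 1000                                     # number of points on the time grid
t  = SharedArray{Float64}(dd)
t .= range(-30e-15, 30e-15, length = dd)      # assumed grid, matching the plot limits
x  = SharedArray{ComplexF64}(dd)              # accumulated result per time point
yr = SharedArray{Float64}(dd)                 # real part, filled in calc()
yi = SharedArray{Float64}(dd)                 # imaginary part, filled in calc()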
The problem
The code scales horribly as the matrices get larger, and the parallelization does not give as much of a performance boost as expected. For example, when the dimension of the matrices was 30, calc() took 778 seconds to complete with dd=1000 points in the outer loop when running serially. With 10 processors it took 178 seconds instead, i.e. only about a 4.4x speedup. Is there any way to attain better scaling by fixing inefficiencies in the code that affect both the serial and the parallel computation?