I am trying to write code that runs 3 nested loops and calls a computationally expensive function inside them. The expense mainly comes from defining, multiplying, and inverting large (~100x100) matrices. I define the variables that will be needed repeatedly in the global scope as follows:

```
using DelimitedFiles, LinearAlgebra

Temp = 300
kT = Temp * 1.380649e-23          # Boltzmann constant times temperature, in joules
kT = kT * 2.29371227840e17        # in atomic units
b = 1 / kT
Ωf = readdlm("wf.txt", '\t', Float64, '\n')   # Final state frequencies
Ωi = readdlm("wi.txt", '\t', Float64, '\n')   # Initial state frequencies
J = readdlm("j.txt", Float64);    # J matrix
D = readdlm("d.txt", Float64);    # Duschinsky matrix
Ωf = Matrix(Diagonal(vec(Ωf)))
Ωi = Matrix(Diagonal(vec(Ωi)))
sysize = size(Ωi, 1)
Ωf = Ωf .* 4.55633e-6             # converting from cm^-1 to atomic units (hartree)
Ωi = Ωi .* 4.55633e-6;            # same as above
ρ = zeros(Float64, sysize, sysize)
for i in 1:sysize
    ρ[i, i] = 2 * sinh(b * Ωi[i, i] / 2)
end
T = readdlm("nac.txt", '\t', Float64, '\n');  # NAC matrix
T = T * T';
Sf = zeros(ComplexF64, sysize, sysize)
Si = zeros(ComplexF64, sysize, sysize)
Bf = zeros(ComplexF64, sysize, sysize)
Bi = zeros(ComplexF64, sysize, sysize)
```
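
Because the loop shown later is parallelized with `@distributed`, this setup (and the functions below) has to be available on every worker process. A minimal sketch of how that is done, assuming 10 workers as in the timings quoted at the end:

```julia
using Distributed
addprocs(10)                      # one worker per processor

@everywhere begin
    using DelimitedFiles, LinearAlgebra
    # ...all of the global definitions above are repeated here, so every
    # worker gets its own copy of b, ρ, Ωf, Ωi, J, D, T, Sf, Si, Bf, Bi...
end
```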

After that, I define two of my functions:

```
function GFC(t::Float64, T::Float64, Δ::Float64, sysize::Int64, Ωf::Matrix{Float64}, Ωi::Matrix{Float64}, J::Matrix{Float64}, D::Matrix{Float64})
    t = t * 4.1341373335e16       # converting time to atomic units
    tp = -im * b - t
    for i = 1:sysize
        Sf[i, i] = Ωf[i, i] / sin(Ωf[i, i] * t)
        Si[i, i] = Ωi[i, i] / sin(Ωi[i, i] * tp)
        Bf[i, i] = Ωf[i, i] / tan(Ωf[i, i] * t)
        Bi[i, i] = Ωi[i, i] / tan(Ωi[i, i] * tp)
    end
    BB = (Bf + J' * Bi * J)
    AA = (Sf + J' * Si * J)
    W = [BB -AA
         -AA BB]                  # we won't calculate the 2Nx2N determinant directly
    Wapp = BB * (BB - AA * (BB^-1) * AA)
    U = Bi - Si
    V = [J' * U * D
         J' * U * D]
    F1 = sqrt(det(Si * Sf * ρ^2 * Wapp^-1))
    F2 = (-(im / 2) * (V' * W^-1 * V)) + (im * (D' * U * D))
    g = F1 * exp(F2)
    return g
end
```
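
For reference, the reason `Wapp` can stand in for the full `2N x 2N` determinant (as the comment in the code says) is the standard block-determinant / Schur-complement identity, valid whenever `BB` is invertible:

$$
\det\begin{pmatrix} BB & -AA \\ -AA & BB \end{pmatrix}
= \det(BB)\,\det\!\left(BB - AA\,BB^{-1}AA\right)
= \det(\mathrm{Wapp}).
$$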

and another function `GHT` that has similar computational complexity. I am calling them inside a loop as follows:

```
function calc()
    @sync @distributed for i in 1:dd
        s = 0
        if t[i] == 0
            t[i] = 1e-35          # avoid the singularity at t = 0
        end
        gfcpart = GFC(t[i], 300.0, Δ, sysize, Ωf, Ωi, J, D)[1]
        for m in 1:sysize
            for mp in 1:sysize
                s = s + GHT(t[i], 300.0, Δ, sysize, Ωf, Ωi, J, D, m, mp)[1] * gfcpart * T[m, mp]
            end
        end
        x[i] = s
        # x[i] = (GHT(t[i],300.0,Δ,sysize,Ωf,Ωi,J,D,1,30))[1]*GFC(t[i],300.0,Δ,sysize,Ωf,Ωi,J,D)[1]*T[1,1]*det(ρ)
    end
    @sync @distributed for i in 1:dd
        yr[i] = real(x[i])
        yi[i] = imag(x[i])
    end
    p = plot(t, yr, linewidth=2, color="blue", title="Δ = $Δ", label="Real part", grid=false)
    p = plot!(t, yi, linewidth=2, color="red", label="Imaginary part", size=(1000, 800), xlims=(-30e-15, 30e-15))
    savefig(p, "gt.svg")
end
```

Here I have tried to distribute the first loop among different processors in the hope of gaining some performance. I have used `SharedArray`s for variables like `x`, `yr`, and `yi` that may be accessed by all the processors. I have checked that the distributed code gives the same results as the serial code, so I know there is no logical error in the parallelization I have implemented.
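
For completeness, the shared arrays are declared roughly like this (a sketch; `dd` is the number of time points, and the element types match how the arrays are used above):

```julia
using SharedArrays

x  = SharedArray{ComplexF64}(dd)  # accumulated result, written by the workers
yr = SharedArray{Float64}(dd)     # real part, for plotting
yi = SharedArray{Float64}(dd)     # imaginary part, for plotting
```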

**The problem** The code scales horribly as the matrices get larger, and the parallelization is not giving as much of a performance boost as expected. For example, when the dimension of the matrices was `30`, a `calc()` run with `dd = 1000` points in the outer loop took `778` seconds serially; with `10` processors it took `178` seconds instead. Is there any way to attain better scaling by fixing inefficiencies in the code that affect both the serial and parallel computations?