How to improve performance in a function that repeatedly defines and multiplies matrices

You can check with the @code_warntype macro, e.g. @code_warntype calc(), but its output can be a bit overwhelming, especially for such a long function. If you see red, the compiler cannot infer that variable's type, and a function barrier will help performance.
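For example, here is a minimal sketch of both the check and the barrier idea (run_all and hot_loop are placeholder names, not functions from the actual code):

using InteractiveUtils, DelimitedFiles   # @code_warntype lives in InteractiveUtils outside the REPL

# Without an element type, readdlm's return type depends on the file contents,
# so the compiler cannot infer it and @code_warntype flags it:
function run_all()
    data = readdlm("wi.txt")     # abstractly typed as far as the compiler is concerned
    return hot_loop(data)        # the barrier: hot_loop is compiled for the concrete runtime type
end

# Inside the barrier every argument has a concrete type, so this loop compiles to fast code.
function hot_loop(data)
    s = 0.0
    for x in data
        s += 2 * sinh(x / 2)
    end
    return s
end

@code_warntype run_all()         # only inspects inference, it does not read the file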

I tried running that and, as expected, it gave an overwhelming amount of output. But I was able to decipher that I had not explicitly declared the type of most variables, like Ωi etc. So I explicitly typed them as follows:

using Distributed
@everywhere using DelimitedFiles, LinearAlgebra, SharedArrays   # stdlibs used below, loaded on every process

function calc()
    @everywhere begin
        Δ::Float64 = 2.02                            # in eV
        dd::Int64 = 11                               # Change this to modify number of points
        Temp::Int64 = 300
        kT = Temp * 1.380649 * 10^-23                # Boltzmann constant times temperature in Joules
        kT = kT * 2.29371227840 * 10^17              # in atomic units
        b::Float64 = 1 / kT
        Ωf::Matrix{Float64} = Matrix(Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n')))) * 4.55633 * 10^-6   # Final state frequencies in atomic units
        Ωi::Matrix{Float64} = Matrix(Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n')))) * 4.55633 * 10^-6   # Initial state frequencies in atomic units
        sysize::Int64 = size(Ωi)[1]
        P = zeros(Float64, sysize, sysize)
        for i in range(1, sysize)
            P[i, i] = 2 * sinh(b * Ωi[i, i] / 2)
        end
        T::Matrix{Float64} = readdlm("nac.txt", '\t', Float64, '\n')   # NAC matrix
        T = T * T'
        D::Matrix{Float64} = readdlm("d.txt", Float64)                 # Displacement Matrix
        J::Matrix{Float64} = readdlm("j.txt", Float64)                 # Duschinsky Matrix
        Sf::Matrix{ComplexF64} = zeros(ComplexF64, sysize, sysize)
        Si::Matrix{ComplexF64} = zeros(ComplexF64, sysize, sysize)
        Bf::Matrix{ComplexF64} = zeros(ComplexF64, sysize, sysize)
        Bi::Matrix{ComplexF64} = zeros(ComplexF64, sysize, sysize)
        X11::Matrix{ComplexF64} = zeros(ComplexF64, sysize, sysize)
        X12::Matrix{ComplexF64} = zeros(ComplexF64, sysize, sysize)
        X21::Matrix{ComplexF64} = zeros(ComplexF64, sysize, sysize)
        X22::Matrix{ComplexF64} = zeros(ComplexF64, sysize, sysize)
        Y1::Matrix{ComplexF64} = zeros(ComplexF64, sysize, 1)
        Y2::Matrix{ComplexF64} = zeros(ComplexF64, sysize, 1)
    end
    t::Vector{Float64} = collect(range(-5*10^-12, stop=5*10^-12, length=dd))
    t[Int64((dd+1)/2)] = 10e-25
    x = SharedArray(zeros(ComplexF64, dd))
    yr = SharedArray(zeros(Float64, dd))
    yi = SharedArray(zeros(Float64, dd))

    ################## Nested Loop Over Time ###################################
    @time @sync @distributed for i in range(1, dd)
        gfcpart = GFC(t[i], b, Δ, sysize, Ωf, Ωi, J, D, P, Si, Sf, Bi, Bf)[1]
        gtotal = GHT(t[i], b, sysize, Ωf, Ωi, J, D, Si, Sf, Bi, Bf, X11, X12, X21, X22, Y1, Y2, gfcpart, T)
        x[i] = gtotal
        println("ITERATION $i COMPLETED BY PROCESS $(myid())")
    end
    for i in range(1, dd)
        yr[i] = real(x[i])
        yi[i] = imag(x[i])
    end

As you can see, I also followed the advice of avoiding global variables: all the variables are declared within the calc() function and then passed as parameters. I modified the definitions of GFC and GHT accordingly so that they reuse the preallocated matrices rather than allocating memory on every iteration. To my surprise, the time taken increased from 39 seconds to 50 seconds! (I excluded compilation time by running the function twice.)
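For reference, this is the kind of reuse pattern I mean inside GFC and GHT (a simplified sketch with a placeholder function name and formula, not the actual code):

using LinearAlgebra

# The caller allocates Sf once; update! overwrites it in place on every call
# instead of building a new matrix. (update! and the expression on the diagonal
# are placeholders, not the real GFC/GHT internals.)
function update!(Sf::Matrix{ComplexF64}, Ωf::Matrix{Float64}, t::Float64, b::Float64)
    fill!(Sf, 0)                                       # wipe the previous iteration's contents
    @inbounds for i in axes(Sf, 1)
        Sf[i, i] = sinh((b / 2 + im * t) * Ωf[i, i])   # placeholder expression
    end
    return Sf                                          # same array that was passed in, no new allocation
end

# Matrix products can reuse memory the same way with mul!, which overwrites C in place:
#     mul!(C, A, B)    # instead of C = A * B, which allocates a fresh matrix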