How to improve performance in a function that repeatedly defines and multiplies matrices

I tried to incorporate the advice you have on the other thread using a naive port of your model code into my program as follows

println("Number of threads = $(Threads.nthreads())")
begin
    using LoopVectorization
    using MKL
    using DelimitedFiles
    using LinearAlgebra
    using LaTeXStrings
    using TimerOutputs
    BLAS.set_num_threads(1)
    using Plots
    using Plots.PlotMeasures
    using FFTW
    using Base.Threads
    using SharedArrays
    using ChunkSplitters
end
function GFC(t::Float64,b::Float64,Δ::Float64,sysize::Int64,Ωf::Diagonal{Float64},Ωi::Diagonal{Float64},J::Matrix{Float64},D::Matrix{Float64},ρ::Diagonal{Float64},Si::Diagonal{ComplexF64},Sf::Diagonal{ComplexF64},Bi::Diagonal{ComplexF64},Bf::Diagonal{ComplexF64})
    t = t* 4.1341373335*10^16                     # converting time to atomic units
    tp = - im*b - t
    for i = 1:sysize
        Sf[i,i]=Ωf[i,i]/sin(Ωf[i,i]*t)
        Si[i,i]=Ωi[i,i]/sin(Ωi[i,i]*tp)
        Bf[i,i]=Ωf[i,i]/tan(Ωf[i,i]*t)
        Bi[i,i]=Ωi[i,i]/tan(Ωi[i,i]*tp)
    end
    BB = (Bf+J'*Bi*J)
    AA = (Sf+J'*Si*J)

    W = [BB  -AA
        -AA  BB] #We wont calculate the 2Nx2N determinant directly.
    
    # Wapp = BB*(BB-  AA*(BB^-1)*AA  )
    Wapp = BB*(BB-  AA*(BB\AA)  )
    U = Bi-Si
    V = [J'*U*D
        J'*U*D]
    # F1 = sqrt(det(Si*Sf*ρ^2*Wapp^-1))
    # F1 = sqrt(det(Si*Sf*ρ^2)/det(Wapp))
    F1 = sqrt(det(Si*Sf*  (Wapp\ρ^2)))
    F2 = (-(im/2)*(transpose(V)* (W\V)))    +  ((im)*(transpose(D)*U*D))
    g = F1*exp(F2)
    Δ = Δ * 0.0367493  # in atomic units
    g = g * exp(im * Δ * t)
    η = 2 * 4.55633 * 10^-6  # 10 cm^-1 to hatree energy
    #g = g * exp(-η * abs(t))     # Adding the damping factor Lorrentzian
    g = g * exp(-η * t^2)     # Adding the damping factor Gaussian
    # println("t = $t was evaluated by process $(myid())")
    # println("GFC at t = $t   is     $g")
    # println("F1 at t = $t         = $F1")
    return g[1]
end
function GHT(t::Float64,b::Float64,sysize::Int64,Ωf::Diagonal{Float64},Ωi::Diagonal{Float64},J::Matrix{Float64},D::Matrix{Float64},Si::Diagonal{ComplexF64},Sf::Diagonal{ComplexF64},Bi::Diagonal{ComplexF64},Bf::Diagonal{ComplexF64},X11::Matrix{ComplexF64},X12::Matrix{ComplexF64},X21::Matrix{ComplexF64},X22::Matrix{ComplexF64},Y1::Matrix{ComplexF64},Y2::Matrix{ComplexF64},gfcpart::ComplexF64,T::Matrix{Float64})




    ################################## Preliminary Part , Common for both Frank Condon and Herzberg Teller ##############################
    t = t* 4.1341373335*10^16                     # converting time to atomic units
    tp = - im*b - t
    for i = 1:sysize
        Sf[i,i]=Ωf[i,i]/sin(Ωf[i,i]*t)
        Si[i,i]=Ωi[i,i]/sin(Ωi[i,i]*tp)
        Bf[i,i]=Ωf[i,i]/tan(Ωf[i,i]*t)
        Bi[i,i]=Ωi[i,i]/tan(Ωi[i,i]*tp)
    end
    W = [(Bf+J'*Bi*J)  -(Sf+J'*Si*J)
        -(Sf+J'*Si*J)  (Bf+J'*Bi*J)]
    U = Bi-Si
    V = [J'*U*D
        J'*U*D]

    ########## Defining the X and Y Matrices ##################################


    ################# Final Calculation and returning value ##################
    s=0
    MJSJ = J'*Si*J
    MJBJ = J'*Bi*J
    MJUD = J'*U*D
    for m in range(1,sysize)
        for mp in range(1,sysize)
            # @. X11 = 0.0
            # @. X12 = 0.0
            # @. X21 = 0.0
            # @. X22 = 0.0
            X11[m,:]=(MJSJ)[mp,:]*(-Bf[m,m])
            X12[m,:]=(MJBJ)[mp,:]*(Bf[m,m])
            X21[m,:]=(MJSJ)[mp,:]*(Sf[m,m])
            X22[m,:]=(MJBJ)[mp,:]*(-Sf[m,m])
            X = [X11 X12
                X21 X22]
            # @. Y1 = 0.0
            # @. Y2 = 0.0
            Y1[m,:] = Bf[m,m]*(MJUD)[mp,:] 
            Y2[m,:] = -Sf[m,m]*(MJUD)[mp,:]
            Y = [Y1
                Y2]
            g = im*tr(W\X)+    (transpose(W\V)*X*(W\V))[1]      -    (transpose(Y)*(W\V))[1]
            s = s+g[1]*gfcpart*T[m,mp]
        end
        for i in 1:sysize
            X11[m,i]=0
            X12[m,i]=0
            X21[m,i]=0
            X22[m,i]=0
            Y1[m,1]=0
            Y2[m,1]=0
        end
    end
    return s[1]
end

begin ## Define all constant variables
    const Delta::Float64 = 2.02 # in eV
    const dd::Int64 = 11                               # Change this to modify number of points
    const Temp::Int64=300
    kT = Temp * 1.380649 * 10^-23  # Boltzmann constant times temperature in Joules
    kT= kT * 2.29371227840*10^17         # in atomic units
    const b ::Float64= 1/kT
    const omegaf ::Diagonal{Float64}= Diagonal(vec(readdlm("wf.txt", '\t', Float64, '\n')))*4.55633*10^-6   # Final state  frequencies in atomic units
    const omegai ::Diagonal{Float64}= Diagonal(vec(readdlm("wi.txt", '\t', Float64, '\n')))*4.55633*10^-6  # Initial state  frequencies in atomic units
    const sysize ::Int64 = size(omegai)[1]
    const P = @. 2*sinh(b*omegai/2)
    T ::Matrix{Float64}= readdlm("nac.txt", '\t', Float64, '\n'); # NAC matrix
    T = T*T';
    const D ::Matrix{Float64}= readdlm("d.txt", Float64); # Displacement Matrix
    const J ::Matrix{Float64}= readdlm("j.txt", Float64); # Duchinsky Matrix
    t ::Vector{Float64}=  collect(range(-5*10^-12,stop=5*10^-12,length=dd))
    t[div(dd+1,2)] = 10e-25
end

function calc(t,b,Δ,sysize,Ωf,Ωi,J,D,P)
    nchunks = Threads.nthreads() # define number of chunks e.g. equal to number of threads
    # @sync tells Julia to wait for any tasks started inside it
    open("CORE-PARALLEL-OUTPUT-RV6.TXT", "a") do f
        write(f, "STARTING LOOP WITH NUMBER OF THREADS = $(Threads.nthreads()) \n====================================================\n")
    end
    @sync for (xrange, ichunk) in chunks(range(1,dd), nchunks)
        # xrange are the indices this chunk contains, ichunk is just the index of the chunk, which we don't need in this scenario
        Threads.@spawn begin # this line tells Julia to start a new thread with the contents of the begin block
            Sf ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
            Si ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
            Bf ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
            Bi ::Diagonal{ComplexF64}= Diagonal(zeros(ComplexF64,sysize))
            X11 :: Matrix{ComplexF64}  = zeros(ComplexF64,sysize,sysize)
            X12 :: Matrix{ComplexF64}  = zeros(ComplexF64,sysize,sysize)
            X21 :: Matrix{ComplexF64}  = zeros(ComplexF64,sysize,sysize)
            X22 :: Matrix{ComplexF64}  = zeros(ComplexF64,sysize,sysize)
            Y1  :: Matrix{ComplexF64} =  zeros(ComplexF64,sysize,1)
            Y2  :: Matrix{ComplexF64} =  zeros(ComplexF64,sysize,1)
            x = zeros(ComplexF64,dd)
            for i in xrange
                @time for i in 1:dd
                    gfcpart = GFC(t[i],b,Δ,sysize,Ωf,Ωi,J,D,P,Si,Sf,Bi,Bf)
                    gtotal = GHT(t[i],b,sysize,Ωf,Ωi,J,D,Si,Sf,Bi,Bf,X11,X12,X21,X22,Y1,Y2,gfcpart,T)
                    x[i] = gtotal
                    open("CORE-PARALLEL-OUTPUT-RV6.TXT", "a") do f
                        write(f, "ITERATION $i COMPLETED BY PROCESS $(Threads.threadid()) \n")
                    end
                end  
            end # loop
        end # thread work load
    end # loop over chunks, Julia will wait until all task are completed
    open("CORE-PARALLEL-OUTPUT-RV6.TXT", "a") do f
        write(f, "LOOP COMPLETE \n====================================================")
    end
    yr = real.(x)
    yi = imag.(x)
    p = plot(t,yr,linewidth = 2,color = "blue",title = "Δ = $Δ",label = "Real part",grid=false)
    p =plot!(t,yi,linewidth = 2,color = "red",label = "Imaginary part",size=(1000,800),xlims=(-30*10^-15,30*10^-15))
    savefig(p,"gt.svg")
    begin # This part does FFTW and finds final rate constant

        fs = abs(t[1]-t[2])^-1
        signal = yr
        p = plan_fft(signal)
        F = fftshift(p*signal)
        freqs = fftshift(fftfreq(length(t), fs))  # X axis has frequencies in s^-1
        #Covert freqs to eV
        freqs = freqs * 4.13558 * 10^-15
        F = F * 6.57 * 10^15
        time_domain = plot(t, signal, title = L"G(t)",legend=false,color="orange",xlabel=L"\mathrm{Time}\ (s)",ylabel=L"\mathrm{A.u}")
        freq_domain = plot(freqs, abs.(F), title = L"\mathrm{Fourier\ transformed\ g(\bar{\nu}))}",legend=false,xlabel=L"\mathrm{Energy\ (eV)}",ylabel=L"\mathrm{k_{IC}\ (s^{-1})}",xlims=(0,4))
        p = plot(time_domain, freq_domain, layout = (2,1), size = (1000, 800),right_margin = 10mm)
        savefig(p,"gt-fft.svg")
        ee = Δ# * 8065.73  #Converting eV to cm^-1
        index = findmin(abs.(freqs .- ee))[2]  # Find the index of freq closest to ee
        freqs[index]
        println("The rate constant is ",abs.(F)[index])
        rc = abs.(F)[index]
    end
    begin # This part writes the output to a file
        open("CORE-PARALLEL-OUTPUT-RV6.TXT", "a") do f
            write(f, "-------------------------\n")
            write(f,"NUMBER OF CORES = $(Threads.nthreads())\n")
            write(f,"The value of Δ is $Δ\n")
            write(f,"Number of time points = $dd\n")
            write(f, "The rate constant is $rc\n")
            write(f,"-------------------------\n")
            write(f,"The values of t,Re(g(t)),Im(g(t)) are \n\n")
            for i in range(1,dd)
                write(f,"$(t[i])  $(yr[i])  $(yi[i])\n")
            end
        end
    end

end


timeneed = @elapsed calc(t,b,Delta,sysize,omegaf,omegai,J,D,P)
open("CORE-PARALLEL-OUTPUT-RV6.TXT", "a") do f
    write(f, "\n\n\n===============================================================\nThe time elapsed for main loop [INCLUDING GC + C + RC]=  $timeneed seconds \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
end

I tried running it with julia -t 1 but I noticed that it seems to be going on to an infinite loop, the output (directly generated from the above code is as follow)

STARTING LOOP WITH NUMBER OF THREADS = 1 
====================================================
ITERATION 1 COMPLETED BY PROCESS 1 
ITERATION 2 COMPLETED BY PROCESS 1 
ITERATION 3 COMPLETED BY PROCESS 1 
ITERATION 4 COMPLETED BY PROCESS 1 
ITERATION 5 COMPLETED BY PROCESS 1 
ITERATION 6 COMPLETED BY PROCESS 1 
ITERATION 7 COMPLETED BY PROCESS 1 
ITERATION 8 COMPLETED BY PROCESS 1 
ITERATION 9 COMPLETED BY PROCESS 1 
ITERATION 10 COMPLETED BY PROCESS 1 
ITERATION 11 COMPLETED BY PROCESS 1 
ITERATION 1 COMPLETED BY PROCESS 1 
ITERATION 2 COMPLETED BY PROCESS 1 
ITERATION 3 COMPLETED BY PROCESS 1 
ITERATION 4 COMPLETED BY PROCESS 1 
ITERATION 5 COMPLETED BY PROCESS 1

After completion of 11 iterations it again starts from the first when I set the number of threads = 1. I also do not understand the purpose (or lack theirof) of the variable ichunk