Improve performance of a code with very large % of Garbage Collection

@odow, one question. Is there any way to wrap your excellent solution in another parallel loop? I am trying to use the MWE below, but I do not think this is a proper approach for a nested parallelized loop.

# Set up worker processes and load the packages needed on every process.
using Distributed, BenchmarkTools, SharedArrays

addprocs(8)


# Code inside `@everywhere` runs on all workers as well as the master process,
# so the modelling and solver packages are available to `pmap`/`@distributed`.
@everywhere begin
    using JuMP, MosekTools, NLopt
    import Distributions
    import Random
end

# Immutable problem data shared by all subproblems.
#
# Fields:
# - N, I, J, S: number of outer iterations, plants, markets, and scenarios.
# - σ: elasticity parameter; α: per-unit capacity cost.
# - z: productivity draws, indexed z[i, s, n] (I × S × N).
# - β: market weights (length J).
# - τ: transport costs, indexed τ[j, i] (J × I).
#
# `z` is declared with its full dimensionality `Array{Float64,3}`:
# the bare `Array{Float64}` is the abstract `Array{Float64,N} where N`,
# which boxes the field and prevents specialization in the hot loops.
@everywhere struct Data
    N::Int
    I::Int
    J::Int
    S::Int
    σ::Int
    α::Float64
    z::Array{Float64,3}
    β::Vector{Float64}
    τ::Matrix{Float64}
end

# Solve one conic subproblem for scenario `s` and outer index `n`, given
# capacities `Cap`. Builds a Mosek model and returns a tuple of the optimal
# objective value and the vector of optimal ν values.
@everywhere function f2(data::Data, Cap::Vector{Float64}, s::Int, n::Int)
    # NOTE(review): "QUIET" => false asks Mosek to print its log, but
    # `set_silent` below suppresses all solver output anyway — confirm which
    # of the two is intended.
    opt = JuMP.Model(optimizer_with_attributes(Mosek.Optimizer, "QUIET" => false, "INTPNT_CO_TOL_DFEAS" => 1e-7))
    set_silent(opt)
    # Constant coefficient of the power-cone terms, derived from σ.
    σ′ = ((data.σ - 1) / data.σ)^(data.σ - 1) / data.σ
    @variable(opt, ν[1:data.I] >= 0)
    @variable(opt, μ[1:data.J] >= 0)
    @variable(opt, t[1:data.J] >= 0)
    @objective(
        opt,
        Min,
        sum(Cap[i] * ν[i] for i in 1:data.I) +
        sum(σ′ * data.β[j]^data.σ * t[j] for j in 1:data.J),
    )
    # Linear constraints coupling ν and μ through the transport costs τ and
    # the scenario-specific productivity draws z[:, s, n].
    @constraint(
        opt,
        [i = 1:data.I, j = 1:data.J],
        data.τ[j, i] * ν[i] - μ[j] >= -data.τ[j, i] / data.z[i, s, n],
    )
    # Power-cone constraint per market: [t, μ, 1] ∈ PowerCone(1/σ), i.e.
    # t^(1/σ) · μ^((σ-1)/σ) ≥ 1 (per the MOI.PowerCone definition).
    @constraint(
        opt,
        [j = 1:data.J],
        [t[j], μ[j], 1.0] in MOI.PowerCone(1 / data.σ)
    )
    JuMP.optimize!(opt)
    return objective_value(opt), value.(ν)
end

# Build a reproducible `Data` instance.
#
# Keywords: N (outer iterations), I (plants), J (markets), S (scenarios).
# Returns a `Data` with log-normal productivity draws `z`, unit weights `β`,
# and a distance-based transport-cost matrix `τ`.
function generate_data(;N=3, I = 250, J = 50, S = 100)
    σ = 5
    α = 0.15
    Random.seed!(12345)
    z = exp.(rand(Distributions.Normal(-2.5 / 2, sqrt(2.5)), I, S, N)) .+ rand(I)
    #z = ones(I,S)
    β = ones(J)
    # Transport costs: 1 + Euclidean distance between markets (spaced on a
    # line at 1..J) and plants (spaced proportionally on the same line).
    function distmat(J, I)
        coordinates_market = hcat(1:J, zeros(J))
        coordinates_plant = hcat((1:I) .* J / I, zeros(I))
        return [
            1 + sqrt(
                (coordinates_market[j, 1] - coordinates_plant[l, 1])^2 +
                (coordinates_market[j, 2] - coordinates_plant[l, 2])^2
            )
            for j = 1:J, l = 1:I
        ]
    end
    τ = distmat(J, I)
    # Pass the locals σ and α through — the original hard-coded the literals
    # `5` and `0.15` here, which would silently diverge if σ or α changed.
    return Data(N, I, J, S, σ, α, z, β, τ)
end

# Serial version: average the subproblem objective and ν over all S
# scenarios, then subtract the linear capacity cost and its gradient α.
function f1(data::Data, x::Vector{Float64}, n::Int)
    total_obj = 0.0
    total_ν = zeros(data.I)
    for scenario in 1:data.S
        obj, ν = f2(data, x, scenario, n)
        total_obj += obj
        total_ν .+= ν
    end
    return total_obj / data.S - data.α * sum(x), total_ν ./ data.S .- data.α
end

# Parallel version of `f1`: solve the S scenario subproblems with `pmap`.
# A `CachingPool` ships the captured `data` to each worker only once.
@everywhere function parallel_f1(data::Data, x::Vector{Float64}, n::Int)
    pool = CachingPool(workers())
    # Rebind locally so the closure captures a fresh, never-reassigned
    # variable (avoids boxing of the capture).
    local_data = data
    results = pmap(s -> f2(local_data, x, s, n), pool, 1:data.S)
    avg_obj = sum(first, results) / data.S - data.α * sum(x)
    avg_ν = sum(r -> r[2], results) ./ data.S .- data.α
    return avg_obj, avg_ν
end

# NOTE(review): `data` is a non-const global defined only on the master
# process, yet `tempfunc` is defined `@everywhere` and reads this global —
# it will be undefined on workers inside `parallel_big_loop`. TODO confirm;
# consider defining `data` with `@everywhere` or passing it explicitly.
data = generate_data(N = 3, I = 25, J = 5, S = 10)


# NLopt-style objective callback for outer index `n`: returns the negated
# objective and fills `grad` with the negated gradient when requested.
#
# The optional `d` argument (defaulting to the global `data`) lets callers
# pass the problem data explicitly instead of relying on an untyped,
# non-const global — the original always read the global, which is both
# type-unstable and undefined on worker processes.
@everywhere function tempfunc(x::Vector, grad::Vector, n::Int, d::Data = data)
    obj, ∇obj = parallel_f1(d, x, n)
    if length(grad) > 0
        # Fused in-place update; `grad[:] = -∇obj` would allocate a temporary.
        grad .= .-∇obj
    end
    return -obj
end

# Maximize the expected profit for outer index `n` (minimizing the negated
# objective) with NLopt's MMA algorithm, starting from `guess`.
# Returns the optimal capacity vector.
@everywhere function opt_n(data::Data, guess::Vector{Float64}, n::Int)
    problem = Opt(:LD_MMA, data.I)
    problem.lower_bounds = zeros(data.I)
    problem.xtol_rel = 1e-3
    problem.ftol_rel = 1e-8
    problem.min_objective = (x, grad) -> tempfunc(x, grad, n)

    minf, minx, ret = NLopt.optimize(problem, guess)
    # `problem.numevals` holds the number of objective evaluations if needed:
    #println("got $minf at $minx after $(problem.numevals) iterations (returned $ret)")
    return minx
end


# Serial outer loop: solve each of the N subproblems and collect the
# solutions as columns of an I × N matrix.
function big_loop(data)
    solution = Matrix{Float64}(undef, data.I, data.N)
    for n in 1:data.N
        solution[:, n] .= opt_n(data, ones(data.I), n)
    end
    return solution
end

# Distributed outer loop: each worker solves one n and writes its column
# into a `SharedArray` visible to all processes.
function parallel_big_loop(data)
    solution = SharedArray{Float64}(data.I, data.N)
    @sync @distributed for n in 1:data.N
        solution[:, n] = opt_n(data, ones(data.I), n)
    end
    return solution
end

# Benchmark one subproblem, then the serial and parallel outer loops.
# Globals are interpolated with `$` as required by BenchmarkTools so the
# timings do not include the overhead of accessing untyped globals.
sol = @btime opt_n($data, ones($data.I), 1)

TEST1 = @btime big_loop($data)


TEST2 = @btime parallel_big_loop($data)

# Compare with `≈` rather than `==`: both loops solve the same problems,
# but the results are floating point and need not match bit-for-bit.
TEST1 ≈ TEST2

I had already asked a related question in this post, and I got a solution there in terms of FLoops.jl's @floop, but of course it did not have the pmap on the inner loop that is producing such great results here.