Memory issue that is solved by manually calling GC.gc()

Hi, so my problem is pretty much the title. Currently my code runs and does not fail, but I would quite like to get to the bottom of this, as I find it somewhat concerning. Sadly I cannot provide an MWE for this, as it relies on quite a large codebase at this point; however, I include at the end the main bit of the code, which should give an illustration of roughly what is happening.

I am doing a parameter scan where, for each combination of parameters, I generate 100 random systems from a generator, and for each of them I run a DifferentialEquations.jl solver and do some further analysis. It is threaded using OhMyThreads.jl over the 100 random runs. I ran this on a cluster and had it fail ~5 times while I was trying to find out what was happening; notably, each run was slightly different as I was debugging, but all of them failed on a similar timescale with the following error:

terminate called after throwing an instance of 'std::bad_alloc'
  what():  std::bad_alloc

[84958] signal (6.-6): Aborted
in expression starting at none:1
gsignal at /lib64/libc.so.6 (unknown line)
abort at /lib64/libc.so.6 (unknown line)
unknown function (ip: 0x7f7d8ef909ba)
unknown function (ip: 0x7f7d8efa0369)
_ZSt9terminatev at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libstdc++.so.6 (unknown line)
__cxa_throw at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libstdc++.so.6 (unknown line)
unknown function (ip: 0x7f7d8ef905c5)
unknown function (ip: 0x7f7d897e98ee)
_ZN4llvm17PMTopLevelManager11setLastUserENS_8ArrayRefIPNS_4PassEEES3_ at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm13PMDataManager3addEPNS_4PassEb at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm17PMTopLevelManager12schedulePassEPNS_4PassE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm16TargetPassConfig7addPassEPNS_4PassE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
unknown function (ip: 0x7f7d8c01f38c)
_ZN4llvm16TargetPassConfig16addMachinePassesEv at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm17LLVMTargetMachine17addPassesToEmitMCERNS_6legacy15PassManagerBaseERPNS_9MCContextERNS_17raw_pwrite_streamEb at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm3orc14SimpleCompilerclERNS_6ModuleE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
unknown function (ip: 0x7f7d8dea5ace)
_ZN4llvm3orc14IRCompileLayer4emitESt10unique_ptrINS0_29MaterializationResponsibilityESt14default_deleteIS3_EENS0_16ThreadSafeModuleE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm3orc16IRTransformLayer4emitESt10unique_ptrINS0_29MaterializationResponsibilityESt14default_deleteIS3_EENS0_16ThreadSafeModuleE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
unknown function (ip: 0x7f7d8dea7dec)
_ZN4llvm3orc31BasicIRLayerMaterializationUnit11materializeESt10unique_ptrINS0_29MaterializationResponsibilityESt14default_deleteIS3_EE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm3orc19MaterializationTask3runEv at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
unknown function (ip: 0x7f7d8b685d1c)
_ZN4llvm3orc16ExecutionSession22dispatchOutstandingMUsEv at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm3orc16ExecutionSession17OL_completeLookupESt10unique_ptrINS0_21InProgressLookupStateESt14default_deleteIS3_EESt10shared_ptrINS0_23AsynchronousSymbolQueryEESt8functionIFvRKNS_8DenseMapIPNS0_8JITDylibENS_8DenseSetINS0_15SymbolStringPtrENS_12DenseMapInfoISF_vEEEENSG_ISD_vEENS_6detail12DenseMapPairISD_SI_EEEEEE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
unknown function (ip: 0x7f7d8b6a5a0c)
_ZN4llvm3orc16ExecutionSession19OL_applyQueryPhase1ESt10unique_ptrINS0_21InProgressLookupStateESt14default_deleteIS3_EENS_5ErrorE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm3orc16ExecutionSession6lookupENS0_10LookupKindERKSt6vectorISt4pairIPNS0_8JITDylibENS0_19JITDylibLookupFlagsEESaIS8_EENS0_15SymbolLookupSetENS0_11SymbolStateENS_15unique_functionIFvNS_8ExpectedINS_8DenseMapINS0_15SymbolStringPtrENS_18JITEvaluatedSymbolENS_12DenseMapInfoISI_vEENS_6detail12DenseMapPairISI_SJ_EEEEEEEEESt8functionIFvRKNSH_IS6_NS_8DenseSetISI_SL_EENSK_IS6_vEENSN_IS6_SV_EEEEEE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm3orc16ExecutionSession6lookupERKSt6vectorISt4pairIPNS0_8JITDylibENS0_19JITDylibLookupFlagsEESaIS7_EENS0_15SymbolLookupSetENS0_10LookupKindENS0_11SymbolStateESt8functionIFvRKNS_8DenseMapIS5_NS_8DenseSetINS0_15SymbolStringPtrENS_12DenseMapInfoISI_vEEEENSJ_IS5_vEENS_6detail12DenseMapPairIS5_SL_EEEEEE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
unknown function (ip: 0x7f7d8deade8e)
unknown function (ip: 0x7f7d8deb29fd)
unknown function (ip: 0x7f7d8deb3c7e)
jl_generate_fptr_impl at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libjulia-codegen.so.1.10 (unknown line)
jl_compile_method_internal at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libjulia-internal.so.1.10 (unknown line)
ijl_apply_generic at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libjulia-internal.so.1.10 (unknown line)
wait at ./task.jl:359 [inlined]
fetch at ./task.jl:379 [inlined]
fetch at /home/xucapjko/.julia/packages/StableTasks/YtV0L/src/internals.jl:23 [inlined]
macro expansion at ./reduce.jl:265 [inlined]
macro expansion at ./simdloop.jl:77 [inlined]
mapreduce_impl at ./reduce.jl:263
mapreduce_impl at ./reduce.jl:277 [inlined]
_mapreduce at ./reduce.jl:447
ijl_apply_generic at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libjulia-internal.so.1.10 (unknown line)
_mapreduce_dim at ./reducedim.jl:367
#mapreduce#821 at ./reducedim.jl:359 [inlined]
mapreduce at ./reducedim.jl:359
ijl_apply_generic at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libjulia-internal.so.1.10 (unknown line)
_tmapreduce at /home/xucapjko/.julia/packages/OhMyThreads/P7uYA/src/implementation.jl:131
#tmapreduce#22 at /home/xucapjko/.julia/packages/OhMyThreads/P7uYA/src/implementation.jl:100 [inlined]
tmapreduce at /home/xucapjko/.julia/packages/OhMyThreads/P7uYA/src/implementation.jl:84 [inlined]
#tforeach#102 at /home/xucapjko/.julia/packages/OhMyThreads/P7uYA/src/implementation.jl:391
terminate called recursively
/var/spool/slurmd/job33316/slurm_script: line 7: 84958 Aborted

Given the random nature of the problem, yet the ~constant time at which the error happens, plus the bad_alloc, I started suspecting a memory issue. I increased the memory limit on the job to 200GB but the run still failed at about the same time - I find this very weird. In addition, looking at the past job with sacct seems to say the maximum memory usage was only ~20GB (though note I am not particularly proficient with slurm). However, what did fix the issue was adding a GC.gc() call to the parameter scan loop!! This hints that it is in fact a memory issue, but I really don’t understand how, given that increasing the memory changed nothing. I have come across some threads online mentioning past memory leak issues and wonder if this could be similar, either directly on the part of Julia or of a package I use.

Here’s a sketch of the main function body, though it relies on other bits of code elsewhere:

"""
    main_run3_N10_testing()

Entry point invoked by the slurm script.

Pins BLAS to a single thread, runs the `N = 10` scan through `run3_testing`
(timed with `@time`), stores the outcome in `./run3_N10_testing.jld2` and
returns it.
"""
function main_run3_N10_testing()
    BLAS.set_num_threads(1)

    scan_results = @time begin
        # Value grids for the scanned parameters, forwarded as keyword arguments
        grids = (;
            m=2 .^ range(-4, 2, 5),
            c=2 .^ range(-2, 6, 5),
            l=range(0.0, 1.0, 4)[1:end],
            si=range(0.0, 1.0, 5)[1:end],
            sr=range(0.0, 1.0, 5)[1:end],
            sb=range(0.0, 1.0, 5)[1:end],
        )
        run3_testing(10, 100, 100.0, 1000; grids...)
    end

    save_object("./run3_N10_testing.jld2", scan_results)
    return scan_results
end

"""
    run3_testing(N, num_repeats=100, kmax=100, Nks=1000; m, c, l, si, sr, sb)

Scan over the value grids `m`, `c`, `l`, `si`, `sr` and `sb` for a single `N`.

The total influx energy rate is kept at 1.0 for each strain (hopefully valid
through dimensional reduction) regardless of the remaining parameters. For
every grid point a generator is built with `RSGJans1`, `do_rg_run2` is run on
it with `num_repeats`, `kmax` and `Nks`, and the returned result codes are
tallied with `countmap`. The per-point tallies are gathered by `scan_func`,
whose return value is returned unchanged.
"""
function run3_testing(N, num_repeats=100, kmax=100, Nks=1000;
    m=[1.0],
    c=[1.0],
    l=[0.5],
    si=[1.0],
    sr=[0.5],
    sb=[1.0],
)
    # Evaluates a single grid point of the scan
    function evaluate_point(lm, lc, ll, lsi, lsr, lsb)
        ratio = sigma_to_mu_ratio1()

        influx_total = 1.0 * N # setting E (or E/V)
        influx_mean = influx_total / (lsi * N)

        # Explicit collection before each grid point; without it the cluster
        # runs were aborting with std::bad_alloc
        GC.gc()

        generator = RSGJans1(N, N;
            m=(lm, lm * ratio),
            r=1.0, # setting T
            sparsity_influx=lsi,
            K=(influx_mean, influx_mean * ratio),
            sparsity_resources=lsr,
            sparsity_byproducts=lsb,
            c=(lc, lc * ratio),
            l=(ll, ll * ratio),
            # setting L plus assuming the specific values don't matter as long as Ds << Dr
            Ds=1e-8, Dr=1.0,
        )

        solver_tol = 1000 * eps()
        # pass debug_save_problem="debug_sp/" here to dump every problem to disk
        codes = do_rg_run2(generator, num_repeats, kmax, Nks;
            extinctthr=1e-8,
            maxresidthr=1e-8,
            abstol=solver_tol,
            reltol=solver_tol,
            timelimit=10 * 60.0,
        )
        return countmap(codes)
    end

    return scan_func(evaluate_point, Dict{Int,Int}; m, c, l, si, sr, sb,
        progress=true,
        async_progress=60, # async progress report once every minute
    )
end

"""
    scan_func(func, result_type; progress=true, async_progress=nothing, kwargs...)

Evaluate `func` on every combination of the parameter values passed in `kwargs`.

Each keyword in `kwargs` holds the collection of values to scan for one
parameter. `func` is called with one value per keyword, as positional
arguments in the order the keywords were given, and its return value is stored
in an `Array{result_type}` with one axis per keyword.

# Keywords
- `progress`: if `true`, print a line after each finished run.
- `async_progress`: if not `nothing`, the period in seconds (should be
  positive) of a background report naming the run currently being worked on.

# Returns
A `DimArray` wrapping the results, with dimensions built from `kwargs`.

The background reporter is stopped in a `finally` clause, so it neither
outlives an exception thrown by `func` nor delays the return of this function.
"""
function scan_func(func, result_type;
    progress=true,
    async_progress=nothing,
    kwargs...
)
    params_prod = product(values(values(kwargs))...)
    params_size = size(params_prod)
    params_cis = CartesianIndices(params_size)
    num_runs = length(params_cis)

    # 1-based index of the run in flight. Atomic because it is read by the
    # reporter task while the scan loop updates it; a plain reassigned local
    # captured by the closure would also be boxed.
    current_run = Threads.Atomic{Int}(1)

    # Periodic reporter: fires once straight away, then every `async_progress` seconds
    reporter = if isnothing(async_progress)
        nothing
    else
        Timer(0; interval=async_progress) do _timer
            @printf "Working on run %d out of %d\n" current_run[] num_runs
            flush(stdout)
        end
    end

    results = Array{result_type}(undef, params_size)
    try
        for (params, ci) in zip(params_prod, params_cis)
            results[ci] = func(params...)

            if progress
                @printf "Just finished run %d out of %d\n" current_run[] num_runs
                flush(stdout)
            end
            Threads.atomic_add!(current_run, 1)
        end
    finally
        # Closing the timer is immediate and also runs when `func` throws
        isnothing(reporter) || close(reporter)
    end

    return DimArray(results, (; kwargs...))
end

# do_rg_run2(rg, num_repeats, kmax, Nks; kwargs...)
#
# Draws `num_repeats` random systems by calling the generator `rg()`, solves each
# one for its steady state with `solve(ssp, DynamicSS(TRBDF2()); ...)`, runs a
# linear stability analysis on the steady state and classifies the outcome as
# an integer result code. The repeats are run through `@tasks`.
#
# Positional arguments
#   rg          - callable; `rg()` returns the parameters of one random system
#   num_repeats - number of random systems to draw and classify
#   kmax, Nks   - the wavenumbers examined are `LinRange(0.0, kmax, Nks)[2:end]`;
#                 k = 0 is treated separately
#
# Result codes (as assigned below)
#   -1000 - Int(retcode) : the solver did not return a successful retcode
#   101 : every one of the first Ns entries of the steady state is below
#         `extinctthr` in absolute value
#   With `k0mrl` the largest real part of the eigenvalues at k = 0 and `maxmrl`
#   the largest one over all k > 0:
#   1  : k0mrl < -lszerothr             and maxmrl <  -lspeakthr
#   2  : k0mrl < -lszerothr             and maxmrl >= -lspeakthr
#   11 : -lszerothr <= k0mrl < lszerothr and maxmrl <  -lspeakthr
#   12 : same k0 band, maxmrl >= -lspeakthr, and some value up to the peak is
#        below -lszerothr (the peak is separated from k = 0)
#   13 : same k0 band, maxmrl >= -lspeakthr, no such separation
#   21 : k0mrl >= lszerothr              and maxmrl <  +lspeakthr
#   22 : k0mrl >= lszerothr, maxmrl >= lspeakthr, separated peak
#   23 : k0mrl >= lszerothr, maxmrl >= lspeakthr, no separation
#   A code is multiplied by -1 when the steady state residual exceeded
#   `maxresidthr`. Solver-failure codes never get this sign flip because they
#   jump to the result handling before the residual is checked.
#
# Returns
#   `rslts` (vector of codes, one per repeat) when `return_int` is `nothing`;
#   otherwise `(rslts, int_systems_to_return)` or, when `return_int_sss` is
#   true, `(rslts, int_systems_to_return, int_systems_sss)`.
#
# Throws
#   ArgumentError when `return_int` is none of: `nothing`, a Vector/Tuple of
#   codes, a Function, or `:all`.
function do_rg_run2(rg, num_repeats, kmax, Nks;
    maxresidthr=1e-7,            # will warn if ss residues are larger than this
    extinctthr=maxresidthr / 10, # species below this value are considered extinct
    lszerothr=1000 * eps(),      # values +- this are considered 0 in linstab analysis
    lspeakthr=lszerothr,         # threshold applied to the largest mrl over k > 0
    # whether and which params to return for further examination (int <-> interesting)
    return_int=nothing,
    return_int_sss=true,         # also return the steady states of the interesting systems
    debug_save_problem=nothing,  # if set, a path prefix; each problem is saved to prefix * i * ".jld2"
    # ss solver target tolerances and maxiters
    tol=maxresidthr / 10,
    timelimit=nothing, # time limit for one solver run in seconds
    abstol=tol,
    reltol=tol,
    maxiters=100000,
)
    # One sample is drawn up front only to learn the system sizes and the
    # parameter type; it is not analysed itself
    sample_params = rg()
    Ns, Nr = get_Ns(sample_params)
    N = Ns + Nr

    # handle interesting systems setup
    # int_func maps a result code to true when the system should be returned
    int_func = if isnothing(return_int)
        nothing
    elseif isa(return_int, Vector) || isa(return_int, Tuple)
        c -> c in return_int
    elseif isa(return_int, Function)
        return_int
    elseif return_int == :all
        c -> true
    else
        # NOTE(review): the message names `return_interesting` while the keyword
        # is `return_int`, and it does not mention the accepted `:all` value
        throw(ArgumentError("return_interesting needs to be either a list of codes or a custom function"))
    end

    solver_kwargs = (; maxiters, abstol, reltol)
    if !isnothing(timelimit)
        # make_timer_callback is defined elsewhere; presumably it stops the
        # solver once `timelimit` seconds have passed - TODO confirm
        solver_kwargs = (; solver_kwargs..., callback=make_timer_callback(timelimit))
    end

    # setup ks for linstab analysis
    ks = LinRange(0.0, kmax, Nks)[2:end] # 0 is handled separately

    # setup the returned data containers
    rslts = fill(0, num_repeats)

    # these may not be used, skipping the if to not have them boxed
    int_lock = ReentrantLock()
    int_systems_to_return = typeof(sample_params)[]
    int_systems_sss = Vector{Float64}[]

    debug_save_lock = ReentrantLock()

    # the core of the function
    # solver_kwargs is reassigned above; @localize presumably rebinds it locally
    # so the task closure does not capture a boxed variable - TODO confirm
    @localize solver_kwargs @tasks for i in 1:num_repeats
        # Prealloc variables in each thread (task)
        @local begin
            M1 = Matrix{Float64}(undef, N, N)
            M = Matrix{Float64}(undef, N, N)
            mrls = Vector{Float64}(undef, length(ks))
        end

        # Setup one random system
        params = rg()
        u0 = ModifiedMiCRM.make_u0_onlyN(params)
        ssp = make_mmicrm_ss_problem(params, u0)

        # Optionally dump the problem to disk; saving is serialised through a
        # lock and a failed save is only logged
        if !isnothing(debug_save_problem)
            lock(debug_save_lock) do
                fname = debug_save_problem * string(i) * ".jld2"
                try
                    save_object(fname, ssp)
                catch
                    @error (@sprintf "Failed to save to file %s" fname)
                end
            end
        end

        result = 0
        warning = false

        ########################################
        # Everything up to the handle_result label only decides `result` (and
        # `warning`); each branch jumps to the label once the code is known

        # numerically solve for the steady state
        ssps = solve(ssp, DynamicSS(TRBDF2()); solver_kwargs...)

        # Check the solver
        if !SciMLBase.successful_retcode(ssps.retcode)
            result = -1000 # solver failed return code
            result -= Int(ssps.original.retcode) # encode which retcode it was
            if ssps.original.retcode == ReturnCode.MaxTime
                @warn "Solver quit due to time limit being reached"
                flush(stderr)
            end
            @goto handle_result
        end
        # Check that the steady state is steady enough
        maxresid = maximum(abs, ssps.resid)
        if maxresid > maxresidthr
            @warn (@sprintf "maxresid reached is %g which is close to %g" maxresid maxresidthr)
            warning = true # the final code will be negated
        end

        # Check for a full extinction
        if all(x -> abs(x) < extinctthr, ssps.u[1:Ns])
            result = 101 # gone extinct in nospace ss
            @goto handle_result
        end

        # Do linear stability
        # handle the k=0 case
        make_M1!(M1, params, ssps.u)
        k0mrl = maximum(real, eigvals!(M1))

        # calculate mrls
        # M1 is rebuilt because eigvals! was just called on it in place
        make_M1!(M1, params, ssps.u)
        for (ki, k) in enumerate(ks)
            M .= M1
            M1_to_M!(M, get_Ds(params), k)
            evals = eigvals!(M)
            mrls[ki] = maximum(real, evals)
        end

        # evaluate the mrl results
        maxmrl, maxi = findmax(mrls)

        if k0mrl < -lszerothr # this is the ideal case
            if maxmrl < -lspeakthr
                result = 1
                @goto handle_result
            else
                result = 2
                @goto handle_result
            end
        elseif k0mrl < lszerothr # this can happen when there are interchangeable species, or when close to numerical issues
            if maxmrl < -lspeakthr
                result = 11
                @goto handle_result
            else
                # the peak counts as separated from k=0 if the mrl drops below
                # -lszerothr anywhere up to (and including) the peak position
                is_separated = false
                for intermediate_mrl in mrls[1:maxi]
                    if intermediate_mrl < -lszerothr
                        is_separated = true
                        break
                    end
                end
                if is_separated # k0 is sketchy but we have a separated positive peak
                    result = 12
                    @goto handle_result
                else # the largest peak is connected to a positive k0 - clearly messy
                    result = 13
                    @goto handle_result
                end
            end
        else # something is definitely off here, however still do the same analysis for extra info
            # NOTE(review): this compares against +lspeakthr while the two
            # branches above use -lspeakthr - confirm this is intended
            if maxmrl < lspeakthr
                result = 21
                @goto handle_result
            else
                is_separated = false
                for intermediate_mrl in mrls[1:maxi]
                    if intermediate_mrl < -lszerothr
                        is_separated = true
                        break
                    end
                end
                if is_separated
                    result = 22
                    @goto handle_result
                else
                    result = 23
                    @goto handle_result
                end
            end
        end

        ########################################

        @label handle_result
        # a residual warning is encoded by flipping the sign of the code
        if warning
            result *= -1
        end
        rslts[i] = result
        # collect the interesting systems; the shared vectors are guarded by a lock
        if !isnothing(int_func) && int_func(result)
            lock(int_lock) do
                push!(int_systems_to_return, params)
                if return_int_sss
                    push!(int_systems_sss, ssps.u)
                end
            end
        end
    end

    # the shape of the return value depends on return_int / return_int_sss
    if isnothing(int_func)
        rslts
    else
        if !return_int_sss
            rslts, int_systems_to_return
        else
            rslts, int_systems_to_return, int_systems_sss
        end
    end
end

"""
    sigma_to_mu_ratio1()

Return the constant `(2 / 3) / 2.355`, the ratio by which a mean is multiplied
to obtain the accompanying spread wherever a `(value, value * ratio)` pair is
built.
"""
sigma_to_mu_ratio1() = 2 / 3 / 2.355

Finally, I use the LTS Julia version 1.10.9 with all packages updated, but I can’t list them here as it goes over the post character limit and I can’t attach a basic text file.

Any tips on what this could be?