Hi, my problem is pretty much the title. Currently my code runs and does not fail, but I would quite like to get to the bottom of this as I find it somewhat concerning. Sadly I cannot provide an MWE for this as it relies on quite a large codebase at this point; however, I include at the end the main part of the code, which should give a rough illustration of what is happening.
I am doing a parameter scan where, for each combination of parameters, I generate 100 random systems from a generator, and for each system I run a DifferentialEquations.jl solver and do some further analysis. It is threaded using OhMyThreads.jl over the 100 random runs. I ran this on a cluster and had it fail ~5 times while I was trying to find out what was happening; each run was slightly different as I was debugging, but all failed on a similar timescale with the following error:
terminate called after throwing an instance of 'std::bad_alloc'
what(): std::bad_alloc
[84958] signal (6.-6): Aborted
in expression starting at none:1
gsignal at /lib64/libc.so.6 (unknown line)
abort at /lib64/libc.so.6 (unknown line)
unknown function (ip: 0x7f7d8ef909ba)
unknown function (ip: 0x7f7d8efa0369)
_ZSt9terminatev at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libstdc++.so.6 (unknown line)
__cxa_throw at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libstdc++.so.6 (unknown line)
unknown function (ip: 0x7f7d8ef905c5)
unknown function (ip: 0x7f7d897e98ee)
_ZN4llvm17PMTopLevelManager11setLastUserENS_8ArrayRefIPNS_4PassEEES3_ at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm13PMDataManager3addEPNS_4PassEb at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm17PMTopLevelManager12schedulePassEPNS_4PassE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm16TargetPassConfig7addPassEPNS_4PassE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
unknown function (ip: 0x7f7d8c01f38c)
_ZN4llvm16TargetPassConfig16addMachinePassesEv at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm17LLVMTargetMachine17addPassesToEmitMCERNS_6legacy15PassManagerBaseERPNS_9MCContextERNS_17raw_pwrite_streamEb at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm3orc14SimpleCompilerclERNS_6ModuleE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
unknown function (ip: 0x7f7d8dea5ace)
_ZN4llvm3orc14IRCompileLayer4emitESt10unique_ptrINS0_29MaterializationResponsibilityESt14default_deleteIS3_EENS0_16ThreadSafeModuleE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm3orc16IRTransformLayer4emitESt10unique_ptrINS0_29MaterializationResponsibilityESt14default_deleteIS3_EENS0_16ThreadSafeModuleE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
unknown function (ip: 0x7f7d8dea7dec)
_ZN4llvm3orc31BasicIRLayerMaterializationUnit11materializeESt10unique_ptrINS0_29MaterializationResponsibilityESt14default_deleteIS3_EE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm3orc19MaterializationTask3runEv at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
unknown function (ip: 0x7f7d8b685d1c)
_ZN4llvm3orc16ExecutionSession22dispatchOutstandingMUsEv at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm3orc16ExecutionSession17OL_completeLookupESt10unique_ptrINS0_21InProgressLookupStateESt14default_deleteIS3_EESt10shared_ptrINS0_23AsynchronousSymbolQueryEESt8functionIFvRKNS_8DenseMapIPNS0_8JITDylibENS_8DenseSetINS0_15SymbolStringPtrENS_12DenseMapInfoISF_vEEEENSG_ISD_vEENS_6detail12DenseMapPairISD_SI_EEEEEE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
unknown function (ip: 0x7f7d8b6a5a0c)
_ZN4llvm3orc16ExecutionSession19OL_applyQueryPhase1ESt10unique_ptrINS0_21InProgressLookupStateESt14default_deleteIS3_EENS_5ErrorE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm3orc16ExecutionSession6lookupENS0_10LookupKindERKSt6vectorISt4pairIPNS0_8JITDylibENS0_19JITDylibLookupFlagsEESaIS8_EENS0_15SymbolLookupSetENS0_11SymbolStateENS_15unique_functionIFvNS_8ExpectedINS_8DenseMapINS0_15SymbolStringPtrENS_18JITEvaluatedSymbolENS_12DenseMapInfoISI_vEENS_6detail12DenseMapPairISI_SJ_EEEEEEEEESt8functionIFvRKNSH_IS6_NS_8DenseSetISI_SL_EENSK_IS6_vEENSN_IS6_SV_EEEEEE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
_ZN4llvm3orc16ExecutionSession6lookupERKSt6vectorISt4pairIPNS0_8JITDylibENS0_19JITDylibLookupFlagsEESaIS7_EENS0_15SymbolLookupSetENS0_10LookupKindENS0_11SymbolStateESt8functionIFvRKNS_8DenseMapIS5_NS_8DenseSetINS0_15SymbolStringPtrENS_12DenseMapInfoISI_vEEEENSJ_IS5_vEENS_6detail12DenseMapPairIS5_SL_EEEEEE at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libLLVM-15jl.so (unknown line)
unknown function (ip: 0x7f7d8deade8e)
unknown function (ip: 0x7f7d8deb29fd)
unknown function (ip: 0x7f7d8deb3c7e)
jl_generate_fptr_impl at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libjulia-codegen.so.1.10 (unknown line)
jl_compile_method_internal at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libjulia-internal.so.1.10 (unknown line)
ijl_apply_generic at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libjulia-internal.so.1.10 (unknown line)
wait at ./task.jl:359 [inlined]
fetch at ./task.jl:379 [inlined]
fetch at /home/xucapjko/.julia/packages/StableTasks/YtV0L/src/internals.jl:23 [inlined]
macro expansion at ./reduce.jl:265 [inlined]
macro expansion at ./simdloop.jl:77 [inlined]
mapreduce_impl at ./reduce.jl:263
mapreduce_impl at ./reduce.jl:277 [inlined]
_mapreduce at ./reduce.jl:447
ijl_apply_generic at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libjulia-internal.so.1.10 (unknown line)
_mapreduce_dim at ./reducedim.jl:367
#mapreduce#821 at ./reducedim.jl:359 [inlined]
mapreduce at ./reducedim.jl:359
ijl_apply_generic at /home/xucapjko/.julia/juliaup/julia-1.10.9+0.x64.linux.gnu/bin/../lib/julia/libjulia-internal.so.1.10 (unknown line)
_tmapreduce at /home/xucapjko/.julia/packages/OhMyThreads/P7uYA/src/implementation.jl:131
#tmapreduce#22 at /home/xucapjko/.julia/packages/OhMyThreads/P7uYA/src/implementation.jl:100 [inlined]
tmapreduce at /home/xucapjko/.julia/packages/OhMyThreads/P7uYA/src/implementation.jl:84 [inlined]
#tforeach#102 at /home/xucapjko/.julia/packages/OhMyThreads/P7uYA/src/implementation.jl:391
terminate called recursively
/var/spool/slurmd/job33316/slurm_script: line 7: 84958 Aborted
Given the random nature of the problem, yet the ~constant time at which the error happens, plus the bad_alloc, I started suspecting a memory issue. I increased the memory limit on the job to 200GB but the run still failed at about the same time - I find this very weird. In addition, looking at the past job with sacct seems to say that the maximum memory usage was only ~20GB (though note I am not particularly proficient with slurm). However, what did fix the issue was adding a GC.gc() call to the parameter scan loop!! This hints that it is in fact a memory issue, but I really don’t understand how, given that increasing memory changed nothing. I have come across some threads online mentioning past memory leak issues and wonder if this could be similar, either directly on the part of Julia or in a package I use.
Here’s a sketch of the main function body, though it relies on other bits of code elsewhere:
"""
    main_run3_N10_testing()

Entry point called by the slurm script. Runs the `N = 10` scan through
`run3_testing`, writes the outcome to `./run3_N10_testing.jld2` and returns it.
"""
function main_run3_N10_testing()
    # One BLAS thread only; presumably so BLAS does not compete with the
    # task-level parallelism used further down - TODO confirm.
    BLAS.set_num_threads(1)

    # Positional arguments: N, repeats per parameter point, kmax, number of k samples.
    scan_results = @time run3_testing(10, 100, 100.0, 1000;
        m=2 .^ range(-4, 2, 5),
        c=2 .^ range(-2, 6, 5),
        l=range(0.0, 1.0, 4)[1:end],
        si=range(0.0, 1.0, 5)[1:end],
        sr=range(0.0, 1.0, 5)[1:end],
        sb=range(0.0, 1.0, 5)[1:end],
    )

    save_object("./run3_N10_testing.jld2", scan_results)
    return scan_results
end
"""
Does a scan for a single N while keeping the total influx energy rate at 1.0 for
each strain (hopefully valid through dimensional reduction) no matter the rest
of the params.
"""
function run3_testing(N, num_repeats=100, kmax=100, Nks=1000;
    m=[1.0],
    c=[1.0],
    l=[0.5],
    si=[1.0],
    sr=[0.5],
    sb=[1.0],
)
    # Evaluates a single point of the scan: builds the random system generator
    # for the given means/sparsities, classifies `num_repeats` sampled systems
    # with `do_rg_run2` and returns how often each result code occurred.
    function tally_point(m_mu, c_mu, l_mu, s_influx, s_resources, s_byproducts)
        spread = sigma_to_mu_ratio1()

        energy_influx = 1.0 * N # setting E (or E/V)
        # NOTE(review): `s_influx == 0` makes this Inf; confirm the generator
        # tolerates that.
        K_mu = energy_influx / (s_influx * N)
        K = (K_mu, K_mu * spread)

        # Workaround: forcing a collection at every scan point was observed to
        # avoid a std::bad_alloc abort during long scans.
        GC.gc()

        generator = RSGJans1(N, N;
            m=(m_mu, m_mu * spread),
            r=1.0, # setting T
            sparsity_influx=s_influx,
            K=K,
            sparsity_resources=s_resources,
            sparsity_byproducts=s_byproducts,
            c=(c_mu, c_mu * spread),
            l=(l_mu, l_mu * spread),
            # setting L; the specific values are assumed not to matter as long
            # as Ds << Dr
            Ds=1e-8, Dr=1.0,
        )

        codes = do_rg_run2(generator, num_repeats, kmax, Nks;
            extinctthr=1e-8,
            maxresidthr=1e-8,
            abstol=1000 * eps(),
            reltol=1000 * eps(),
            timelimit=10 * 60.0,
        )
        return countmap(codes)
    end

    return scan_func(tally_point, Dict{Int,Int}; m, c, l, si, sr, sb,
        progress=true,
        async_progress=60, # async progress report once every minute
    )
end
"""
    scan_func(func, result_type; progress=true, async_progress=nothing, kwargs...)

Call `func` once for every combination of the values given in `kwargs` and collect
the outcomes.

Each remaining keyword names one scan axis and holds the values along it. `func`
is called positionally with one value per axis, in the order the keywords were
given. The results are stored in an `Array{result_type}` with one dimension per
axis and returned wrapped in a `DimArray` whose dimensions are `kwargs`.

# Keywords
- `progress`: when `true`, print a line after each finished run.
- `async_progress`: when not `nothing`, a period in seconds at which a background
  timer prints the index of the run currently in progress.

The background timer is closed as soon as the scan ends, whether it completes
normally or `func` throws.
"""
function scan_func(func, result_type;
    progress=true,
    async_progress=nothing,
    kwargs...
)
    params_prod = product(values(values(kwargs))...)
    params_size = size(params_prod)
    params_cis = CartesianIndices(params_size)
    num_runs = length(params_cis)

    # Index of the run currently being worked on. Held in a `Ref` so the reporter
    # closure observes updates without capturing a reassigned local.
    current_run = Ref(1)

    # Periodic reporter: fires immediately and then every `async_progress` seconds
    # until closed. Closing a timer takes effect at once, so shutdown does not
    # have to wait for a pending sleep to elapse.
    reporter = if isnothing(async_progress)
        nothing
    else
        Timer(0; interval=async_progress) do _
            @printf "Working on run %d out of %d\n" current_run[] num_runs
            flush(stdout)
        end
    end

    results = Array{result_type}(undef, params_size)
    try
        for (params, ci) in zip(params_prod, params_cis)
            results[ci] = func(params...)
            if progress
                @printf "Just finished run %d out of %d\n" current_run[] num_runs
                flush(stdout)
            end
            current_run[] += 1
        end
    finally
        # Stop the reporter even when `func` throws, so it cannot keep printing.
        isnothing(reporter) || close(reporter)
    end

    return DimArray(results, (; kwargs...))
end
# Classify `num_repeats` random systems drawn from the generator `rg`.
#
# For every repeat a parameter set is sampled with `rg()`, a steady-state problem
# is built and solved with `DynamicSS(TRBDF2())`, and the steady state is then put
# through a linear stability analysis: the maximum real part of the eigenvalues
# ("mrl") is computed at k = 0 and at `Nks - 1` further values of k up to `kmax`.
# Each repeat yields one integer code, stored in the returned vector:
#   -1000 - Int(retcode)  the solver did not return a successful retcode
#   101                   the first Ns steady-state components are all below
#                         `extinctthr` in absolute value
#   1 / 2                 k0mrl < -lszerothr; the largest mrl over k > 0 is / is not
#                         below -lspeakthr
#   11 / 12 / 13          -lszerothr <= k0mrl < lszerothr; 11: largest mrl below
#                         -lspeakthr; 12: some mrl up to the peak index is below
#                         -lszerothr ("separated" peak); 13: otherwise
#   21 / 22 / 23          k0mrl >= lszerothr; 21: largest mrl below +lspeakthr;
#                         22 / 23: separated / not separated as above
# The sign of the code is flipped when the largest steady-state residual exceeded
# `maxresidthr`.
#
# Returns the vector of codes when `return_int` is `nothing`. Otherwise returns
# `(codes, params_of_interesting_systems)` and, when `return_int_sss` is true,
# additionally the corresponding steady states as a third element.
#
# Throws `ArgumentError` when `return_int` is not `nothing`, a `Vector`/`Tuple`,
# a `Function`, or `:all`.
function do_rg_run2(rg, num_repeats, kmax, Nks;
maxresidthr=1e-7, # will warn if ss residues are larger than this
extinctthr=maxresidthr / 10, # species below this value are considered extinct
lszerothr=1000 * eps(), # values +- this are considered 0 in linstab analysis
lspeakthr=lszerothr,
# whether and which params to return for further examination (int <-> interesting)
return_int=nothing,
return_int_sss=true,
# when not `nothing`: path prefix; each problem is saved to "<prefix><i>.jld2"
debug_save_problem=nothing,
# ss solver target tolerances and maxiters
tol=maxresidthr / 10,
timelimit=nothing, # time limit for one solver run in seconds
abstol=tol,
reltol=tol,
maxiters=100000,
)
# One sample is drawn up front only to learn the system sizes and the params type
sample_params = rg()
Ns, Nr = get_Ns(sample_params)
N = Ns + Nr
# handle interesting systems setup
# `int_func` maps a result code to `true` when that system should be returned
int_func = if isnothing(return_int)
nothing
elseif isa(return_int, Vector) || isa(return_int, Tuple)
c -> c in return_int
elseif isa(return_int, Function)
return_int
elseif return_int == :all
c -> true
else
# NOTE(review): the message says `return_interesting` but the keyword is `return_int`
throw(ArgumentError("return_interesting needs to be either a list of codes or a custom function"))
end
solver_kwargs = (; maxiters, abstol, reltol)
if !isnothing(timelimit)
# NOTE(review): a single callback object is created here and passed to every
# solve below; if `make_timer_callback` keeps per-solve state this would be
# shared between concurrently running tasks - confirm against its definition.
solver_kwargs = (; solver_kwargs..., callback=make_timer_callback(timelimit))
end
# setup ks for linstab analysis
ks = LinRange(0.0, kmax, Nks)[2:end] # 0 is handled separately
# setup the returned data containers
rslts = fill(0, num_repeats)
# these may not be used, skipping the if to not have them boxed
int_lock = ReentrantLock()
int_systems_to_return = typeof(sample_params)[]
int_systems_sss = Vector{Float64}[]
debug_save_lock = ReentrantLock()
# the core of the function
# `solver_kwargs` is reassigned above; `@localize` presumably rebinds it locally so
# the parallel loop body does not capture a boxed variable - TODO confirm.
@localize solver_kwargs @tasks for i in 1:num_repeats
# Prealloc variables in each thread (task)
@local begin
M1 = Matrix{Float64}(undef, N, N)
M = Matrix{Float64}(undef, N, N)
mrls = Vector{Float64}(undef, length(ks))
end
# Setup one random system
params = rg()
u0 = ModifiedMiCRM.make_u0_onlyN(params)
ssp = make_mmicrm_ss_problem(params, u0)
# Optional debugging dump of the problem; saving is serialized with a lock and a
# failed save is only logged.
if !isnothing(debug_save_problem)
lock(debug_save_lock) do
fname = debug_save_problem * string(i) * ".jld2"
try
save_object(fname, ssp)
catch
@error (@sprintf "Failed to save to file %s" fname)
end
end
end
# `result` is the code for this repeat; `warning` flips its sign at the end
result = 0
warning = false
########################################
# numerically solve for the steady state
ssps = solve(ssp, DynamicSS(TRBDF2()); solver_kwargs...)
# Check the solver
if !SciMLBase.successful_retcode(ssps.retcode)
result = -1000 # solver failed return code
result -= Int(ssps.original.retcode)
if ssps.original.retcode == ReturnCode.MaxTime
@warn "Solver quit due to time limit being reached"
flush(stderr)
end
@goto handle_result
end
# Check that the steady state is steady enough
maxresid = maximum(abs, ssps.resid)
if maxresid > maxresidthr
@warn (@sprintf "maxresid reached is %g which is close to %g" maxresid maxresidthr)
warning = true
end
# Check for a full extinction
if all(x -> abs(x) < extinctthr, ssps.u[1:Ns])
result = 101 # gone extinct in nospace ss
@goto handle_result
end
# Do linear stability
# handle the k=0 case
make_M1!(M1, params, ssps.u)
k0mrl = maximum(real, eigvals!(M1))
# calculate mrls
# M1 is rebuilt here because the in-place `eigvals!` above may have overwritten it
make_M1!(M1, params, ssps.u)
for (ki, k) in enumerate(ks)
M .= M1
M1_to_M!(M, get_Ds(params), k)
evals = eigvals!(M)
mrls[ki] = maximum(real, evals)
end
# evaluate the mrl results
maxmrl, maxi = findmax(mrls)
if k0mrl < -lszerothr # this is the ideal case
if maxmrl < -lspeakthr
result = 1
@goto handle_result
else
result = 2
@goto handle_result
end
elseif k0mrl < lszerothr # this can happen when there are interchangeable species, or when close to numerical issues
if maxmrl < -lspeakthr
result = 11
@goto handle_result
else
# "separated" means some mrl between the first k and the peak is clearly negative
is_separated = false
for intermediate_mrl in mrls[1:maxi]
if intermediate_mrl < -lszerothr
is_separated = true
break
end
end
if is_separated # k0 is sketchy but we have a separated positive peak
result = 12
@goto handle_result
else # the largest peak is connected to a positive k0 - clearly messy
result = 13
@goto handle_result
end
end
else # something is definitely off here, however still do the same analysis for extra info
# NOTE(review): this branch compares against +lspeakthr while the two branches
# above use -lspeakthr - confirm this is intended.
if maxmrl < lspeakthr
result = 21
@goto handle_result
else
is_separated = false
for intermediate_mrl in mrls[1:maxi]
if intermediate_mrl < -lszerothr
is_separated = true
break
end
end
if is_separated
result = 22
@goto handle_result
else
result = 23
@goto handle_result
end
end
end
########################################
# Common exit point of one repeat: every branch above jumps here with `result` set
@label handle_result
if warning
result *= -1
end
# each iteration writes only its own slot, so no lock is taken here
rslts[i] = result
# the shared "interesting systems" vectors are appended to under a lock
if !isnothing(int_func) && int_func(result)
lock(int_lock) do
push!(int_systems_to_return, params)
if return_int_sss
push!(int_systems_sss, ssps.u)
end
end
end
end
# The shape of the return value depends on `return_int` / `return_int_sss`
if isnothing(int_func)
rslts
else
if !return_int_sss
rslts, int_systems_to_return
else
rslts, int_systems_to_return, int_systems_sss
end
end
end
"""
    sigma_to_mu_ratio1()

Return the constant `(2 / 3) / 2.355`, used by callers as the ratio between the
spread and the mean of a sampled parameter.
"""
function sigma_to_mu_ratio1()
    # NOTE(review): 2.355 is presumably the FWHM-to-sigma factor 2*sqrt(2*log(2)),
    # i.e. a full width at half maximum of 2/3 of the mean - confirm.
    relative_width = 2 / 3
    return relative_width / 2.355
end
Finally, I use the LTS Julia version 1.10.9 with all packages updated, but I can’t list them here as that goes over the post character limit and I can’t attach a plain text file.
Any tips on what this could be?