I’m having trouble producing a minimal reproduction of this locally.
It happens for roughly 0.1–5% of the Slurm array tasks that execute initShared()
from the unregistered package pqADMM,
and it causes the Slurm error shown below for each affected task:
slurm array task error output
Worker 3 terminated.
ERROR: LoadError: ProcessExitedException(3)
Stacktrace:
[1] try_yieldto(undo::typeof(Base.ensure_rescheduled))
@ Base ./task.jl:705
[2] wait
@ ./task.jl:764 [inlined]
[3] wait(c::Base.GenericCondition{ReentrantLock})
@ Base ./condition.jl:106
[4] take_buffered(c::Channel{Any})
@ Base ./channels.jl:389
[5] take!(c::Channel{Any})
@ Base ./channels.jl:383
[6] take!(::Distributed.RemoteValue)
@ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/remotecall.jl:599
[7] remotecall_fetch(::Function, ::Distributed.Worker, ::Distributed.RRID, ::Vararg{Any, N} where N; kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/remotecall.jl:390
[8] remotecall_fetch(::Function, ::Distributed.Worker, ::Distributed.RRID, ::Vararg{Any, N} where N)
@ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/remotecall.jl:386
[9] remotecall_fetch(::Function, ::Int64, ::Distributed.RRID, ::Vararg{Any, N} where N; kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/remotecall.jl:421
[10] remotecall_fetch
@ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/remotecall.jl:421 [inlined]
[11] call_on_owner
@ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/remotecall.jl:494 [inlined]
[12] wait(r::Future)
@ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/remotecall.jl:515
[13] SharedMatrix{Float32}(dims::Tuple{Int64, Int64}; init::Bool, pids::Vector{Int64})
@ SharedArrays /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/SharedArrays/src/SharedArrays.jl:137
[14] SharedArray
@ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/SharedArrays/src/SharedArrays.jl:105 [inlined]
[15] SharedMatrix{Float32}(A::Matrix{Float64})
@ SharedArrays /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/SharedArrays/src/SharedArrays.jl:366
[16] initShared(corpus::Vector{Any}, partition::Vector{UnitRange{Int64}}, k::Int64, l::Int64, alphalocal::Vector{Float64}, betalocal::Matrix{Float64}; statefloat::Type, stateint::Type, sstatfloat::Type, sstatint::Type)
@ pqADMM ~/.julia/packages/pqADMM/TjzGD/src/partition.jl:122
[17] runChainSharedSV(corpus0::Vector{Any}, partition::Vector{UnitRange{Int64}}, partmap::Vector{Int64}, betalocal::Matrix{Float64}, alphalocal::Vector{Float64}, k::Int64, nlex::Int64, philatent::Int64, stateint::Type, statefloat::Type, nworkers_local::Int64, chainblockn::Int64, n_iter::Int64, chain_i::Int64, drnm::String, initcorp::String; verbose::Bool)
@ pqADMM ~/.julia/packages/pqADMM/TjzGD/src/sampling.jl:200
[18] scLDA_E_step(datadir::String; C_fn::String, T_fn::String, pDataC_fn::String, emIter::String, outputdir::String, remcolC::Bool, remcolPDC::Bool, remcolT::Bool, remcolTT::Bool, ldamodelname::String, niter::Int64, nparts::Int64, nchains::Int64, blocksize::Int64, betapseudo::Float64, betaeps::Float64, alpha::Float64, nscthresh::Float64, genethresh::Float64, protectedgenes::Vector{Any}, scalesc::String, scalebulk::String, bulkfactor::Float64, scfactor::Float64, initflavor::String, verbose::Bool, philatent::Int64, thinning::Int64, rmchains::Bool, burn::Float64, thetabenchmark::String, mcplot::Bool, runqc::Bool)
@ pqADMM ~/.julia/packages/pqADMM/TjzGD/src/extra.jl:514
[19] main()
@ Main /dfs5/bio/mkarikom/code/DTMwork/slurm/symsim_library/scldaCSV.jl:41
[20] top-level scope
@ /dfs5/bio/mkarikom/code/DTMwork/slurm/symsim_library/scldaCSV.jl:264
in expression starting at /dfs5/bio/mkarikom/code/DTMwork/slurm/symsim_library/scldaCSV.jl:264
Line 122 of partition.jl,
referenced in stack frame [16] of the trace above, corresponds to:
return (T=SharedArray(T),A=SharedArray(A),D=SharedArray(D),Z=SharedArray(Z),J=SharedArray(J),W=SharedArray(W),
nj=SharedArray{sstatint,2}(nj),nwinc=SharedArray{sstatint,2}(nwinc),
nw=SharedArray{sstatint,2}(nw),
alpha=SharedArray{sstatfloat,1}(alphalocal),beta=SharedArray{sstatfloat,2}(betalocal),
indt=SharedArray(indt),indnw=SharedArray(indnw))