Occasional SharedArray errors on Slurm

I’m having trouble reproducing this locally, even with a minimal example.

It happens for roughly 0.1-5% of array tasks that call initShared() from the unregistered package pqADMM, and it produces the Slurm error shown below for the affected task:

Slurm array task error output:
Worker 3 terminated.
ERROR: LoadError: ProcessExitedException(3)
Stacktrace:
  [1] try_yieldto(undo::typeof(Base.ensure_rescheduled))
    @ Base ./task.jl:705
  [2] wait
    @ ./task.jl:764 [inlined]
  [3] wait(c::Base.GenericCondition{ReentrantLock})
    @ Base ./condition.jl:106
  [4] take_buffered(c::Channel{Any})
    @ Base ./channels.jl:389
  [5] take!(c::Channel{Any})
    @ Base ./channels.jl:383
  [6] take!(::Distributed.RemoteValue)
    @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/remotecall.jl:599
  [7] remotecall_fetch(::Function, ::Distributed.Worker, ::Distributed.RRID, ::Vararg{Any, N} where N; kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/remotecall.jl:390
  [8] remotecall_fetch(::Function, ::Distributed.Worker, ::Distributed.RRID, ::Vararg{Any, N} where N)
    @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/remotecall.jl:386
  [9] remotecall_fetch(::Function, ::Int64, ::Distributed.RRID, ::Vararg{Any, N} where N; kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/remotecall.jl:421
 [10] remotecall_fetch
    @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/remotecall.jl:421 [inlined]
 [11] call_on_owner
    @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/remotecall.jl:494 [inlined]
 [12] wait(r::Future)
    @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/remotecall.jl:515
 [13] SharedMatrix{Float32}(dims::Tuple{Int64, Int64}; init::Bool, pids::Vector{Int64})
    @ SharedArrays /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/SharedArrays/src/SharedArrays.jl:137
 [14] SharedArray
    @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/SharedArrays/src/SharedArrays.jl:105 [inlined]
 [15] SharedMatrix{Float32}(A::Matrix{Float64})
    @ SharedArrays /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/SharedArrays/src/SharedArrays.jl:366
 [16] initShared(corpus::Vector{Any}, partition::Vector{UnitRange{Int64}}, k::Int64, l::Int64, alphalocal::Vector{Float64}, betalocal::Matrix{Float64}; statefloat::Type, stateint::Type, sstatfloat::Type, sstatint::Type)
    @ pqADMM ~/.julia/packages/pqADMM/TjzGD/src/partition.jl:122
 [17] runChainSharedSV(corpus0::Vector{Any}, partition::Vector{UnitRange{Int64}}, partmap::Vector{Int64}, betalocal::Matrix{Float64}, alphalocal::Vector{Float64}, k::Int64, nlex::Int64, philatent::Int64, stateint::Type, statefloat::Type, nworkers_local::Int64, chainblockn::Int64, n_iter::Int64, chain_i::Int64, drnm::String, initcorp::String; verbose::Bool)
    @ pqADMM ~/.julia/packages/pqADMM/TjzGD/src/sampling.jl:200
 [18] scLDA_E_step(datadir::String; C_fn::String, T_fn::String, pDataC_fn::String, emIter::String, outputdir::String, remcolC::Bool, remcolPDC::Bool, remcolT::Bool, remcolTT::Bool, ldamodelname::String, niter::Int64, nparts::Int64, nchains::Int64, blocksize::Int64, betapseudo::Float64, betaeps::Float64, alpha::Float64, nscthresh::Float64, genethresh::Float64, protectedgenes::Vector{Any}, scalesc::String, scalebulk::String, bulkfactor::Float64, scfactor::Float64, initflavor::String, verbose::Bool, philatent::Int64, thinning::Int64, rmchains::Bool, burn::Float64, thetabenchmark::String, mcplot::Bool, runqc::Bool)
    @ pqADMM ~/.julia/packages/pqADMM/TjzGD/src/extra.jl:514
 [19] main()
    @ Main /dfs5/bio/mkarikom/code/DTMwork/slurm/symsim_library/scldaCSV.jl:41
 [20] top-level scope
    @ /dfs5/bio/mkarikom/code/DTMwork/slurm/symsim_library/scldaCSV.jl:264
in expression starting at /dfs5/bio/mkarikom/code/DTMwork/slurm/symsim_library/scldaCSV.jl:264

Line 122 of partition.jl, referenced in frame [16] of the trace above, corresponds to:

    return (T=SharedArray(T),A=SharedArray(A),D=SharedArray(D),Z=SharedArray(Z),J=SharedArray(J),W=SharedArray(W),
            nj=SharedArray{sstatint,2}(nj),nwinc=SharedArray{sstatint,2}(nwinc),
            nw=SharedArray{sstatint,2}(nw),
            alpha=SharedArray{sstatfloat,1}(alphalocal),beta=SharedArray{sstatfloat,2}(betalocal),
            indt=SharedArray(indt),indnw=SharedArray(indnw))
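
As a stopgap while the node issue is investigated, one option I’m considering is routing each of these SharedArray(...) calls through a small retry wrapper, so a single dead worker doesn’t kill the whole array task. This is only a sketch (shared_with_retry is a hypothetical helper, not part of pqADMM), and it assumes the failure really does surface as a ProcessExitedException at this call site, as the trace above suggests:

    using Distributed, SharedArrays

    # Hypothetical helper (not part of pqADMM): retry SharedArray construction a
    # few times if a worker has died underneath it.  ProcessExitedException is
    # what the trace above shows; anything else is rethrown unchanged.
    function shared_with_retry(A::AbstractArray; tries::Int = 3)
        for attempt in 1:tries
            try
                return SharedArray(A)          # same call used in initShared()
            catch err
                err isa ProcessExitedException || rethrow()
                @warn "Lost a worker while allocating a SharedArray; retrying" attempt
                sleep(2)                       # give the worker pool a moment to settle
            end
        end
        error("SharedArray allocation still failing after $tries attempts")
    end

The untyped SharedArray(T), SharedArray(A), etc. calls in the return statement could go through this directly; the typed constructors (SharedArray{sstatint,2}(nj) and friends) would need an analogous wrapper.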

Our data center is still looking into this, but it may have been an isolated issue with the tmpfs filesystem on one of the nodes not staying in sync with the scheduler.
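
If the tmpfs theory holds, a cheap pre-flight check might at least turn the intermittent failure into a clear error. On Linux, SharedArrays back their segments with POSIX shared memory under /dev/shm, so something like the following (check_shm is a hypothetical name, not existing code), run at the start of each array task, would flag a node whose /dev/shm is broken or full before initShared() is ever reached:

    using Distributed

    # Hypothetical pre-flight check: verify every worker can create and write a
    # small file under /dev/shm, where SharedArrays keep their shared-memory
    # segments on Linux.  Fails fast and names the offending worker/host.
    function check_shm(pids = workers())
        for p in pids
            ok = remotecall_fetch(p) do
                try
                    path, io = mktemp("/dev/shm")
                    write(io, zeros(UInt8, 1024))
                    close(io)
                    rm(path)
                    true
                catch
                    false
                end
            end
            if !ok
                host = remotecall_fetch(gethostname, p)
                error("worker $p on $host cannot write to /dev/shm")
            end
        end
        return true
    end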