DataLoaders.jl Workers systematically end up outside bounds

Hello fellow Julians. I am trying to train a model using a parallel DataLoader, however, following the example from the documentation (available here: https://github.com/lorenzoh/DataLoaders.jl/blob/master/docs/datacontainers.md) fails.

The general idea is to train a model while loading data into memory only right before it is used, since loading the entire dataset requires more RAM than is available.

When iterating using the dataloader, the workers systematically end up outside the bounds defined by nobs() and the @async never really finishes but waits eternally for the failed workers.
I have a MWE available here https://github.com/ClarkGuilty/PreliminaryJuliaTests/blob/master/DataLoading.jl.

Are you successfully using DataLoaders.jl without any modification on the source?

3 Likes

Hi Clark, author here. The issue seems to be a discrepancy in how MLDataPattern.BatchView and DataLoaders.BatchViewCollated interpret nobs. For the latter, every batch is one observation, but the former counts every single observation.

For your use case, passing collate = true to the DataLoader call should fix it; this is probably what you want anyway if you’re training on batches. Let me know if you run into any other problems. I’ll raise the inconsistency as an issue.

1 Like

Hi HolyLorenzo, thank you for your answer, and your package is a blessing. Regarding your suggestion, I changed it to collate = true, but got an error related to copyrec!:

Base.StackTraces.StackFrame[(::DataLoaders.var"#inloop#10"{DataLoaders.WorkerPool{Int64}})(args::Int64) at workerpool.jl:55, macro expansion at workerpool.jl:69 [inlined], #8 at macros.jl:19 [inlined], #62 at qpool.jl:195 [inlined], (::ThreadPools.var"#58#59"{ThreadPools.var"#62#64"{DataLoaders.var"#8#14"{DataLoaders.var"#inloop#10"{DataLoaders.WorkerPool{Int64}}}}, Tuple{Int64, Int64}})() at qpool.jl:86]

┌ Error: Exception while executing task on worker 3. Shutting down WorkerPool.
│   e =
│    MethodError: no method matching copyrec!(::SubArray{Int64, 0, Vector{Int64}, Tuple{Int64}, true}, ::Int64)
│    Closest candidates are:
│      copyrec!(::AbstractArray, ::AbstractArray) at /home/hipparcos/.julia/dev/DataLoaders/src/batchview.jl:112
│   stacktrace = 6-element Vector{Base.StackTraces.StackFrame}: …
│   args = 1
└ @ DataLoaders ~/.julia/dev/DataLoaders/src/workerpool.jl:56

^C
ERROR: LoadError: InterruptException:
Stacktrace:
 [1] take_buffered(c::Channel{Tuple{Matrix{Float32}, Vector{Int64}}})
   @ Base ./channels.jl:389
 [2] take!
   @ ./channels.jl:383 [inlined]
 [3] take!(ringbuffer::DataLoaders.RingBuffer{Tuple{Matrix{Float32}, Vector{Int64}}})
   @ DataLoaders ~/.julia/dev/DataLoaders/src/ringbuffer.jl:55
 [4] iterate
   @ ~/.julia/dev/DataLoaders/src/loaders.jl:103 [inlined]
 [5] iterate(iterparallel::DataLoaders.BufferGetObsParallel{Tuple{Matrix{Float32}, Vector{Int64}}, DataLoaders.BatchViewCollated{ImageDataset}})
   @ DataLoaders ~/.julia/dev/DataLoaders/src/loaders.jl:89
 [6] macro expansion
   @ ~/.julia/packages/Juno/n6wyj/src/progress.jl:134 [inlined]
 [7] train!(loss::Function, ps::Zygote.Params, data::DataLoaders.BufferGetObsParallel{Tuple{Matrix{Float32}, Vector{Int64}}, DataLoaders.BatchViewCollated{ImageDataset}}, opt::Descent; cb::Flux.Optimise.var"#40#46")
   @ Flux.Optimise ~/.julia/packages/Flux/0c9kI/src/optimise/train.jl:99
 [8] train!(loss::Function, ps::Zygote.Params, data::DataLoaders.BufferGetObsParallel{Tuple{Matrix{Float32}, Vector{Int64}}, DataLoaders.BatchViewCollated{ImageDataset}}, opt::Descent)
   @ Flux.Optimise ~/.julia/packages/Flux/0c9kI/src/optimise/train.jl:97
 [9] top-level scope
   @ ~/pre_PhD/phd_pretesis/PreliminaryJuliaTests/DataLoading.jl:67
in expression starting at /home/hipparcos/pre_PhD/phd_pretesis/PreliminaryJuliaTests/DataLoading.jl:67

After that, I noticed that copyrec! is only used when there is no getobs!, so I implemented it:

# In-place loader for a single observation: writes observation `i` of `dataset`
# into the caller-supplied buffer `buf` and returns `buf`.
# The image path is `dataset.dir` joined with `dataset.files[i]`; the label is
# read from `dataset.classes[i]`.
# NOTE(review): the return annotation `(Vector{Float32}, Int64)` is a tuple of
# types rather than a `Tuple{...}` type — confirm this is what was intended.
function LearnBase.getobs!(buf, dataset::ImageDataset, i::Int) :: (Vector{Float32}, Int64)
  subpath = dataset.files[i]
  file = joinpath(dataset.dir, subpath)
  # First element of the image's `data`, reshaped to a vector of length 101*101.
  # NOTE(review): `buf[1] = ...` rebinds a slot of `buf` (needs `setindex!` on
  # `buf` itself) instead of copying into the array stored there; this cannot
  # work if `buf` is an immutable Tuple — verify what the caller passes.
  buf[1] = reshape(AstroImage(file).data[1],101*101)
  # NOTE(review): same slot-rebinding concern as for `buf[1]`.
  buf[2] = dataset.classes[i]
  buf
end

# In-place loader for a contiguous range of observations: writes the
# observations indexed by `range` into the caller-supplied buffer `buf` and
# returns `buf`.
# NOTE(review): the return annotation `(Vector{Float32}, Int64)` is a tuple of
# types rather than a `Tuple{...}` type, and it mirrors the single-observation
# method even though this method stores batched values — confirm the intent.
function LearnBase.getobs!(buf, dataset::ImageDataset, range::UnitRange{Int64}) :: (Vector{Float32}, Int64)
  subpath = dataset.files[range]
  # Broadcast `joinpath` so each selected file gets its own full path.
  file = joinpath.(dataset.dir, subpath)
  # Horizontally concatenate the per-image results of `give_data`.
  # NOTE(review): `give_data` is defined elsewhere; presumably it returns one
  # flattened image vector per file, giving one column per image — confirm.
  # NOTE(review): `buf[1] = ...` rebinds a slot of `buf` (needs `setindex!` on
  # `buf` itself); this cannot work if `buf` is an immutable Tuple — verify
  # what the caller passes.
  buf[1] = hcat(give_data.(AstroImage.(file))...)
  # NOTE(review): same slot-rebinding concern as for `buf[1]`.
  buf[2] = dataset.classes[range]
  buf
end

but then I got:

Base.StackTraces.StackFrame[(::DataLoaders.var"#inloop#10"{DataLoaders.WorkerPool{Int64}})(args::Int64) at workerpool.jl:55, macro expansion at workerpool.jl:69 [inlined], #8 at macros.jl:19 [inlined], #62 at qpool.jl:195 [inlined], (::ThreadPools.var"#58#59"{ThreadPools.var"#62#64"{DataLoaders.var"#8#14"{DataLoaders.var"#inloop#10"{DataLoaders.WorkerPool{Int64}}}}, Tuple{Int64, Int64}})() at qpool.jl:86]
Base.StackTraces.StackFrame[(::DataLoaders.var"#inloop#10"{DataLoaders.WorkerPool{Int64}})(args::Int64) at workerpool.jl:55, macro expansion at workerpool.jl:69 [inlined], #8 at macros.jl:19 [inlined], #62 at qpool.jl:195 [inlined], (::ThreadPools.var"#58#59"{ThreadPools.var"#62#64"{DataLoaders.var"#8#14"{DataLoaders.var"#inloop#10"{DataLoaders.WorkerPool{Int64}}}}, Tuple{Int64, Int64}})() at qpool.jl:86]
┌ Error: Exception while executing task on worker 2. Shutting down WorkerPool.
│   e = MethodError: no method matching setindex!(::Tuple{SubArray{Float32, 1, Matrix{Float32}, Tuple{Base.Slice{Base.OneTo{Int64}}, Int64}, true}, SubArray{Int64, 0, Vector{Int64}, Tuple{Int64}, true}}, ::Vector{Float32}, ::Int64)
│   stacktrace = 6-element Vector{Base.StackTraces.StackFrame}: …
│   args = 1
└ @ DataLoaders ~/.julia/dev/DataLoaders/src/workerpool.jl:56
Base.StackTraces.StackFrame[(::DataLoaders.var"#inloop#10"{DataLoaders.WorkerPool{Int64}})(args::Int64) at workerpool.jl:55, macro expansion at workerpool.jl:69 [inlined], #8 at macros.jl:19 [inlined], #62 at qpool.jl:195 [inlined], (::ThreadPools.var"#58#59"{ThreadPools.var"#62#64"{DataLoaders.var"#8#14"{DataLoaders.var"#inloop#10"{DataLoaders.WorkerPool{Int64}}}}, Tuple{Int64, Int64}})() at qpool.jl:86]
┌ Error: Exception while executing task on worker 4. Shutting down WorkerPool.
│   e = MethodError: no method matching setindex!(::Tuple{SubArray{Float32, 1, Matrix{Float32}, Tuple{Base.Slice{Base.OneTo{Int64}}, Int64}, true}, SubArray{Int64, 0, Vector{Int64}, Tuple{Int64}, true}}, ::Vector{Float32}, ::Int64)
│   stacktrace = 6-element Vector{Base.StackTraces.StackFrame}: …
│   args = 2
└ @ DataLoaders ~/.julia/dev/DataLoaders/src/workerpool.jl:56
┌ Error: Exception while executing task on worker 3. Shutting down WorkerPool.
│   e = MethodError: no method matching setindex!(::Tuple{SubArray{Float32, 1, Matrix{Float32}, Tuple{Base.Slice{Base.OneTo{Int64}}, Int64}, true}, SubArray{Int64, 0, Vector{Int64}, Tuple{Int64}, true}}, ::Vector{Float32}, ::Int64)
│   stacktrace = 6-element Vector{Base.StackTraces.StackFrame}: …
│   args = 3
└ @ DataLoaders ~/.julia/dev/DataLoaders/src/workerpool.jl:56
^C
ERROR: LoadError: InterruptException:
Stacktrace:
 [1] take_buffered(c::Channel{Tuple{Matrix{Float32}, Vector{Int64}}})
   @ Base ./channels.jl:389
 [2] take!
   @ ./channels.jl:383 [inlined]
 [3] take!(ringbuffer::DataLoaders.RingBuffer{Tuple{Matrix{Float32}, Vector{Int64}}})
   @ DataLoaders ~/.julia/packages/DataLoaders/uGlPg/src/ringbuffer.jl:55
 [4] iterate
   @ ~/.julia/dev/DataLoaders/src/loaders.jl:103 [inlined]
 [5] iterate(iterparallel::DataLoaders.BufferGetObsParallel{Tuple{Matrix{Float32}, Vector{Int64}}, DataLoaders.BatchViewCollated{ImageDataset}})
   @ DataLoaders ~/.julia/packages/DataLoaders/uGlPg/src/loaders.jl:89
 [6] macro expansion
   @ ~/.julia/packages/Juno/n6wyj/src/progress.jl:134 [inlined]
 [7] train!(loss::Function, ps::Zygote.Params, data::DataLoaders.BufferGetObsParallel{Tuple{Matrix{Float32}, Vector{Int64}}, DataLoaders.BatchViewCollated{ImageDataset}}, opt::Descent; cb::Flux.Optimise.var"#40#46")
   @ Flux.Optimise ~/.julia/packages/Flux/0c9kI/src/optimise/train.jl:99
 [8] train!(loss::Function, ps::Zygote.Params, data::DataLoaders.BufferGetObsParallel{Tuple{Matrix{Float32}, Vector{Int64}}, DataLoaders.BatchViewCollated{ImageDataset}}, opt::Descent)
   @ Flux.Optimise ~/.julia/packages/Flux/0c9kI/src/optimise/train.jl:97
 [9] top-level scope
   @ ~/pre_PhD/phd_pretesis/PreliminaryJuliaTests/DataLoading.jl:84
in expression starting at /home/hipparcos/pre_PhD/phd_pretesis/PreliminaryJuliaTests/DataLoading.jl:84

There is a MWE now with collate = true and implementations of getobs! in the same repository.

I had the same issue, but collate = true solved the problem for me. Thanks @holylorenzo !