I am trying to load a data set via a DataLoader/DeviceIterator onto a sharded ReactantDevice. I have replicated it here with a mock data set. However, I run into a variety of errors or segfaults when I do this. Is there anything I am doing wrong? Is there any workaround to get this working?
using Reactant, MLUtils, MLDataDevices

cpu = cpu_device()
gpus = Reactant.devices()
ngpus = length(gpus)
mesh = Reactant.Sharding.Mesh(gpus, (:batch,))
device_replicated = reactant_device(;
    sharding=Reactant.Sharding.Replicated(mesh)
)
device_sharded = reactant_device(;
    sharding=Reactant.Sharding.DimsSharding(mesh, (-1,), (:batch,))
)
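For completeness, this is the minimal direct-transfer check I would compare against (a sketch, no DataLoader involved, assuming the trailing dimension is divisible by ngpus):
x = rand(Float32, 8, 8, 4ngpus)
x_dev = device_sharded(x)  # last dimension should end up split across the :batch mesh axis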
batchsize = 16 * ngpus
data = [
    (rand(Float32, 256, 256, 3, 2), rand(Bool, 16, 16, 3, 2), rand(Float32, 128, 128, 3, 4))
    for i in 1:(batchsize * 100)
] # mock data
dataloader = DataLoader(
    data; # mock stand-in for the real dataset
    batchsize,
    shuffle=true,
    parallel=true,
    partial=false,
    collate=true
);
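If I understand collate=true correctly, each batch is a tuple of three arrays stacked along a new trailing dimension of size batchsize, which is the dimension DimsSharding(mesh, (-1,), (:batch,)) splits. A quick CPU-side check of that assumption:
batch = first(dataloader)
@show size.(batch)  # I expect ((256,256,3,2,64), (16,16,3,2,64), (128,128,3,4,64)) for ngpus = 4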
function f(dataloader, n)
    times = [time()]
    i = 0
    for inputs in dataloader  # each iterate moves one batch to the device
        i += 1
        @show i
        push!(times, time())
        if i >= n
            break
        end
    end
    return times
end
times = f(dataloader |> device_sharded, 100)
diff(times)
This results in one of the following failures:
julia> times = f(dataloader |> device_sharded, 100;)
i = 1
⋮
i = 25
i =
[1612749] signal 11 (128): Segmentation fault
in expression starting at REPL[50]:1
[1] 1612749 segmentation fault (core dumped) taskset -c 11-60 julia --project -t 8
or
i = 1
⋮
i = 23
i =
[1614156] signal 11 (128): Segmentation fault
in expression starting at REPL[37]:1
_ZNK3xla4ifrt20SingleDeviceSharding4HashEN4absl12lts_202508149HashStateE at /home/bweber/.julia/artifacts/a4a178033ffe954af0f2e3ce566081781c945068/lib/libReactantExtra.so (unknown line)
ifrt_client_assemble_array_from_single_shards at /home/bweber/.julia/artifacts/a4a178033ffe954af0f2e3ce566081781c945068/lib/libReactantExtra.so (unknown line)
ifrt_client_assemble_array_from_single_shards at /home/bweber/.julia/packages/Reactant/puoGz/src/mlir/libMLIR_h.jl:14816 [inlined]
Array at /home/bweber/.julia/packages/Reactant/puoGz/src/xla/IFRT/Array.jl:105
Array at /home/bweber/.julia/packages/Reactant/puoGz/src/xla/IFRT/Array.jl:74
#AsyncArray#25 at /home/bweber/.julia/packages/Reactant/puoGz/src/xla/IFRT/AsyncArray.jl:8 [inlined]
AsyncArray at /home/bweber/.julia/packages/Reactant/puoGz/src/xla/IFRT/AsyncArray.jl:8 [inlined]
NamedSharding at /home/bweber/.julia/packages/Reactant/puoGz/src/Sharding.jl:481
unknown function (ip: 0x7f755dc49bfd) at (unknown file)
DimsSharding at /home/bweber/.julia/packages/Reactant/puoGz/src/Sharding.jl:727
#ConcreteIFRTArray#61 at /home/bweber/.julia/packages/Reactant/puoGz/src/Types.jl:388
ConcreteIFRTArray at /home/bweber/.julia/packages/Reactant/puoGz/src/Types.jl:378 [inlined]
macro expansion at /home/bweber/.julia/packages/MLDataDevices/Gr7bh/ext/ReactantExt.jl:109 [inlined]
#5 at /home/bweber/.julia/packages/Reactant/puoGz/src/Profiler.jl:295 [inlined]
#annotate#5 at /home/bweber/.julia/packages/Reactant/puoGz/src/Profiler.jl:270
annotate at /home/bweber/.julia/packages/Reactant/puoGz/src/Profiler.jl:262 [inlined]
annotate at /home/bweber/.julia/packages/Reactant/puoGz/src/Profiler.jl:262 [inlined]
adapt_storage at /home/bweber/.julia/packages/Reactant/puoGz/src/Profiler.jl:295 [inlined]
adapt_structure at /home/bweber/.julia/packages/Adapt/qhEjd/src/Adapt.jl:57 [inlined]
adapt at /home/bweber/.julia/packages/Adapt/qhEjd/src/Adapt.jl:40 [inlined]
#_#57 at ./operators.jl:1193 [inlined]
Fix at ./operators.jl:1193 [inlined]
ExcludeWalk at /home/bweber/.julia/packages/Functors/LbNAu/src/walks.jl:126 [inlined]
CachedWalk at /home/bweber/.julia/packages/Functors/LbNAu/src/walks.jl:177
recurse at /home/bweber/.julia/packages/Functors/LbNAu/src/walks.jl:54 [inlined]
map at ./tuple.jl:360 [inlined]
_map at /home/bweber/.julia/packages/Functors/LbNAu/src/walks.jl:3 [inlined]
DefaultWalk at /home/bweber/.julia/packages/Functors/LbNAu/src/walks.jl:73 [inlined]
ExcludeWalk at /home/bweber/.julia/packages/Functors/LbNAu/src/walks.jl:126 [inlined]
CachedWalk at /home/bweber/.julia/packages/Functors/LbNAu/src/walks.jl:177 [inlined]
execute at /home/bweber/.julia/packages/Functors/LbNAu/src/walks.jl:55 [inlined]
#fmap#32 at /home/bweber/.julia/packages/Functors/LbNAu/src/maps.jl:11 [inlined]
fmap at /home/bweber/.julia/packages/Functors/LbNAu/src/maps.jl:3 [inlined]
AbstractDevice at /home/bweber/.julia/packages/MLDataDevices/Gr7bh/src/public.jl:516
unknown function (ip: 0x7f755dc3b18d) at (unknown file)
#eachobsparallel##0 at /home/bweber/.julia/packages/MLDataDevices/Gr7bh/ext/MLUtilsExt.jl:35
unknown function (ip: 0x7f755dbc4a99) at (unknown file)
macro expansion at /home/bweber/.julia/packages/MLUtils/5jDrc/src/parallel.jl:127 [inlined]
#reducing_function#278#67 at /home/bweber/.julia/packages/FLoops/GN1Bm/src/reduce.jl:817 [inlined]
AdjoinIdentity at /home/bweber/.julia/packages/InitialValues/OWP8V/src/InitialValues.jl:306
next at /home/bweber/.julia/packages/Transducers/PqRuk/src/combinators.jl:290 [inlined]
next at /home/bweber/.julia/packages/Transducers/PqRuk/src/core.jl:287 [inlined]
macro expansion at /home/bweber/.julia/packages/Transducers/PqRuk/src/core.jl:181 [inlined]
macro expansion at /home/bweber/.julia/packages/Transducers/PqRuk/src/processes.jl:199 [inlined]
macro expansion at /home/bweber/.julia/packages/Transducers/PqRuk/src/simd.jl:41 [inlined]
_foldl_linear_bulk at /home/bweber/.julia/packages/Transducers/PqRuk/src/processes.jl:198 [inlined]
macro expansion at /home/bweber/.julia/packages/Transducers/PqRuk/src/processes.jl:192 [inlined]
macro expansion at /home/bweber/.julia/packages/Transducers/PqRuk/src/basics.jl:96 [inlined]
_foldl_array at /home/bweber/.julia/packages/Transducers/PqRuk/src/processes.jl:188 [inlined]
__foldl__ at /home/bweber/.julia/packages/Transducers/PqRuk/src/processes.jl:182 [inlined]
foldl_basecase at /home/bweber/.julia/packages/Transducers/PqRuk/src/processes.jl:361 [inlined]
_reduce_basecase at /home/bweber/.julia/packages/Transducers/PqRuk/src/threading_utils.jl:58
_reduce at /home/bweber/.julia/packages/Transducers/PqRuk/src/reduce.jl:139
_reduce at /home/bweber/.julia/packages/Transducers/PqRuk/src/reduce.jl:148
#_reduce##2 at /home/bweber/.julia/packages/Transducers/PqRuk/src/reduce.jl:147
unknown function (ip: 0x7f7bf946de3e) at (unknown file)
jl_apply at /cache/build/builder-amdci5-4/julialang/julia-release-1-dot-12/src/julia.h:2391 [inlined]
start_task at /cache/build/builder-amdci5-4/julialang/julia-release-1-dot-12/src/task.c:1252
Allocations: 120234999 (Pool: 120233519; Big: 1480); GC: 52
[1] 1614156 segmentation fault (core dumped) taskset -c 11-60 julia --project -t 8
or
i = 1
⋮
i = 9
ERROR: ReadOnlyMemoryError()
Stacktrace:
[1] check_channel_state
@ ./channels.jl:187 [inlined]
[2] take_buffered(c::Channel{Any})
@ Base ./channels.jl:531
[3] take!(c::Channel{Any})
@ Base ./channels.jl:526
[4] iterate(::MLUtils.Loader, state::MLUtils.LoaderState)
@ MLUtils ~/.julia/packages/MLUtils/5jDrc/src/parallel.jl:143
[5] iterate(c::DeviceIterator{typeof(identity), MLUtils.Loader}, ::Tuple{MLUtils.LoaderState, Tuple{…}})
@ MLDataDevices ~/.julia/packages/MLDataDevices/33vyp/src/iterator.jl:61
[6] f(dataloader::DeviceIterator{typeof(identity), MLUtils.Loader}, n::Int64)
@ Main ./REPL[26]:11
[7] top-level scope
@ REPL[27]:1
Some type information was truncated. Use `show(err)` to see complete types.
I am unsure what the underlying problem is, especially since the errors persist even when the mock data fits on the device as a whole.
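From the second stack trace it looks like the device transfer (the ConcreteIFRTArray construction in MLDataDevices' ReactantExt) runs inside MLUtils' parallel loader task, so one workaround I can think of is doing the transfer on the main task instead, either by setting parallel=false or by iterating the plain CPU DataLoader and moving each batch manually. A minimal sketch of the latter (f_manual is just an illustrative helper; I have not verified that this avoids the crash):
function f_manual(dataloader, dev, n)
    times = [time()]
    for (i, batch) in enumerate(dataloader)
        inputs = dev(batch)  # transfer to the sharded device on the main task
        push!(times, time())
        if i >= n
            break
        end
    end
    return times
end

times = f_manual(dataloader, device_sharded, 100)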
In case it is relevant, the devices are:
julia> cpu = cpu_device()
(::CPUDevice{Missing}) (generic function with 1 method)
julia> gpus = Reactant.devices()
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1778503875.363267 1614630 pjrt_client.cc:569] PjRt-IFRT device count: total=1, addressable=1
I0000 00:00:1778503875.363350 1614630 pjrt_client.cc:573] Addressable PjRt-IFRT device: CpuDevice(id=0)
I0000 00:00:1778503876.431877 1614630 service.cc:154] XLA service 0x30ee8dd0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1778503876.432011 1614630 service.cc:170] StreamExecutor [0]: NVIDIA A100-SXM4-80GB, Compute Capability 8.0 (Driver: 13.0.0[570.211.1]; Runtime: 13.1.0; Toolkit: 13.1.0; DNN: 9.14.0)
I0000 00:00:1778503876.432033 1614630 service.cc:170] StreamExecutor [1]: NVIDIA A100-SXM4-80GB, Compute Capability 8.0 (Driver: 13.0.0[570.211.1]; Runtime: 13.1.0; Toolkit: 13.1.0; DNN: 9.14.0)
I0000 00:00:1778503876.432047 1614630 service.cc:170] StreamExecutor [2]: NVIDIA A100-SXM4-80GB, Compute Capability 8.0 (Driver: 13.0.0[570.211.1]; Runtime: 13.1.0; Toolkit: 13.1.0; DNN: 9.14.0)
I0000 00:00:1778503876.432058 1614630 service.cc:170] StreamExecutor [3]: NVIDIA A100-SXM4-80GB, Compute Capability 8.0 (Driver: 13.0.0[570.211.1]; Runtime: 13.1.0; Toolkit: 13.1.0; DNN: 9.14.0)
I0000 00:00:1778503876.439964 1614630 se_gpu_pjrt_client.cc:1540] Using BFC allocator.
I0000 00:00:1778503876.440052 1614630 gpu_helpers.cc:144] XLA backend will use up to 85095874560 bytes on device 0 for BFCAllocator.
I0000 00:00:1778503876.440125 1614630 gpu_helpers.cc:144] XLA backend will use up to 85095874560 bytes on device 1 for BFCAllocator.
I0000 00:00:1778503876.440171 1614630 gpu_helpers.cc:144] XLA backend will use up to 85095874560 bytes on device 2 for BFCAllocator.
I0000 00:00:1778503876.440214 1614630 gpu_helpers.cc:144] XLA backend will use up to 85095874560 bytes on device 3 for BFCAllocator.
I0000 00:00:1778503876.440258 1614630 gpu_helpers.cc:183] XLA backend will use up to 0 bytes on device 0 for CollectiveBFCAllocator.
I0000 00:00:1778503876.440301 1614630 gpu_helpers.cc:183] XLA backend will use up to 0 bytes on device 1 for CollectiveBFCAllocator.
I0000 00:00:1778503876.440346 1614630 gpu_helpers.cc:183] XLA backend will use up to 0 bytes on device 2 for CollectiveBFCAllocator.
I0000 00:00:1778503876.440382 1614630 gpu_helpers.cc:183] XLA backend will use up to 0 bytes on device 3 for CollectiveBFCAllocator.
I0000 00:00:1778503876.472828 1614630 cuda_dnn.cc:461] Loaded cuDNN version 91400
I0000 00:00:1778503876.473744 1614630 pjrt_client.cc:569] PjRt-IFRT device count: total=4, addressable=4
I0000 00:00:1778503876.473768 1614630 pjrt_client.cc:573] Addressable PjRt-IFRT device: CudaDevice(id=0)
I0000 00:00:1778503876.473777 1614630 pjrt_client.cc:573] Addressable PjRt-IFRT device: CudaDevice(id=1)
I0000 00:00:1778503876.473784 1614630 pjrt_client.cc:573] Addressable PjRt-IFRT device: CudaDevice(id=2)
I0000 00:00:1778503876.473791 1614630 pjrt_client.cc:573] Addressable PjRt-IFRT device: CudaDevice(id=3)
4-element Vector{Reactant.XLA.IFRT.Device}:
Reactant.XLA.IFRT.Device(Ptr{Nothing}(0x00000000303f1650), "CUDA:0 NVIDIA A100-SXM4-80GB")
Reactant.XLA.IFRT.Device(Ptr{Nothing}(0x0000000032602540), "CUDA:1 NVIDIA A100-SXM4-80GB")
Reactant.XLA.IFRT.Device(Ptr{Nothing}(0x0000000032601ca0), "CUDA:2 NVIDIA A100-SXM4-80GB")
Reactant.XLA.IFRT.Device(Ptr{Nothing}(0x0000000032601eb0), "CUDA:3 NVIDIA A100-SXM4-80GB")
julia> ngpus = length(gpus)
4
julia> mesh = Reactant.Sharding.Mesh(
gpus,
(:batch,)
)
Reactant.Sharding.Mesh{1, Vector{Int64}}([0, 1, 2, 3], [0, 1, 2, 3], (:batch,), (4,))
julia> device_replicated = reactant_device(
sharding=Reactant.Sharding.Replicated(mesh)
)
(::ReactantDevice{Missing, Missing, Reactant.Sharding.Replicated{Reactant.Sharding.Mesh{1, Vector{Int64}}}, Missing, Union{}}) (generic function with 1 method)
julia> device_sharded = reactant_device(
sharding=Reactant.Sharding.DimsSharding(mesh, (-1,), (:batch,))
)
(::ReactantDevice{Missing, Missing, Reactant.Sharding.DimsSharding{1, Tuple{Symbol}, Reactant.Sharding.Mesh{1, Vector{Int64}}}, Missing, Union{}}) (generic function with 1 method)