Apologies, here is the stack trace:
[35356] signal 11 (1): Segmentation fault
in expression starting at REPL[18]:1
_ZN3NEO12BufferObject14fillExecObjectERNS_10ExecObjectEPNS_9OsContextEjj at /home/*/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
__x86_return_thunk at /home/*/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN3NEO12BufferObject4execEjmjbPNS_9OsContextEjjPKPS0_mPNS_10ExecObjectEmm at /home/*/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN3NEO24DrmCommandStreamReceiverINS_13Gen12LpFamilyEE4execERKNS_11BatchBufferEjjj at /home/*/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN3NEO24DrmCommandStreamReceiverINS_13Gen12LpFamilyEE13flushInternalERKNS_11BatchBufferERKSt6vectorIPNS_18GraphicsAllocationESaIS8_EE at /home/*/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN3NEO24DrmCommandStreamReceiverINS_13Gen12LpFamilyEE5flushERNS_11BatchBufferERSt6vectorIPNS_18GraphicsAllocationESaIS7_EE at /home/*/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN3NEO21CommandStreamReceiver17submitBatchBufferERNS_11BatchBufferERSt6vectorIPNS_18GraphicsAllocationESaIS5_EE at /home/*/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN2L015CommandQueueImp17submitBatchBufferEmRSt6vectorIPN3NEO18GraphicsAllocationESaIS4_EEPvb at /home/*/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN2L014CommandQueueHwIL14GFXCORE_FAMILY18EE26executeCommandListsRegularERNS2_27CommandListExecutionContextEjPP25_ze_command_list_handle_tP18_ze_fence_handle_tPN3NEO12LinearStreamE at /home/*/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN2L014CommandQueueHwIL14GFXCORE_FAMILY18EE19executeCommandListsEjPP25_ze_command_list_handle_tP18_ze_fence_handle_tbPN3NEO12LinearStreamE at /home/*/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN25ur_queue_handle_legacy_t_18executeCommandListENSt8__detail14_Node_iteratorISt4pairIKP25_ze_command_list_handle_t22ur_command_list_info_tELb0ELb0EEEbb at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libur_adapter_level_zero.so.0 (unknown line)
_ZN25ur_queue_handle_legacy_t_19enqueueKernelLaunchEP19ur_kernel_handle_t_jPKmS3_S3_jPKP18ur_event_handle_t_PS5_ at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libur_adapter_level_zero.so.0 (unknown line)
_ZN9ur_loader21urEnqueueKernelLaunchEP18ur_queue_handle_t_P19ur_kernel_handle_t_jPKmS5_S5_jPKP18ur_event_handle_t_PS7_ at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/./libur_loader.so.0 (unknown line)
urEnqueueKernelLaunch at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/./libur_loader.so.0 (unknown line)
_ZN4sycl3_V16detail16enqueueImpKernelERKSt10shared_ptrINS1_10queue_implEERNS1_8NDRDescTERSt6vectorINS1_7ArgDescESaISA_EERKS2_INS1_18kernel_bundle_implEERKS2_INS1_11kernel_implEERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEERS9_IP18ur_event_handle_t_SaISV_EERKS2_INS1_10event_implEERKSt8functionIFPvPNS1_16AccessorImplHostEEE24ur_kernel_cache_config_tbbPKNS1_19RTDeviceBinaryImageE at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libsycl.so.8 (unknown line)
_ZZN4sycl3_V17handler8finalizeEvENK3$_0clEv at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libsycl.so.8 (unknown line)
_ZN4sycl3_V17handler8finalizeEv at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libsycl.so.8 (unknown line)
_ZN4sycl3_V16detail10queue_impl15finalizeHandlerINS0_7handlerEEEvRT_RNS0_5eventE at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libsycl.so.8 (unknown line)
_ZN4sycl3_V16detail10queue_impl11submit_implERKSt8functionIFvRNS0_7handlerEEERKSt10shared_ptrIS2_ESD_SD_bRKNS1_13code_locationEPKS3_IFvbbRNS0_5eventEEE at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libsycl.so.8 (unknown line)
_ZN4sycl3_V16detail10queue_impl6submitERKSt8functionIFvRNS0_7handlerEEERKSt10shared_ptrIS2_ERKNS1_13code_locationEPKS3_IFvbbRNS0_5eventEEE at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libsycl.so.8 (unknown line)
_ZN4sycl3_V15queue11submit_implESt8functionIFvRNS0_7handlerEEERKNS0_6detail13code_locationE at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libsycl.so.8 (unknown line)
_ZN6oneapi3mkl3gpu20launch_kernel_3D_usmEPiPN4sycl3_V15queueEP23mkl_gpu_kernel_struct_tP18mkl_gpu_argument_tPmSB_P20mkl_gpu_event_list_t at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libmkl_sycl_blas.so.5 (unknown line)
_ZN6oneapi3mkl3gpu28mkl_blas_gpu_launch_s_nocopyEPiPN4sycl3_V15queueEP23mkl_gpu_kernel_struct_tPKNS1_16CommonDriverInfoEPKNS1_17EvaluateAuxOutputEbbb17gpu_update_type_tllllPcSG_SG_mmmlllNS0_16value_or_pointerIfEESI_PvSG_bbP20mkl_gpu_event_list_t at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libmkl_sycl_blas.so.5 (unknown line)
_ZN6oneapi3mkl3gpu37mkl_blas_gpu_sgemm_nocopy_driver_syclEPiPN4sycl3_V15queueEPNS1_14blas_arg_usm_tEP20mkl_gpu_event_list_t at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libmkl_sycl_blas.so.5 (unknown line)
_ZN6oneapi3mkl3gpu30mkl_blas_gpu_sgemm_driver_syclEPiPN4sycl3_V15queueEPNS1_14blas_arg_usm_tEP20mkl_gpu_event_list_t at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libmkl_sycl_blas.so.5 (unknown line)
_ZN6oneapi3mkl3gpu19sgemm_sycl_internalEPN4sycl3_V15queueE10MKL_LAYOUT13MKL_TRANSPOSES7_lllNS0_16value_or_pointerIfEEPKflSB_lS9_PflNS0_4blas12compute_modeERKSt6vectorINS3_5eventESaISG_EElll at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libmkl_sycl_blas.so.5 (unknown line)
_ZN6oneapi3mkl3gpu10sgemm_syclEPN4sycl3_V15queueE10MKL_LAYOUT13MKL_TRANSPOSES7_lllNS0_16value_or_pointerIfEEPKflSB_lS9_PflNS0_4blas12compute_modeERKSt6vectorINS3_5eventESaISG_EElll at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libmkl_sycl_blas.so.5 (unknown line)
_ZN6oneapi3mkl4blas5sgemmERN4sycl3_V15queueE10MKL_LAYOUTNS0_9transposeES7_lllNS0_16value_or_pointerIfEEPKflSB_lS9_PflNS1_12compute_modeERKSt6vectorINS3_5eventESaISF_EE at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libmkl_sycl_blas.so.5 (unknown line)
_ZN6oneapi3mkl4blas12column_major4gemmERN4sycl3_V15queueENS0_9transposeES7_lllNS0_16value_or_pointerIfEEPKflSB_lS9_PflNS1_12compute_modeERKSt6vectorINS4_5eventESaISF_EE at /home/*/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libmkl_sycl_blas.so.5 (unknown line)
onemklSgemm at /workspace/srcdir/oneAPI.jl/deps/src/onemkl.cpp:560
onemklSgemm at /home/*/.julia/packages/oneAPI/CNvkW/lib/support/liboneapi_support.jl:331
unknown function (ip: 0x7f37182c3e8e)
gemm! at /home/*/.julia/packages/oneAPI/CNvkW/lib/mkl/wrappers_blas.jl:1212
generic_matmatmul! at /home/*/.julia/packages/oneAPI/CNvkW/lib/mkl/linalg.jl:148
generic_matmatmul! at /home/*/.julia/packages/oneAPI/CNvkW/lib/mkl/linalg.jl:123 [inlined]
_mul! at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/usr/share/julia/stdlib/v1.11/LinearAlgebra/src/matmul.jl:287 [inlined]
mul! at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/usr/share/julia/stdlib/v1.11/LinearAlgebra/src/matmul.jl:285 [inlined]
mul! at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/usr/share/julia/stdlib/v1.11/LinearAlgebra/src/matmul.jl:253 [inlined]
* at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/usr/share/julia/stdlib/v1.11/LinearAlgebra/src/matmul.jl:124 [inlined]
#551 at /home/*/.julia/packages/ChainRules/sm2ny/src/rulesets/Base/arraymath.jl:56 [inlined]
unthunk at /home/*/.julia/packages/ChainRulesCore/U6wNx/src/tangent_types/thunks.jl:213 [inlined]
unthunk at /home/*/.julia/packages/ChainRulesCore/U6wNx/src/tangent_types/thunks.jl:252 [inlined]
unthunk_tangent at /home/*/.julia/packages/Zygote/3To5I/src/compiler/chainrules.jl:3 [inlined]
map at ./tuple.jl:357 [inlined]
map at ./namedtuple.jl:266 [inlined]
unthunk_tangent at /home/*/.julia/packages/ZygoteRules/CkVIK/src/adjoint.jl:39 [inlined]
map at ./tuple.jl:357 [inlined]
unthunk_tangent at /home/*/.julia/packages/ZygoteRules/CkVIK/src/adjoint.jl:38 [inlined]
map at ./tuple.jl:355
unknown function (ip: 0x7f37021a6716)
map at ./namedtuple.jl:266 [inlined]
unthunk_tangent at /home/*/.julia/packages/ZygoteRules/CkVIK/src/adjoint.jl:39 [inlined]
map at ./tuple.jl:355 [inlined]
unthunk_tangent at /home/*/.julia/packages/ZygoteRules/CkVIK/src/adjoint.jl:38 [inlined]
tailmemaybe at /home/*/.julia/packages/Zygote/3To5I/src/compiler/interface.jl:40
unknown function (ip: 0x7f37021a6495)
#88 at /home/*/.julia/packages/Zygote/3To5I/src/compiler/interface.jl:97
unknown function (ip: 0x7f3702168fc6)
withgradient at /home/*/.julia/packages/Zygote/3To5I/src/compiler/interface.jl:219
#withgradient#5 at /home/*/.julia/packages/Flux/BkG8S/src/gradient.jl:182 [inlined]
withgradient at /home/*/.julia/packages/Flux/BkG8S/src/gradient.jl:169
unknown function (ip: 0x7f3702972af6)
jl_apply at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/julia.h:2157 [inlined]
do_call at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/interpreter.c:126
eval_value at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/interpreter.c:223
eval_stmt_value at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/interpreter.c:174 [inlined]
eval_body at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/interpreter.c:663
jl_interpret_toplevel_thunk at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/interpreter.c:821
jl_toplevel_eval_flex at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/toplevel.c:943
jl_toplevel_eval_flex at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/toplevel.c:886
ijl_toplevel_eval_in at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/toplevel.c:994
eval at ./boot.jl:430 [inlined]
eval_user_input at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:245
repl_backend_loop at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:342
#start_repl_backend#59 at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:327
start_repl_backend at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:324
#run_repl#72 at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:483
run_repl at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:469
jfptr_run_repl_10097.1 at /home/*/.julia/juliaup/julia-1.11.3+0.x64.linux.gnu/share/julia/compiled/v1.11/REPL/u0gqU_XvZAg.so (unknown line)
#1150 at ./client.jl:446
jfptr_YY.1150_14693.1 at /home/*/.julia/juliaup/julia-1.11.3+0.x64.linux.gnu/share/julia/compiled/v1.11/REPL/u0gqU_XvZAg.so (unknown line)
jl_apply at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/julia.h:2157 [inlined]
jl_f__call_latest at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/builtins.c:875
#invokelatest#2 at ./essentials.jl:1055 [inlined]
invokelatest at ./essentials.jl:1052 [inlined]
run_main_repl at ./client.jl:430
repl_main at ./client.jl:567 [inlined]
_start at ./client.jl:541
jfptr__start_73609.1 at /home/*/.julia/juliaup/julia-1.11.3+0.x64.linux.gnu/lib/julia/sys.so (unknown line)
jl_apply at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/julia.h:2157 [inlined]
true_main at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/jlapi.c:900
jl_repl_entrypoint at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/jlapi.c:1059
main at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/cli/loader_exe.c:58
unknown function (ip: 0x7f37ab565d8f)
__libc_start_main at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
unknown function (ip: 0x4010b8)
Allocations: 153102696 (Pool: 153097790; Big: 4906); GC: 75
[1] 35356 segmentation fault (core dumped) julia
The Julia code (adapted from the linked Flux docs):
# Reproduction script: trains a small MLP on a noisy XOR problem with Flux, moving the
# model and each batch to the GPU via `gpu_device()`. oneAPI is loaded in place of the
# CUDA package used by the original version of this example.

# Install everything, including oneAPI (CUDA in the original version), and load packages:
using Flux, Statistics, ProgressMeter
using oneAPI  # using CUDA in original version
device = gpu_device()  # function to move data and model to the GPU

# Generate some data for the XOR problem: vectors of length 2, as columns of a matrix:
noisy = rand(Float32, 2, 1000)  # 2×1000 Matrix{Float32}
truth = [xor(col[1]>0.5, col[2]>0.5) for col in eachcol(noisy)]  # 1000-element Vector{Bool}

# Define our model, a multi-layer perceptron with one hidden layer of size 3:
model = Chain(
    Dense(2 => 3, tanh),  # activation function inside layer
    BatchNorm(3),
    Dense(3 => 2)) |> device  # move model to GPU, if one is available

# The model encapsulates parameters, randomly initialised. Its initial output is:
out1 = model(noisy |> device)  # 2×1000 Matrix{Float32}, or oneArray{Float32} on the GPU
probs1 = softmax(out1) |> cpu  # normalise to get probabilities (and move off GPU)

# To train the model, we use batches of 64 samples, and one-hot encoding:
target = Flux.onehotbatch(truth, [true, false])  # 2×1000 OneHotMatrix
# NOTE: batchsize=64 is the setting used when the crash was observed; keep it as-is
# when reproducing.
loader = Flux.DataLoader((noisy, target), batchsize=64, shuffle=true);

opt_state = Flux.setup(Flux.Adam(0.01), model)  # will store optimiser momentum, etc.

# Training loop, using the whole data set 1000 times:
# Concretely typed log instead of an untyped `[]` (which would be a Vector{Any}).
# Assumes the loss is a Float32 scalar, since model and data are Float32; `push!`
# converts other real values to Float32.
losses = Float32[]
@showprogress for epoch in 1:1_000
    for xy_cpu in loader
        # Unpack batch of data, and move to GPU:
        x, y = xy_cpu |> device
        loss, grads = Flux.withgradient(model) do m
            # Evaluate model and loss inside gradient context:
            y_hat = m(x)
            Flux.logitcrossentropy(y_hat, y)
        end
        Flux.update!(opt_state, model, grads[1])
        push!(losses, loss)  # logging, outside gradient context
    end
end
Basic functions like `oneArray` and `oneAPI.versioninfo()` work fine, and `gpu_device` returns the oneAPI device as expected. The above code does run (though around 8x slower than on the CPU) if I change the `batchsize` from 64 to 2.