Hi,
I am having some issues combining multi-threading with MPI. For the multi-threading I am using the
FLoops.jl package. This is the threaded function:
"""
    func(mag, l, J, ϵ, nphi, ens)

For each field value `h` in `mag`, evaluate `getEE` on `ens` independently built
Hamiltonians and return the standard deviation of those `ens` values.

# Arguments
- `mag`: collection of field values; the result is allocated with `similar(mag)`.
- `l`: size parameter forwarded to `buildBasis` and the Hamiltonian builders.
  `Int(l / 2)` throws an `InexactError` when `l` is odd.
- `J`: parameter forwarded to `buildHNonRnd`.
- `ϵ`: interpolation weight for the target value passed to `shiftInverse`
  (`ϵ = 0` gives the `:LM` eigenvalue, `ϵ = 1` the `:SM` eigenvalue).
- `nphi`: forwarded to `shiftInverse` and `getEE`.
- `ens`: number of samples drawn per field value.

# Returns
An array like `mag` whose `i`-th entry is `std` of the `ens` sampled `getEE` values
for `mag[i]`.
"""
function func(mag, l, J, ϵ, nphi, ens)
    basis = buildBasis(l)
    N = 2^l
    η = binomial(l, Int(l / 2))
    spread = similar(mag)

    for (idx, h) in enumerate(mag)
        # A single sample is used to fix the target value for this field strength.
        probe = buildHNonRnd(η, l, basis, J) + buildHRnd(h, basis, η, l)
        lo = eigs(probe; nev=1, which=:SM, ritzvec=false)[1]
        hi = eigs(probe; nev=1, which=:LM, ritzvec=false)[1]
        target = (lo[1] - hi[1]) * ϵ + hi[1]

        fixedPart = buildHNonRnd(η, l, basis, J)
        samples = Vector{Float64}(undef, ens)
        # Each iteration writes only its own slot of `samples`.
        @floop for m in 1:ens
            sample = fixedPart + buildHRnd(h, basis, η, l)
            state = shiftInverse(sample, target, nphi)
            samples[m] = getEE(state, N, η, l, nphi)
        end

        spread[idx] = std(samples)
        print("completed h = $h \n")
        flush(stdout)
    end

    return spread
end
I have also slightly changed the MPI part:
"""
    main()

Split a fixed grid of field values evenly across the MPI ranks, run `func` on each
rank's share, gather the per-rank results on rank 0, and print them there.

Every rank receives `div(length(h_arr), nranks)` values, scattered from rank 0. When
the number of ranks does not divide the grid length, the trailing values are not
assigned to any rank; rank 0 emits a warning in that case so the omission is visible.
"""
function main()
    L = 4
    sampleSize = 500
    h_arr = round.(LinRange(0.1, 8, 30), digits=3)
    J = 1.0
    ϵ = 0.5
    nphi = 5

    MPI.Init()
    comm = MPI.COMM_WORLD
    # Named `nranks` rather than `size` so that `Base.size` is not shadowed.
    nranks = MPI.Comm_size(comm)
    rank = MPI.Comm_rank(comm)

    # Equal-sized chunks only: integer division discards any remainder.
    magLenNodes = div(length(h_arr), nranks)
    leftover = length(h_arr) - magLenNodes * nranks
    if rank == 0 && leftover != 0
        @warn "Number of field values is not divisible by the number of MPI ranks; " *
              "the trailing values will not be computed" total = length(h_arr) nranks leftover
    end

    # Receive buffer for this rank's share of `h_arr`.
    recv_buf = zeros(magLenNodes)
    MPI.Scatter!(MPI.UBuffer(h_arr, magLenNodes), recv_buf, 0, comm)

    magRec = func(recv_buf, L, J, ϵ, nphi, sampleSize)

    recv_msg = MPI.Gather(magRec, 0, comm)
    if rank == 0
        print("completed, $recv_msg \n")
    end
    MPI.Barrier(comm)
end
I ran the code with `JULIA_NUM_THREADS=4` and `mpiexec -n 2 julia example.jl`, and got this error:
signal (11): Segmentation fault
signal (11): Segmentation fault
in expression starting at /scratch/20ph92r03/juliafiles/proj_2/mblPTmpi.jl:314
in expression starting at /scratch/20ph92r03/juliafiles/proj_2/mblPTmpi.jl:314
jl_gc_alloc at /home/20ph92r03/julia-1.8.0/bin/../lib/julia/libjulia-internal.so.1 (unknown line)
ijl_alloc_array_1d at /home/20ph92r03/julia-1.8.0/bin/../lib/julia/libjulia-internal.so.1 (unknown line)
Array at ./boot.jl:459 [inlined]
Array at ./boot.jl:468 [inlined]
similar at ./array.jl:378 [inlined]
similar at ./subarray.jl:65 [inlined]
similar at ./abstractarray.jl:797 [inlined]
* at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/usr/share/julia/stdlib/v1.8/LinearAlgebra/src/matmul.jl:101 [inlined]
orthogonalize! at /home/20ph92r03/.julia/packages/ArnoldiMethod/JdEiw/src/expansion.jl:89
iterate_arnoldi! at /home/20ph92r03/.julia/packages/ArnoldiMethod/JdEiw/src/expansion.jl:123
_partialschur at /home/20ph92r03/.julia/packages/ArnoldiMethod/JdEiw/src/run.jl:185
jl_gc_state_set at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/src/julia_threads.h:340 [inlined]
#partialschur#1 at /home/20ph92r03/.julia/packages/ArnoldiMethod/JdEiw/src/run.jl:106
maybe_collect at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/src/julia_threads.h:333 [inlined]
jl_gc_pool_alloc_inner at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/src/gc.c:1247 [inlined]
partialschur##kw at /home/20ph92r03/.julia/packages/ArnoldiMethod/JdEiw/src/run.jl:94 [inlined]
shiftInverse at /scratch/20ph92r03/juliafiles/proj_2/mblPTmpi.jl:164
jl_gc_pool_alloc_noinline at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/src/gc.c:1306 [inlined]
macro expansion at /scratch/20ph92r03/juliafiles/proj_2/mblPTmpi.jl:217 [inlined]
__##reducing_function#313 at /home/20ph92r03/.julia/packages/FLoops/3ZEuy/src/reduce.jl:817 [inlined]
AdjoinIdentity at /home/20ph92r03/.julia/packages/InitialValues/OWP8V/src/InitialValues.jl:306
jl_gc_alloc_ at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/src/julia_internal.h:369 [inlined]
jl_gc_alloc at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/src/gc.c:3371
next at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/combinators.jl:290 [inlined]
next at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/core.jl:289 [inlined]
macro expansion at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/core.jl:181 [inlined]
macro expansion at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/processes.jl:199 [inlined]
macro expansion at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/simd.jl:41 [inlined]
_foldl_linear_bulk at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/processes.jl:198 [inlined]
macro expansion at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/processes.jl:192 [inlined]
macro expansion at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/basics.jl:115 [inlined]
_foldl_array at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/processes.jl:188 [inlined]
__foldl__ at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/processes.jl:182 [inlined]
foldl_basecase at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/processes.jl:365 [inlined]
_reduce_basecase at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/threading_utils.jl:56
_reduce at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/reduce.jl:150
#167 at ./threadingconstructs.jl:258
unknown function (ip: 0x2ad40bc67caf)
_new_array_ at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/src/array.c:134 [inlined]
_new_array at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/src/array.c:198 [inlined]
ijl_alloc_array_1d at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/src/array.c:436
_jl_invoke at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/src/gf.c:2367 [inlined]
ijl_apply_generic at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/src/gf.c:2549
Array at ./boot.jl:459 [inlined]
Array at ./boot.jl:468 [inlined]
similar at ./array.jl:378 [inlined]
similar at ./abstractarray.jl:797 [inlined]
getrf! at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/usr/share/julia/stdlib/v1.8/LinearAlgebra/src/lapack.jl:563
#lu!#172 at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/usr/share/julia/stdlib/v1.8/LinearAlgebra/src/lu.jl:81 [inlined]
lu!##kw at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/usr/share/julia/stdlib/v1.8/LinearAlgebra/src/lu.jl:80 [inlined]
#lu#179 at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/usr/share/julia/stdlib/v1.8/LinearAlgebra/src/lu.jl:279 [inlined]
lu at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/usr/share/julia/stdlib/v1.8/LinearAlgebra/src/lu.jl:278 [inlined]
lu at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/usr/share/julia/stdlib/v1.8/LinearAlgebra/src/lu.jl:278 [inlined]
shiftInverse at /scratch/20ph92r03/juliafiles/proj_2/mblPTmpi.jl:160
macro expansion at /scratch/20ph92r03/juliafiles/proj_2/mblPTmpi.jl:217 [inlined]
__##reducing_function#313 at /home/20ph92r03/.julia/packages/FLoops/3ZEuy/src/reduce.jl:817 [inlined]
AdjoinIdentity at /home/20ph92r03/.julia/packages/InitialValues/OWP8V/src/InitialValues.jl:306
next at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/combinators.jl:290 [inlined]
next at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/core.jl:289 [inlined]
macro expansion at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/core.jl:181 [inlined]
macro expansion at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/processes.jl:199 [inlined]
macro expansion at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/simd.jl:41 [inlined]
_foldl_linear_bulk at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/processes.jl:198 [inlined]
macro expansion at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/processes.jl:192 [inlined]
macro expansion at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/basics.jl:115 [inlined]
_foldl_array at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/processes.jl:188 [inlined]
__foldl__ at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/processes.jl:182 [inlined]
foldl_basecase at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/processes.jl:365 [inlined]
_reduce_basecase at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/threading_utils.jl:56
_reduce at /home/20ph92r03/.julia/packages/Transducers/HBMTc/src/reduce.jl:150
#167 at ./threadingconstructs.jl:258
unknown function (ip: 0x2ad40bc67caf)
_jl_invoke at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/src/gf.c:2367 [inlined]
ijl_apply_generic at /cache/build/default-amdci4-3/julialang/julia-release-1-dot-8/src/gf.c:2549
===================================================================================
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
= RANK 1 PID 53989 RUNNING AT cn039
= KILLED BY SIGNAL: 11 (Segmentation fault)
===================================================================================
I read in the MPI.jl documentation (Known issues · MPI.jl) that multi-threading has known issues.
However, the fix prescribed there doesn't work for me. I even tried setting the environment variable `UCX_ERROR_SIGNALS=""`, but it still crashes.
Can you please suggest something I can try? The code works without multi-threading, though!