SymEngine not working in parallel

parallel

#1

I am attempting to run code I wrote with the SymEngine package in parallel using the @parallel for-loop, but I am getting errors I don’t understand. I have written a simple test script that reproduces a similar error when trying to run SymEngine in parallel.

My TestSymEngine.jl script contains:

# TestSymEngine.jl -- minimal reproduction script: runs the same small loops
# serially and with @parallel, first on plain Ints and then on SymEngine values.

#Confirm number of cores for parallel test
# (nprocs() is the number of Julia processes, master included, so
#  `julia -p 1` reports 2 here.)
println("Running on ", nprocs(), " cores")

#On all cores => load SymEngine and create symbolic variable r
# Each process evaluates these lines itself, so each one gets its own `r`
# and its own `SymE` alias for SymEngine.Basic.
@everywhere using SymEngine
@everywhere @vars r
@everywhere SymE = SymEngine.Basic

#Serial arrays
# A: input of ones; B: Int results; C: symbolic (SymE) results.
# B and C are allocated uninitialised and filled by the loops below.
A = Array{Int}([1, 1, 1, 1, 1, 1])
B = Array{Int}(6)
C = Array{SymE}(6)

#Parallel arrays
# Shared-array counterparts of A and B, plus Cp for the parallel SymEngine
# test. Note that Cp has element type Int, not SymE.
Ap = SharedArray{Int}([1, 1, 1, 1, 1, 1])
Bp = SharedArray{Int}(6)
Cp = SharedArray{Int}(6)

#Print A arrays
println("A = $A")
println("Ap = $Ap")

#Test serial for-loop without SymEngine
# B[i] = i, because every A[i] is 1.
for i=1:6
  B[i] = A[i] * i
end
println("B = $B")

#Test parallel for-loop without SymEngine
# Same computation as above, written into the shared array Bp;
# @sync waits for the @parallel loop to finish before printing.
@sync @parallel for i=1:6
  Bp[i] = Ap[i] * i
end
println("Bp = $Bp")

#Test serial for-loop with SymEngine
# C[i] = i*r, built on the master process with the master's `r`.
for i=1:6
  C[i] = B[i] * r
end
println("C = $C")

#Test parallel for-loop with SymEngine
# Copy C into a local so it can be interpolated with `$` into the
# @everywhere expression; this defines a global CC on every process whose
# value is the array that was built on the master.
# NOTE(review): the elements of CC are SymEngine.Basic objects created on the
# master. Presumably they wrap pointers into the master's libsymengine memory
# that are not meaningful in a worker process -- TODO confirm; this would
# match the segfault reported inside basic_div on the worker.
let
  local C1 = C
  @everywhere CC = $C1
end
# Defines a global C1 = 0 on every process; the loop below does not use it.
@everywhere C1 = Int(0)
@sync @parallel for i=1:6
  # Dividing i*r by r should give a symbolic integer i.
  # NOTE(review): `r` is a global referenced from the loop body; whether the
  # worker uses its own `r` or a copy shipped from the master is not visible
  # here -- verify.
  C2 = CC[i] / r
  # Cp is a SharedArray{Int}, so this assignment needs C2 to convert to Int.
  Cp[i] = C2
end
println("Cp = $Cp")

I then call the Julia REPL from a command prompt and run this script using include(). I get the following results/errors:

browning@batch28:~> julia -p 1
               _
   _       _ _(_)_     |  A fresh approach to technical computing
  (_)     | (_) (_)    |  Documentation: https://docs.julialang.org
   _ _   _| |_  __ _   |  Type "?help" for help.
  | | | | | | |/ _` |  |
  | | |_| | | | (_| |  |  Version 0.6.2 (2017-12-13 18:08 UTC)
 _/ |\__'_|_|_|\__'_|  |  Official http://julialang.org/ release
|__/                   |  x86_64-pc-linux-gnu

julia> include("TestSymEngine.jl")
Running on 2 cores
A = [1, 1, 1, 1, 1, 1]
Ap = [1, 1, 1, 1, 1, 1]
B = [1, 2, 3, 4, 5, 6]
Bp = [1, 2, 3, 4, 5, 6]
C = SymEngine.Basic[r, 2*r, 3*r, 4*r, 5*r, 6*r]

signal (11): Segmentation fault
while loading no file, in expression starting on line 0
_ZN9SymEngine3divERKNS_3RCPIKNS_5BasicEEES5_ at /p/home/browning/.julia/v0.6/Conda/deps/usr/lib/libsymengine.so.0.3 (unknown line)
basic_div at /p/home/browning/.julia/v0.6/Conda/deps/usr/lib/libsymengine.so.0.3 (unknown line)
/ at /p/home/browning/.julia/v0.6/SymEngine/src/mathops.jl:16
unknown function (ip: 0x7f0486f58716)
jl_call_fptr_internal at /buildworker/worker/package_linux64/build/src/julia_internal.h:339 [inlined]
jl_call_method_internal at /buildworker/worker/package_linux64/build/src/julia_internal.h:358 [inlined]
jl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:1926
macro expansion at /p/home/browning/TestSymEngine.jl:48 [inlined]
#23 at ./distributed/macros.jl:174
#158 at ./distributed/macros.jl:20
unknown function (ip: 0x7f0486f5830f)
jl_call_fptr_internal at /buildworker/worker/package_linux64/build/src/julia_internal.h:339 [inlined]
jl_call_method_internal at /buildworker/worker/package_linux64/build/src/julia_internal.h:358 [inlined]
jl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:1926
jl_apply at /buildworker/worker/package_linux64/build/src/julia.h:1424 [inlined]
jl_f__apply at /buildworker/worker/package_linux64/build/src/builtins.c:426
#103 at ./distributed/process_messages.jl:264 [inlined]
run_work_thunk at ./distributed/process_messages.jl:56
run_work_thunk at ./distributed/process_messages.jl:65 [inlined]
#96 at ./event.jl:73
unknown function (ip: 0x7f0486f4d41f)
jl_call_fptr_internal at /buildworker/worker/package_linux64/build/src/julia_internal.h:339 [inlined]
jl_call_method_internal at /buildworker/worker/package_linux64/build/src/julia_internal.h:358 [inlined]
jl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:1926
jl_apply at /buildworker/worker/package_linux64/build/src/julia.h:1424 [inlined]
start_task at /buildworker/worker/package_linux64/build/src/task.c:267
unknown function (ip: 0xffffffffffffffff)
Allocations: 2497544 (Pool: 2496326; Big: 1218); GC: 3
Worker 2 terminated.ERROR:
LoadError: ProcessExitedException()ERROR (unhandled task failure): read: connection reset by peer (ECONNRESET)

Stacktrace:
 [1] try_yieldto(::Base.##296#297{Task}, ::Task) at ./event.jl:189
 [2] wait() at ./event.jl:234
 [3] wait(::Condition) at ./event.jl:27
 [4] wait_impl(::Channel{Any}) at ./channels.jl:364
 [5] wait(::Channel{Any}) at ./channels.jl:360
 [6] take_buffered at ./channels.jl:319 [inlined]
 [7] take!(::Channel{Any}) at ./channels.jl:317
 [8] #remotecall_fetch#141(::Array{Any,1}, ::Function, ::Function, ::Base.Distributed.Worker, ::Base.Distributed.RRID, ::Vararg{Any,N} where N) at ./distributed/remotecall.jl:350
 [9] remotecall_fetch(::Function, ::Base.Distributed.Worker, ::Base.Distributed.RRID, ::Vararg{Any,N} where N) at ./distributed/remotecall.jl:346
 [10] #remotecall_fetch#144(::Array{Any,1}, ::Function, ::Function, ::Int64, ::Base.Distributed.RRID, ::Vararg{Any,N} where N) at ./distributed/remotecall.jl:367
 [11] remotecall_fetch(::Function, ::Int64, ::Base.Distributed.RRID, ::Vararg{Any,N} where N) at ./distributed/remotecall.jl:367
 [12] call_on_owner(::Function, ::Future, ::Int64, ::Vararg{Int64,N} where N) at ./distributed/remotecall.jl:440
 [13] wait(::Future) at ./distributed/remotecall.jl:455
 [14] sync_end() at ./task.jl:274
 [15] include_from_node1(::String) at ./loading.jl:576
 [16] include(::String) at ./sysimg.jl:14
while loading /p/home/browning/TestSymEngine.jl, in expression starting on line 303

julia>

Now it gets a little weird… by sheer chance, I tried running it again with no changes whatsoever. The following comes directly after the above, in the same REPL session, with no editing:

julia> include("TestSymEngine.jl")
Running on 1 cores
A = [1, 1, 1, 1, 1, 1]
Ap = [1, 1, 1, 1, 1, 1]
B = [1, 2, 3, 4, 5, 6]
Bp = [1, 2, 3, 4, 5, 6]
C = SymEngine.Basic[r, 2*r, 3*r, 4*r, 5*r, 6*r]
Cp = [1, 2, 3, 4, 5, 6]

julia>

This is the output I would expect from the code, but what is happening that’s causing it to crash on the first pass through but then run on a second try?

Any help is MOST appreciated.


#2

I think it might be the let @everywhere causing trouble
one possibility from the repl help
?> let statements allocate new variable bindings each time they run
?> @everywhere is equiv to calling eval(Main, expr) on all processes
So there may be different bindings/memory locations for the CC var per process (Doesn’t sound too bad, unsure)
The other thing it says is that @everywhere won’t capture local vars without @eval


#3

Did you ever solve this?

Running into the same issue while using pmap


here’s my error log:

Progress:   0%|                                         |  ETA: N/A
signal (11): Segmentation fault: 11
while loading no file, in expression starting on line 0
_ZN9SymEngine3divERKNS_3RCPIKNS_5BasicEEES5_ at /Users/dan/.julia/v0.6/SymEngine/deps/symengine-0.3/lib/libsymengine.0.3.0.dylib (unknown line)
basic_div at /Users/dan/.julia/v0.6/SymEngine/deps/symengine-0.3/lib/libsymengine.0.3.0.dylib (unknown line)
/ at /Users/dan/.julia/v0.6/SymEngine/src/mathops.jl:16
/ at ./promotion.jl:252
unknown function (ip: 0x11b4c294a)
R_0_from_beta at /Users/dan/.julia/v0.6/Fussy/src/methods/reactors/constraints/r_0_from_beta.jl:6
unknown function (ip: 0x11b4c6392)
calc_R_0 at /Users/dan/.julia/v0.6/Fussy/src/methods/reactors/calculations/calc_r_0.jl:8
#89 at /Users/dan/.julia/v0.6/Fussy/src/methods/reactors/match.jl:23
work_f at /Users/dan/.julia/v0.6/Fussy/src/utils/find_roots.jl:11
unknown function (ip: 0x11b4b5996)
jl_apply at /Users/osx/buildbot/slave/package_osx64/build/src/./julia.h:1424 [inlined]
jl_f__apply at /Users/osx/buildbot/slave/package_osx64/build/src/builtins.c:426
broadcast_t at ./broadcast.jl:258
broadcast_c at ./broadcast.jl:321 [inlined]
broadcast at ./broadcast.jl:455 [inlined]
find_bisection_roots at /Users/dan/.julia/v0.6/Fussy/src/utils/find_roots.jl:132
unknown function (ip: 0x11b4b29f9)
find_root_list at /Users/dan/.julia/v0.6/Fussy/src/utils/find_roots.jl:118
unknown function (ip: 0x11b48d189)
#_find_recursive_roots#3 at /Users/dan/.julia/v0.6/Fussy/src/utils/find_roots.jl:39
#_find_recursive_roots at ./<missing>:0
#find_roots#1 at /Users/dan/.julia/v0.6/Fussy/src/utils/find_roots.jl:13
#find_roots at ./<missing>:0
_match at /Users/dan/.julia/v0.6/Fussy/src/methods/reactors/match.jl:41
unknown function (ip: 0x11b4a258d)
match at /Users/dan/.julia/v0.6/Fussy/src/methods/reactors/match.jl:35
#hone#94 at /Users/dan/.julia/v0.6/Fussy/src/methods/reactors/hone.jl:14
unknown function (ip: 0x11b4a1530)
#76 at /Users/dan/.julia/v0.6/Fussy/src/structs/calibration.jl:51
work_f at /Users/dan/.julia/v0.6/Fussy/src/utils/find_roots.jl:11
unknown function (ip: 0x11b49fc96)
jl_apply at /Users/osx/buildbot/slave/package_osx64/build/src/./julia.h:1424 [inlined]
jl_f__apply at /Users/osx/buildbot/slave/package_osx64/build/src/builtins.c:426
broadcast_t at ./broadcast.jl:258
broadcast_c at ./broadcast.jl:321 [inlined]
broadcast at ./broadcast.jl:455 [inlined]
find_bisection_roots at /Users/dan/.julia/v0.6/Fussy/src/utils/find_roots.jl:132
unknown function (ip: 0x11b48f529)
find_root_list at /Users/dan/.julia/v0.6/Fussy/src/utils/find_roots.jl:118
unknown function (ip: 0x11b48d189)
#_find_recursive_roots#3 at /Users/dan/.julia/v0.6/Fussy/src/utils/find_roots.jl:39
#_find_recursive_roots at ./<missing>:0
#find_roots#1 at /Users/dan/.julia/v0.6/Fussy/src/utils/find_roots.jl:13
#find_roots at ./<missing>:0
#75 at /Users/dan/.julia/v0.6/Fussy/src/structs/calibration.jl:65
unknown function (ip: 0x11b487225)
jl_apply at /Users/osx/buildbot/slave/package_osx64/build/src/./julia.h:1424 [inlined]
jl_f__apply at /Users/osx/buildbot/slave/package_osx64/build/src/builtins.c:426
#2 at /Users/dan/.julia/v0.6/PmapProgressMeter/src/PmapProgressMeter.jl:34
jl_apply at /Users/osx/buildbot/slave/package_osx64/build/src/./julia.h:1424 [inlined]
jl_f__apply at /Users/osx/buildbot/slave/package_osx64/build/src/builtins.c:426
#106 at ./distributed/process_messages.jl:268 [inlined]
run_work_thunk at ./distributed/process_messages.jl:56
macro expansion at ./distributed/process_messages.jl:268 [inlined]
#105 at ./event.jl:73
jl_apply at /Users/osx/buildbot/slave/package_osx64/build/src/./julia.h:1424 [inlined]
start_task at /Users/osx/buildbot/slave/package_osx64/build/src/task.c:267
Allocations: 27118571 (Pool: 27115554; Big: 3017); GC: 58
ProcessExitedException()

edit: I’m guessing it’s some type promotion issue involving basic_div

i.e.

cur_var = 1.0 /  SymEngine.symbols("cur_sym")

#4

Don’t really know what happened here. It seems like the r value in the initial post is defined outside scope.

The way to “fix” it would be changing:

# Original loop: the body divides by the global symbol `r`.
@sync @parallel for i=1:6
  C2 = CC[i] / r
  Cp[i] = C2
end

to

# Workaround: build the symbol inside the loop body with symbols(:r)
# instead of referencing the global `r`.
@sync @parallel for i=1:6
  C2 = CC[i] / symbols(:r)
  Cp[i] = C2
end

(personally, I wouldn’t really call this a fix)


In my situation, it involved changing multiple files according to the same pattern as above.

Albeit very ugly, it works :confused:

This would involve changing all functions like:

"""
    a(cur_reactor::AbstractReactor)

Return the product of the reactor's `epsilon` and `R_0` fields.
"""
function a(cur_reactor::AbstractReactor)
  return cur_reactor.epsilon * cur_reactor.R_0
end

to:

"""
    safe_symbol(cur_reactor::AbstractReactor, cur_field::Symbol)

Read field `cur_field` from `cur_reactor`. If the stored value is a
`SymEngine.Basic`, return a newly created symbol named after the field
(`symbols(cur_field)`); otherwise return the stored value unchanged.
"""
function safe_symbol(cur_reactor::AbstractReactor, cur_field::Symbol)
  field_value = getfield(cur_reactor, cur_field)
  if isa(field_value, SymEngine.Basic)
    # Symbolic field: hand back a fresh symbol instead of the stored object.
    return symbols(cur_field)
  end
  return field_value
end
"""
    a(cur_reactor::AbstractReactor)

Return the reactor's `epsilon` multiplied by its `R_0` value, with `R_0`
obtained through `safe_symbol` rather than read directly from the field.
"""
function a(cur_reactor::AbstractReactor)
  return cur_reactor.epsilon * safe_symbol(cur_reactor, :R_0)
end

In total, I had to make 10+ of these changes throughout my code.