Hi All,

I was implementing a distributed conjugate gradient method in the course of my research and have run into a numerical issue when distributing the code to multiple processes. Algorithmically, my single process and multi-process versions of code should produce the exact same values, but on my main work desktop (Intel 5930k, Ubuntu 16.04, Julia 0.6) there are numerical differences when I run the code async in one process vs. in two processes.

The test code is as follows. It is a little long, but I cannot reduce further without affecting the reproducibility.

```
NTEST = 1
addprocs(NTEST; topology=:master_slave)
@everywhere function par_cg_solver{T<:AbstractFloat}(fim::Matrix{T},
vpg::Vector{T},
toworkers,
tomaster,
cg_iters::Integer=10,
id::Int=max(myid()-1, 1))
put!(tomaster[id], vpg) # send vpg
x = zeros(T, size(vpg))
vpg[:] = take!(toworkers[id])
r = copy(vpg)
p = copy(vpg)
rdr = dot(vpg, vpg) # outputs a scalar
z = fim*p
put!(tomaster[id], copy(z))
for i=1:cg_iters
z = take!( toworkers[id] )
v = rdr/dot(p, z) # scalar
x += v*p
if i == cg_iters
break
end
r -= v*z
rdr_new = dot(r, r) # scalar
ratio = rdr_new/rdr
rdr = rdr_new
p .*= ratio
p += r
A_mul_B!(z, fim, p)
put!(tomaster[id], copy(z))
end
put!(tomaster[id], x) # send npg for sync
x[:] = take!(toworkers[id])
return x
end
function scatter{T<:AbstractFloat}(tomaster, val::Vector{T}, nworkers::Int)
for i=1:nworkers
put!(tomaster[i], copy(val))
end
end
function gather!{T<:AbstractFloat}(toworkers, val::Vector{T}, nworkers::Int)
val[:] .= T(0.0)
for i=1:nworkers
val[:] += take!(toworkers[i])
end
val[:] /= nworkers
end
function par_cg_manager(nparam::Int,
nworkers::Int,
toworkers,
tomaster,
cg_iters::Int=10)
vpg = zeros(nparam)
z = zeros(nparam)
gather!(tomaster, vpg, nworkers)
scatter(toworkers, vpg, nworkers)
for i=1:cg_iters
gather!(tomaster, z, nworkers)
scatter(toworkers, z, nworkers)
end
gather!(tomaster, z, nworkers)
scatter(toworkers, z, nworkers)
return vpg, z # z == npg
end
function cpu_cg_solve{T<:AbstractFloat}(fim::Matrix{T},
vpg::Vector{T},
cg_iters::Int=10)
# Initialize cg variables
r = copy(vpg)
p = copy(vpg)
x = zeros(T, size(vpg))
rdr = dot(vpg, vpg) # outputs a scalar
z = fim*p
for i=1:cg_iters
v = rdr/dot(p, z) # scalar
x += v*p
r -= v*z
rdr_new = dot(r, r) # scalar
ratio = rdr_new/rdr
rdr = rdr_new
p[:] .*= ratio
p[:] += r
A_mul_B!(z, fim, p)
end
return x
end
nparam = 512 #128
T = Float32
iters = 10 #100
const get_cg_z = [ RemoteChannel(()->Channel{Vector{T}}(1)) for i=1:(NTEST) ]
const send_cg_z = [ RemoteChannel(()->Channel{Vector{T}}(1)) for i=1:(NTEST) ]
# start work threads for par cg
srand(12345)
A = rand(T, nparam, nparam)
b = rand(T, nparam)
x1 = zeros(T, nparam, NTEST)
x2= zeros(T, nparam, NTEST)
A = A*A' # make A matrix psd
x = cpu_cg_solve(A, b, iters) # test cpu
println("1 Process, Async")
@sync begin
@async par_cg_manager(nparam, NTEST, get_cg_z, send_cg_z, iters)
for i=1:NTEST
@async x1[:, i] = par_cg_solver(A, b, get_cg_z, send_cg_z, iters, i)
end
end
x1 = mean(x1, 2)
println(" Maximum Diff: ", maximum(x-x1) )
diff = maximum(x-x1)
if diff != 0.0
print_with_color(:red, " BAD Value\n")
end
println("$(NTEST+1) Process")
@sync begin
@async par_cg_manager(nparam, NTEST, get_cg_z, send_cg_z, iters)
for i=1:NTEST
@async x2[:, i] = remotecall_fetch(par_cg_solver, i+1, A, b, get_cg_z, send_cg_z, iters)
end
end
x2 = mean(x2, 2)
println(" Maximum Diff: ", maximum(x-x2) )
diff = maximum(x-x2)
if diff != 0.0
print_with_color(:red, " BAD Value\n")
end
```

Which on the affected machine produces this output:

```
1 Process, Async
Maximum Diff: 0.0
2 Process
Maximum Diff: 0.018099353
BAD Value
```

I would expect the differences between the too procedures to be no different considering it’s calling the same functions just in different ways. There is a single process (no communication version of CG included as well.

I cannot reproduce this bug on my laptop and on a few other servers, but a different machine with the same 5930k processor as well as a machine with a 3930k processor both have the same issue. At this point I’m not sure if this is a bug in the math libraries affecting these processors, or some issue with multi-process serialization. When I change the nparam variable to much smaller values, this does not trigger the bug on my main machine, which is a clue, but still does not indicate a search direction. Julia 0.5 on the same machine does not trigger this, and the same code on a Mac (different processor) also does not trigger this.

Any guidance or help would be appreciated; I’ve found that this part of the code is affecting the rest of my research, so I’m glad to have narrowed it down to this point. It seems to be critical enough that it’s producing erroneous code when algorithmically this shouldn’t be the case…

To summarize my current thinking, the issue is caused by:

- Linear Algebra functions that call processor dependent functionality
- remote process serialization issue, but only on specific processors?
- combination of the two, or something else?