Hi All,

I am implementing a distributed conjugate gradient method for my research and have run into a numerical issue when distributing the code across multiple processes. Algorithmically, the single-process and multi-process versions of the code should produce exactly the same values, but on my main work desktop (Intel 5930K, Ubuntu 16.04, Julia 0.6) there are numerical differences between running the code asynchronously in one process and running it in two processes.

The test code is below. It is a little long, but I cannot reduce it further without affecting reproducibility.

```
NTEST = 1
addprocs(NTEST; topology=:master_slave)

@everywhere function par_cg_solver{T<:AbstractFloat}(fim::Matrix{T},
                                                     vpg::Vector{T},
                                                     toworkers,
                                                     tomaster,
                                                     cg_iters::Integer=10,
                                                     id::Int=max(myid()-1, 1))
    put!(tomaster[id], vpg) # send vpg
    x = zeros(T, size(vpg))
    vpg[:] = take!(toworkers[id])
    r = copy(vpg)
    p = copy(vpg)
    rdr = dot(vpg, vpg) # outputs a scalar
    z = fim*p
    put!(tomaster[id], copy(z))
    for i = 1:cg_iters
        z = take!(toworkers[id])
        v = rdr/dot(p, z) # scalar
        x += v*p
        if i == cg_iters
            break
        end
        r -= v*z
        rdr_new = dot(r, r) # scalar
        ratio = rdr_new/rdr
        rdr = rdr_new
        p .*= ratio
        p += r
        A_mul_B!(z, fim, p)
        put!(tomaster[id], copy(z))
    end
    put!(tomaster[id], x) # send npg for sync
    x[:] = take!(toworkers[id])
    return x
end

function scatter{T<:AbstractFloat}(tomaster, val::Vector{T}, nworkers::Int)
    for i = 1:nworkers
        put!(tomaster[i], copy(val))
    end
end

function gather!{T<:AbstractFloat}(toworkers, val::Vector{T}, nworkers::Int)
    val[:] .= T(0.0)
    for i = 1:nworkers
        val[:] += take!(toworkers[i])
    end
    val[:] /= nworkers
end

function par_cg_manager(nparam::Int,
                        nworkers::Int,
                        toworkers,
                        tomaster,
                        cg_iters::Int=10)
    vpg = zeros(nparam)
    z = zeros(nparam)
    gather!(tomaster, vpg, nworkers)
    scatter(toworkers, vpg, nworkers)
    for i = 1:cg_iters
        gather!(tomaster, z, nworkers)
        scatter(toworkers, z, nworkers)
    end
    gather!(tomaster, z, nworkers)
    scatter(toworkers, z, nworkers)
    return vpg, z # z == npg
end

function cpu_cg_solve{T<:AbstractFloat}(fim::Matrix{T},
                                        vpg::Vector{T},
                                        cg_iters::Int=10)
    # Initialize cg variables
    r = copy(vpg)
    p = copy(vpg)
    x = zeros(T, size(vpg))
    rdr = dot(vpg, vpg) # outputs a scalar
    z = fim*p
    for i = 1:cg_iters
        v = rdr/dot(p, z) # scalar
        x += v*p
        r -= v*z
        rdr_new = dot(r, r) # scalar
        ratio = rdr_new/rdr
        rdr = rdr_new
        p[:] .*= ratio
        p[:] += r
        A_mul_B!(z, fim, p)
    end
    return x
end

nparam = 512 #128
T = Float32
iters = 10 #100

const get_cg_z  = [ RemoteChannel(()->Channel{Vector{T}}(1)) for i=1:NTEST ]
const send_cg_z = [ RemoteChannel(()->Channel{Vector{T}}(1)) for i=1:NTEST ]

# start work threads for par cg
srand(12345)
A = rand(T, nparam, nparam)
b = rand(T, nparam)
x1 = zeros(T, nparam, NTEST)
x2 = zeros(T, nparam, NTEST)
A = A*A' # make A matrix psd

x = cpu_cg_solve(A, b, iters) # test cpu

println("1 Process, Async")
@sync begin
    @async par_cg_manager(nparam, NTEST, get_cg_z, send_cg_z, iters)
    for i = 1:NTEST
        @async x1[:, i] = par_cg_solver(A, b, get_cg_z, send_cg_z, iters, i)
    end
end
x1 = mean(x1, 2)
println(" Maximum Diff: ", maximum(x - x1))
diff = maximum(x - x1)
if diff != 0.0
    print_with_color(:red, " BAD Value\n")
end

println("$(NTEST+1) Process")
@sync begin
    @async par_cg_manager(nparam, NTEST, get_cg_z, send_cg_z, iters)
    for i = 1:NTEST
        @async x2[:, i] = remotecall_fetch(par_cg_solver, i+1, A, b, get_cg_z, send_cg_z, iters)
    end
end
x2 = mean(x2, 2)
println(" Maximum Diff: ", maximum(x - x2))
diff = maximum(x - x2)
if diff != 0.0
    print_with_color(:red, " BAD Value\n")
end
```

On the affected machine this produces the following output:

```
1 Process, Async
Maximum Diff: 0.0
2 Process
Maximum Diff: 0.018099353
BAD Value
```

I would expect no difference between the two procedures, since they call the same functions, just in different ways. A single-process (no communication) version of CG is included as well for reference.

I cannot reproduce this bug on my laptop or on a few other servers, but a different machine with the same 5930K processor, as well as a machine with a 3930K processor, both show the same issue. At this point I'm not sure whether this is a bug in the math libraries affecting these processors or some issue with multi-process serialization. Reducing the nparam variable to much smaller values does not trigger the bug on my main machine, which is a clue, but it still does not suggest where to look. Julia 0.5 on the same machine does not trigger this, and the same code on a Mac (different processor) also does not trigger it.
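To start separating those two hypotheses, one check I am considering (a minimal sketch, not part of the script above; it only needs one spare worker) is to round-trip a Float32 vector through a worker and compare the raw bits, which would directly implicate or rule out serialization:

```
# Hypothetical isolation check (not from the script above): round-trip a
# Float32 vector through a worker and compare the raw bits. A mismatch would
# implicate serialization; a match pushes suspicion toward the math libraries.
nprocs() == 1 && addprocs(1)
srand(12345)
v = rand(Float32, 512)
w = remotecall_fetch(identity, 2, v)   # serialize to the worker and back
println("bit-identical after round-trip: ",
        reinterpret(UInt32, v) == reinterpret(UInt32, w))
```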

Any guidance or help would be appreciated; I've found that this part of the code is affecting the rest of my research, so I'm glad to have narrowed it down this far. It seems critical enough that it is producing erroneous results when, algorithmically, this shouldn't be the case...

To summarize my current thinking, the issue could be caused by one of the following (a small isolation sketch follows the list):

- linear algebra functions that call processor-dependent functionality,
- a remote-process serialization issue, but only on specific processors, or
- a combination of the two, or something else entirely?
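For the first bullet, a sketch along these lines (assuming a worker on pid 2 is available; `local_matvec` is a made-up helper name, not from my script) would compare the same `A_mul_B!` call on the master and on a worker, taking CG and the channels out of the picture entirely:

```
# Hypothetical probe of the "processor-dependent BLAS" idea: run the same gemv
# on the master and on a worker with identical inputs and compare the results.
nprocs() == 1 && addprocs(1)
@everywhere function local_matvec(A, p)   # made-up helper, not from the script
    z = zeros(eltype(p), length(p))
    A_mul_B!(z, A, p)                     # same call the CG loop uses
    return z
end
srand(12345)
A = rand(Float32, 512, 512); A = A*A'
p = rand(Float32, 512)
z_master = local_matvec(A, p)
z_worker = remotecall_fetch(local_matvec, 2, A, p)
println("max abs difference: ", maximum(abs.(z_master - z_worker)))
# If this differs, pinning BLAS to one thread in every process, e.g.
# @everywhere Base.LinAlg.BLAS.set_num_threads(1), before rerunning would show
# whether a threaded gemv reduction order is involved.
```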