Hi,
I have an error while processing dataframes in parallel. This is a toy program reproducing the problem with only one worker. Thank you for your help !
using Distributed
using DataFrames
df1 = DataFrame(A = ["R1", "R2", "R3", "R4"], B = 1:4 )
df2 = DataFrame(A = ["R1", "R2", "R3", "R4"], C = 3:6 )
n = 1
addprocs(n); # add worker processes
jobs = RemoteChannel(()->Channel{Tuple}(n))
results = RemoteChannel(()->Channel{Tuple}(n))
@everywhere function do_merge(jobs, results) # define work function everywhere
while true
f = take!(jobs)
f1 = f[1]
f2 = f[2]
M = join(f1,f2, on = names(f1)[1], kind = :outer)
put!(results, (myid(), M))
end
end
function make_jobs(df1, df2)
put!(jobs, (df1, df2))
end
make_jobs(df1, df2)
for p in workers() # start tasks on the workers to process requests in parallel
remote_do(do_merge, p, jobs, results)
end
@elapsed while n > 0 # print out results
workId, table = take!(results)
println("$n - files merged by worker $workId")
println(table)
n = n - 1
end
The error is
From worker 2: (::Distributed.var"#115#117"{Distributed.RemoteDoMsg})() at ./task.jl:333KeyError: key DataFrames [a93c6f00-e57d-5684-b7b6-d8193f3e46c0] not found