Reading and processing data files concurrently

You can try both these approaches and see which fits your use case better.

  1. Distributed, multi-process - use addprocs(N) and pmap

pseudocode:

# Launch worker processes (with no argument this adds one worker per
# CPU core; pass a count, e.g. addprocs(4), to control it).
addprocs()
# @everywhere evaluates the enclosed block on every worker process, so
# each worker has the needed packages loaded and process_file defined
# locally before pmap dispatches work to it.
@everywhere begin
   using DataFrames
   # Placeholder body (pseudocode): read one file and return its
   # processed result.
   function process_file(fname)
   .......
   end
end

# pmap farms the file list out across the workers, one file per call,
# and collects the per-file results (in input order).
results = pmap(process_file, list_of_files)
  2. Multi-threaded, single process. Process in sets of N. Pseudocode would be something like this:
# Producer/consumer pattern: one async task reads files into a bounded
# channel while the main loop drains it and processes batches of N
# tables in parallel with Threads.@threads.
const N = 4                        # batch size, also the channel capacity
const data_chnl = Channel{Any}(N)  # bounded: the reader blocks once N tables are pending

# NOTE: `@schedule` was removed in Julia 1.0; `@async` is the modern
# equivalent for launching a task on the event loop.
@async begin
  # Each file read runs in its own task so slow I/O on one file does
  # not serialize the others; @sync waits for all reads to finish.
  @sync for f in list_of_files
    @async put!(data_chnl, readTable(f, sep, h))
  end
  # Closing the channel terminates the consumer's `for d in data_chnl`
  # loop below once the remaining items are drained.
  close(data_chnl)
end

data = []  # accumulates up to N tables before each processing pass
for d in data_chnl
  push!(data, d)
  if length(data) == N
    # Process a full batch in parallel; iterations must be independent
    # (no unsynchronized shared mutation inside process_read_file).
    Threads.@threads for d2 in data
      process_read_file(d2)
    end
    empty!(data)
  end
end

# Handle the final partial batch (fewer than N files left over).
if !isempty(data)
  Threads.@threads for d2 in data
    process_read_file(d2)
  end
  empty!(data)
end
2 Likes