You can try both these approaches and see which fits your use case better.
- Distributed, multi-process: use `addprocs(N)` and `pmap`.
Pseudocode:
# Start worker processes. No count is passed, so the number of workers is
# whatever addprocs chooses by default.
# NOTE(review): on Julia 0.7 and later, addprocs/@everywhere/pmap presumably need
# `using Distributed` first — confirm against the Julia version in use.
addprocs()
# Evaluate the enclosed block on every process so that each worker has
# DataFrames loaded and process_file defined before pmap calls it.
@everywhere begin
using DataFrames
# Process the single file named `fname`. The body is elided in this pseudocode;
# whatever it returns becomes one element of `results` below.
function process_file(fname)
.......
end
end
# Apply process_file to each entry of list_of_files via pmap and collect the
# per-file return values.
results = pmap(process_file, list_of_files)
- Multi-threaded, single process. Process the files in sets of N. Pseudocode would be something like this:
# Multi-threaded, single-process pipeline (pseudocode).
#
# A producer task reads every file and pushes the parsed table into a bounded
# channel; the main task drains the channel and processes the tables in
# batches of N using threads.
#
# `list_of_files`, `sep`, `h`, `readTable` and `process_read_file` are not
# defined here and must be supplied by the surrounding program.

const N = 4                         # batch size, also the channel capacity
const data_chnl = Channel{Any}(N)

"""
    process_batch!(batch)

Call `process_read_file` on every element of `batch` inside a
`Threads.@threads` loop, then empty `batch` so it can be refilled.
Return the (now empty) `batch`.
"""
function process_batch!(batch)
    Threads.@threads for d in batch
        process_read_file(d)
    end
    empty!(batch)
    return batch
end

# Producer: one task per file; `@sync` waits until every `put!` has completed.
# `@async` is used rather than `@schedule`, which no longer exists in Julia 1.0+.
# NOTE(review): `readTable` runs before `put!` in each task, so only the hand-off
# into the channel is limited to N, not the number of files read ahead.
producer = @async begin
    @sync for f in list_of_files
        @async put!(data_chnl, readTable(f, sep, h))
    end
end

# Tie the channel's lifetime to the producer: the channel is closed when the
# producer finishes, and if the producer fails the error is raised in the
# consumer loop below instead of leaving it blocked forever on an open channel.
bind(data_chnl, producer)

# Consumer: accumulate tables until a full batch of N is available.
data = []
for d in data_chnl
    push!(data, d)
    length(data) == N && process_batch!(data)
end

# Flush the final, partially filled batch (fewer than N leftovers).
isempty(data) || process_batch!(data)