Sure enough — it's simple, and it works.
I’m going to work on it a bit. In Go I’d typically have a goroutine per file or per batch of files. I imagine I’d have to rework it quite a bit if I wanted to run a channel across multiple CPUs on large sets of files, per the Discourse post here: reading-and-processing-data-files-concurrently
Thank you again.
I did notice that if I put the `Channel() do`
block inside the scope of the `for (root, dirs, files) in walkdir(path)`
block, I get an error.
ERROR: MethodError: no method matching start(::Void)
Closest candidates are:
start(::SimpleVector) at essentials.jl:258
start(::Base.MethodList) at reflection.jl:560
start(::ExponentialBackOff) at error.jl:107
Here is the refactored bit:
# readFile(path) -> Channel
#
# Walk `path` recursively and produce a Channel that yields
# `(fullname, content)` tuples, where `content` is the message body —
# everything after the first blank line (i.e. past the mail header) —
# joined with NEWLINE.
#
# Files listed in SKIPFILES and hidden (dot) files are skipped.
# Lines that are not valid UTF-8 are assumed to be LATIN1 and are
# re-decoded (requires StringEncodings' `decode` to be in scope —
# TODO confirm that is where `decode` comes from).
function readFile(path::String)
    Channel() do chan
        for (root, dirs, files) in walkdir(path)
            for filename in files
                # Skip known junk files and hidden (dot) files.
                # `startswith` avoids raw byte-indexing into the name.
                if !(filename in SKIPFILES) && !startswith(filename, ".")
                    fullname = joinpath(root, filename)
                    if isfile(fullname)
                        pastHeader, lines = false, Vector{String}()
                        open(fullname) do f
                            for line in eachline(f)
                                if !isvalid(line)
                                    # Fall back to LATIN1 for non-UTF-8 input.
                                    line = decode(convert(Array{UInt8,1}, line), "LATIN1")
                                end
                                line = chomp(line)
                                if pastHeader
                                    push!(lines, line)
                                elseif isempty(line)
                                    # The first blank line marks the end of the header;
                                    # everything after it is the body.
                                    pastHeader = true
                                end
                            end
                        end
                        content = join(lines, NEWLINE)
                        put!(chan, (fullname, content))
                    end
                end
            end
        end
    end
end
# Append one row per file found under `path` to `df`, in the column
# order (text, classification, filename).
function addData!(df::DataFrame, path::String, classification::String)
    for (fname, body) in readFile(path)
        row = @data([body, classification, fname])
        push!(df, row)
    end
end
# Build a fresh DataFrame from `sources`, an iterable of
# (relative path, classification) pairs rooted at SPAMROOT.
function buildDataSet(sources)
    dataset = DataFrame(text = String[], class = String[], index = String[])
    for (relpath, label) in sources
        addData!(dataset, joinpath(SPAMROOT, relpath), label)
    end
    return dataset
end