Thank you for your help. I benchmarked your solution against the “naive” one below:
using Libz
"""
    dofields(f, io; progress = 1000000, limit = 0, delim = ';', maxlines = 0)

Read `io` line by line, split each line on `delim`, and call `f` with the
resulting vector of fields.

# Keywords
- `progress`: when a positive `Integer`, print a `.` before reading every
  `progress`-th line (including before the very first one). Any other value
  (e.g. `nothing` or `0`) disables progress output.
- `limit`: forwarded to `split`; `0` means no limit on the number of fields.
- `delim`: field delimiter forwarded to `split`.
- `maxlines`: when positive, stop after exactly `maxlines` lines have been
  passed to `f`; `0` (the default) reads until end of stream.

Returns `nothing`.
"""
function dofields(f, io;
                  progress = 1000000, limit = 0, delim = ';', maxlines = 0)
    # Guard against `progress == 0`, which would otherwise raise a DivideError
    # in the modulo below.
    showprogress = isa(progress, Integer) && progress > 0
    line = 0  # number of lines already handed to `f`
    while !eof(io)
        # Check the cap *before* consuming another line so that exactly
        # `maxlines` lines are processed (not `maxlines - 1`).
        if maxlines > 0 && line ≥ maxlines
            break
        end
        if showprogress && line % progress == 0
            print(".")
        end
        line += 1
        f(split(readline(io), delim; limit = limit))
    end
    return nothing
end
"""
    stats(filename; options...)

Open `filename`, wrap it in a `ZlibInflateInputStream`, and count how often
each distinct value of the sixth `;`-separated field occurs across its lines.

Any `options` are forwarded as keyword arguments to `dofields`.

Returns a `Dict{String,Int}` mapping each sixth-field value to its count.
"""
function stats(filename; options...)
    counts = Dict{String,Int}()
    # The do-block form of `open` closes the file even when processing throws
    # (e.g. a line with fewer than six fields), so the handle cannot leak.
    open(filename, "r") do io
        # `limit = 7` keeps everything after the sixth delimiter as a single
        # trailing field, since only field 6 is needed.
        dofields(ZlibInflateInputStream(io);
                 limit = 7, delim = ';', options...) do fields
            kind = fields[6]
            counts[kind] = get(counts, kind, 0) + 1
        end
    end
    return counts
end
and using the `getiterator` approach takes about 10x longer (850s vs. the 90s for `dofields` above).
I wonder if I could have my cake and eat it too, and define a `Data.Source` for this particular purpose. I have looked at the functions, and it is unclear to me how to make a `NamedTuple` from a given set of column names (I guess I cannot use `@NT`, since the names are not known at compile time). I am actually reading the column names from another file.