I came across a curious problem while writing some code to convert files. It turned out to be slower than the scripting language I used previously, and nowhere near the C speed I expected. I could get it close to C speed by using `open` instead of stdin/stdout. I don't see a good reason for stdin/stdout being this much slower, and I would like to get the same speed with stdin/stdout (in C, stdin/stdout generally seems even a tiny bit faster).
Any ideas? The test code is below.
To create a large enough test file:
# Generate the benchmark input: 10 million copies of one CSV record that contains a
# quoted comma, a quoted tab and a quoted line break (so each record spans two
# physical lines). The `do` form closes the file even if a write fails.
open("csvfile.csv", "w") do f
    record = "short,\"with,comma\",\"a bit\tlonger\",\"very\nlong indeed\"\n"
    for _ in 1:10_000_000
        write(f, record)
    end
end
stdin/stdout version (csv2tsv_stdout.jl)
"""
    csv2tsv(input::IO, output::IO)

Convert the CSV text read from `input` to TSV written to `output`.

Unquoted commas become tabs and the enclosing double quotes of a field are dropped.
Inside a quoted field a comma is kept, a doubled quote (`""`) becomes one `"`, a tab
is written as the two characters `\\t`, and a line break is written as the two
characters `\\n`. Every input line produces output terminated as described above, so a
final line without a trailing newline still ends with one.
"""
function csv2tsv(input::IO, output::IO)
    inquotes = false  # true while the scanner is inside a double-quoted field
    for line in eachline(input)
        prev = '\n'  # quote adjacency never carries over from the previous line
        for c in line
            if c == '"'
                inquotes = !inquotes
                # Only a quote that reopens a field directly after a closing quote is an
                # escaped `""`; an empty field `""` closes on its second quote and must
                # not emit anything.
                if inquotes && prev == '"'
                    write(output, '"')
                end
            elseif c == ','
                write(output, inquotes ? ',' : '\t')
            elseif c == '\t'
                write(output, "\\t")
            else
                write(output, c)
            end
            prev = c
        end
        # A line break inside a quoted field is data, so escape it instead of ending
        # the TSV record.
        write(output, inquotes ? "\\n" : "\n")
    end
    return nothing
end

"""
    csv2tsv()

stdin/stdout variant: convert CSV read from `stdin` to TSV written to `stdout`, as used
by `julia csv2tsv_stdout.jl < csvfile.csv > csvfile.tsv`.
"""
function csv2tsv()
    csv2tsv(stdin, stdout)
    flush(stdout)
    return nothing
end
# Run the conversion when this script is executed.
csv2tsv()
file version (csv2tsv_file.jl)
"""
    csv2tsv(input::IO, output::IO)

Convert the CSV text read from `input` to TSV written to `output`.

Unquoted commas become tabs and the enclosing double quotes of a field are dropped.
Inside a quoted field a comma is kept, a doubled quote (`""`) becomes one `"`, a tab
is written as the two characters `\\t`, and a line break is written as the two
characters `\\n`. Every input line produces output terminated as described above, so a
final line without a trailing newline still ends with one.
"""
function csv2tsv(input::IO, output::IO)
    inquotes = false  # true while the scanner is inside a double-quoted field
    for line in eachline(input)
        prev = '\n'  # quote adjacency never carries over from the previous line
        for c in line
            if c == '"'
                inquotes = !inquotes
                # Only a quote that reopens a field directly after a closing quote is an
                # escaped `""`; an empty field `""` closes on its second quote and must
                # not emit anything.
                if inquotes && prev == '"'
                    write(output, '"')
                end
            elseif c == ','
                write(output, inquotes ? ',' : '\t')
            elseif c == '\t'
                write(output, "\\t")
            else
                write(output, c)
            end
            prev = c
        end
        # A line break inside a quoted field is data, so escape it instead of ending
        # the TSV record.
        write(output, inquotes ? "\\n" : "\n")
    end
    return nothing
end

"""
    csv2tsv()

File variant: convert `tmp/csvfile.csv` to `tmp/csvfile.tsv`. Both files are closed even
if the conversion throws.
"""
function csv2tsv()
    open("tmp/csvfile.csv") do input
        open("tmp/csvfile.tsv", "w") do output
            csv2tsv(input, output)
        end
    end
    return nothing
end
# Run the conversion when this script is executed.
csv2tsv()
And the times
time julia julia/csv2tsv_stdout.jl < csvfile.csv > csvfile.tsv
real 0m57.462s
user 0m57.105s
sys 0m0.580s

time julia csv2tsv_file.jl
real 0m7.463s
user 0m4.917s
sys 0m0.673s

# C version
time csv2tsv < csvfile.csv > csvfile.tsv
real 0m6.118s
user 0m3.577s
sys 0m0.469s