I have to read a 7TB
file and noticed that each line, when read with eachline(..)
, comes with a new allocation. Since I know that a line can never exceed length X
(but can be smaller) I thought I could pre-alloc a Vector 2*X
and keep streaming through them to find the lines and use StringView
on them to compare the string, or substrings, without allocs. I came up with the following base (still buggy) implementation
using BenchmarkTools
using StringViews
function line_read(file_path::String, query::String)
handle = open(file_path, "r")
tot = 0
matches = 0
for line in eachline(handle)
if line == query
matches +=1
end
end
close(handle)
return matches
end
function bytes_read(file_path::String, buffer::Int , query::String) # 0x6e = 'n'
# Allocate buffer, twice as big as buffer
tot_alloc = buffer * 2
arr = Vector{UInt8}(undef, tot_alloc)
fill!(arr, '\0')
io = open(file_path, "r")
# Keep track where the last line ended in the next iter
# we need to account for the shift when moving the block
# (see below)
from = 1
# Some bs test
matches = 0
while !eof(io)
# Store new data in first chunk
readbytes!(io, view(arr, buffer+1:tot_alloc), buffer)
# Keep track till where the last line ended in the current iter
cur_stop = from
@inbounds for i in from:tot_alloc
if arr[i] == 0x0a # newline
# Get the current line
line = view(arr, cur_stop:i-1)
# Do something with it
if StringView(line) == query
matches +=1
end
# Update new line location in current and coming iter
cur_stop = i + 1
from = i - buffer + 1
end
end
# Move last read block to front of the array
@inbounds for i in 1:buffer
arr[i] = arr[i+buffer]
end
end
close(io)
return matches
end
function main(copies::Int)
# Write some bs to a file
open("bs.txt", "w") do handle
txt = "This is some sentence\nCool line\nThis is another sentence\nNot sure what to write\nbuy yeah\n"
corpus = txt^copies
write(handle, corpus)
end
# Read it
@btime line_read("bs.txt", "Cool line")
@btime bytes_read("bs.txt", 10_000, "Cool line")
#println(line_res)
#println(byte_res)
end
main(1_000_000)
If I call main(100_000_000)
my @btime
is the following:
41.016 s (500256358 allocations: 17.92 GiB)
13.850 s (12 allocations: 20.23 KiB)
It seems to even make a much bigger difference if the lines are short, if we change txt = "Wow\nYe\n"
and query to “Ye” it is:
14.011 s (200015269 allocations: 4.47 GiB)
1.219 s (12 allocations: 20.23 KiB)
This still has two “bugs”
- Will fail for the last line if the file does not end with
\n
- Will fail if the
buffer
is set bigger than the file content (i.e. line of 10 chars butbuffer = 30
)
For me code like this has some advantages aside from being faster as I can directly use indexing/views on the lines (them being UInt8
arrays instead of String
s) to get substrings and parse numbers without allocating
I couldn’t find code to do this elsewhere but since it is pretty straightforward I wonder if this is somewhere, in some package already before I spent time debugging it further? Other tips and improvements are also welcome