Adjusted the code to fix the two bugs. I will edit it further here if I find another mistake. It would also be possible to check whether the buffer size was chosen correctly,
since each iteration (except the last one, when the trailing \n is missing) should yield a “sentence”.
It would be even nicer if this could be wrapped in a generator/iterator so that it works in exactly the same way as `eachline`,
but without the excessive allocations. I am not familiar with how to implement that, though…
"""
    bytes_read(file_path, buffer, query) -> Int

Count the lines of the file at `file_path` whose bytes are exactly equal to `query`.

The file is read in chunks of `buffer` bytes into one reusable byte buffer and split on
`\\n` (`0x0a`). The newline itself is not part of a line; a `\\r` before it is kept. A final
line without a trailing newline is still counted, and an empty file contains no lines.

# Arguments
- `file_path`: path of the file to scan.
- `buffer`: number of bytes requested per read; must be positive. The working buffer
  starts at `2 * buffer` bytes and grows only if a single line does not fit.
- `query`: the line content to look for (compared byte-wise, without the newline).

# Throws
- `ArgumentError` if `buffer` is not positive.
"""
function bytes_read(file_path::AbstractString, buffer::Integer, query::AbstractString)
    buffer > 0 || throw(ArgumentError("buffer must be positive, got $buffer"))
    chunk = Int(buffer)
    # Compare raw bytes so no string object has to be created per line.
    needle = codeunits(String(query))
    # The do-block form closes the file even when reading throws.
    return open(file_path, "r") do io
        _count_matching_lines(io, chunk, needle)
    end
end

# Scan `io` chunk by chunk and count the lines whose bytes equal `needle`.
function _count_matching_lines(io::IO, chunk::Int, needle)
    newline = 0x0a
    buf = Vector{UInt8}(undef, 2 * chunk)
    matches = 0
    # Number of bytes at the front of `buf` that belong to a line whose
    # terminating newline has not been read yet.
    carry = 0
    while !eof(io)
        # A line longer than the current capacity: grow instead of truncating it.
        if length(buf) - carry < chunk
            resize!(buf, max(2 * length(buf), carry + chunk))
        end
        # Append the next chunk directly behind the unfinished line.
        nread = readbytes!(io, view(buf, (carry + 1):(carry + chunk)), chunk)
        stop = carry + nread
        linestart = 1
        # The carried bytes contain no newline, so only the new bytes are scanned.
        for i in (carry + 1):stop
            if @inbounds(buf[i]) == newline
                # `view(buf, linestart:(i - 1))` is the current line; wrap it in a
                # string view here if a string-typed line is needed.
                matches += view(buf, linestart:(i - 1)) == needle
                linestart = i + 1
            end
        end
        # Move the unfinished tail to the front for the next read.
        carry = stop - linestart + 1
        if carry > 0 && linestart > 1
            copyto!(buf, 1, buf, linestart, carry)
        end
    end
    # Last line of a file that does not end in a newline.
    if carry > 0
        matches += view(buf, 1:carry) == needle
    end
    return matches
end