Read lines from file without new allocations

I have to read a 7TB file and noticed that each line, when read with eachline(..), comes with a new allocation. Since I know that a line can never exceed length X (but can be smaller) I thought I could pre-alloc a Vector 2*X and keep streaming through them to find the lines and use StringView on them to compare the string, or substrings, without allocs. I came up with the following base (still buggy) implementation

using BenchmarkTools
using StringViews

function line_read(file_path::String, query::String)
    handle = open(file_path, "r")
    tot = 0
    matches = 0
    for line in eachline(handle)
        if line == query
            matches +=1
        end
    end
    close(handle)
    return matches
end



function bytes_read(file_path::String, buffer::Int , query::String) # 0x6e = 'n'
    # Allocate buffer, twice as big as buffer
    tot_alloc = buffer * 2
    arr = Vector{UInt8}(undef, tot_alloc)
    fill!(arr, '\0')

    io =  open(file_path, "r")   

    # Keep track where the last line ended in the next iter
    # we need to account for the shift when moving the block
    # (see below)
    from = 1

    # Some bs test
    matches = 0

    while !eof(io)
        # Store new data in first chunk
        readbytes!(io, view(arr, buffer+1:tot_alloc), buffer)  

        # Keep track till where the last line ended in the current iter
        cur_stop = from
                
        @inbounds for i in from:tot_alloc            
            if arr[i] == 0x0a # newline
                
                # Get the current line
                line =  view(arr, cur_stop:i-1)
                
                # Do something with it
                if StringView(line) == query
                    matches +=1
                end
                
                # Update new line location in current and coming iter
                cur_stop = i + 1 
                from = i - buffer + 1
            end
        end

        # Move last read block to front of the array
        @inbounds for i in 1:buffer
            arr[i] = arr[i+buffer]
        end
        
    end
    close(io)
    return matches
  
end
    

function main(copies::Int)
    
    # Write some bs to a file
    open("bs.txt", "w") do handle
        txt = "This is some sentence\nCool line\nThis is another sentence\nNot sure what to write\nbuy yeah\n"
        corpus = txt^copies
        write(handle, corpus)
    end
    
    # Read it
    @btime line_read("bs.txt", "Cool line")
    @btime bytes_read("bs.txt", 10_000, "Cool line")

   #println(line_res)
   #println(byte_res)
    
end

main(1_000_000)

If I call main(100_000_000) my @btime is the following:

  41.016 s (500256358 allocations: 17.92 GiB)
  13.850 s (12 allocations: 20.23 KiB)

It seems to even make a much bigger difference if the lines are short, if we change txt = "Wow\nYe\n" and query to “Ye” it is:

  14.011 s (200015269 allocations: 4.47 GiB)
  1.219 s (12 allocations: 20.23 KiB)

This still has two “bugs”

  • Will fail for the last line if the file does not end with \n
  • Will fail if the buffer is set bigger than the file content (i.e. line of 10 chars but buffer = 30)

For me code like this has some advantages aside from being faster as I can directly use indexing/views on the lines (them being UInt8 arrays instead of Strings) to get substrings and parse numbers without allocating

I couldn’t find code to do this elsewhere but since it is pretty straightforward I wonder if this is somewhere, in some package already before I spent time debugging it further? Other tips and improvements are also welcome :slight_smile:

2 Likes

Adjusted the code to solve the two bugs. Will edit it more here if I find another mistake. Would also be possible to check if the buffer size was chosen correctly as each iter (except the last one in case of missing \n) should yield a “sentence”.

Would be even nicer if this could be wrapped in a Generator/Iterator such that it would function in exactly the same way as eachline but then without excessive allocs :slight_smile: . Not familiar with how to implement that though…

function bytes_read(file_path::String, buffer::Int , query::String)
    newline = 0x0a
    empty = 0x00

    # Allocate buffer, twice as big as "buffer" argument
    tot_alloc::Int64 = buffer * 2
    arr = Vector{UInt8}(undef, tot_alloc)
    fill!(arr, empty)
    io =  open(file_path, "r")   
   
    # Some bs match count test
    matches = 0
    
    # Keep track of bytes read (below) so we can 
    # strip old data when bytes_read < buffer size
    bytes_read = 0
    
    # Keep track of where the newline characters were
    # in the current iter and where they will be in the 
    # next iter (after moving the block)
    # Note, buffer + 1 as in the 1st iter only the last
    # half of the array is filled
    from = cur_stop = buffer + 1
     
    # Keep reading chunks until we reach the EOF
    while !eof(io)        
        # Move last read chunk to front of the array
        # (useless in first iter)
        @inbounds for i in 1:buffer
            arr[i] = arr[i+buffer]
        end
        
        # Store new chunk in second part of the array
        bytes_read = readbytes!(io, view(arr, buffer+1:tot_alloc), buffer)  
                
        # If we read less than the buffer size we have to reset the array
        # values after "bytes_read" as this is old data (previous read)
        if bytes_read < buffer
            @inbounds for i in buffer+bytes_read+1:tot_alloc
                arr[i] = empty
            end   
        end   
        
        cur_stop = from
        
        # Search for newline chars and generate StringView when found
        @inbounds for i in from:tot_alloc            
            if arr[i] == newline # newline
                line = StringView(view(arr, cur_stop:i-1))
                # Just for testing:
                matches += Int(line == query)
                # Update newline location in current iter
                cur_stop = i + 1 
                # Update newline location for next iter
                from = i - buffer + 1
            end
        end
    end
    
    # We missed the last line when:
    # - there was a missing \n + we read less than the buffer size
    @inbounds if arr[buffer+bytes_read] != newline
        line = StringView(view(arr, cur_stop:buffer+bytes_read))
        matches += Int(line == query) # bs test again
    end
        
    close(io) 
    return matches
end