Read lines from file without new allocations

CodeGodz · August 19, 2022, 3:30pm

I have to read a 7TB file and noticed that each line, when read with eachline(..), comes with a new allocation. Since I know that a line can never exceed length X (but can be smaller) I thought I could pre-alloc a Vector 2*X and keep streaming through them to find the lines and use StringView on them to compare the string, or substrings, without allocs. I came up with the following base (still buggy) implementation

using BenchmarkTools
using StringViews

function line_read(file_path::String, query::String)
    handle = open(file_path, "r")
    tot = 0
    matches = 0
    for line in eachline(handle)
        if line == query
            matches +=1
        end
    end
    close(handle)
    return matches
end



function bytes_read(file_path::String, buffer::Int , query::String) # 0x6e = 'n'
    # Allocate buffer, twice as big as buffer
    tot_alloc = buffer * 2
    arr = Vector{UInt8}(undef, tot_alloc)
    fill!(arr, '\0')

    io =  open(file_path, "r")   

    # Keep track where the last line ended in the next iter
    # we need to account for the shift when moving the block
    # (see below)
    from = 1

    # Some bs test
    matches = 0

    while !eof(io)
        # Store new data in first chunk
        readbytes!(io, view(arr, buffer+1:tot_alloc), buffer)  

        # Keep track till where the last line ended in the current iter
        cur_stop = from
                
        @inbounds for i in from:tot_alloc            
            if arr[i] == 0x0a # newline
                
                # Get the current line
                line =  view(arr, cur_stop:i-1)
                
                # Do something with it
                if StringView(line) == query
                    matches +=1
                end
                
                # Update new line location in current and coming iter
                cur_stop = i + 1 
                from = i - buffer + 1
            end
        end

        # Move last read block to front of the array
        @inbounds for i in 1:buffer
            arr[i] = arr[i+buffer]
        end
        
    end
    close(io)
    return matches
  
end
    

function main(copies::Int)
    
    # Write some bs to a file
    open("bs.txt", "w") do handle
        txt = "This is some sentence\nCool line\nThis is another sentence\nNot sure what to write\nbuy yeah\n"
        corpus = txt^copies
        write(handle, corpus)
    end
    
    # Read it
    @btime line_read("bs.txt", "Cool line")
    @btime bytes_read("bs.txt", 10_000, "Cool line")

   #println(line_res)
   #println(byte_res)
    
end

main(1_000_000)

If I call main(100_000_000) my @btime is the following:

  41.016 s (500256358 allocations: 17.92 GiB)
  13.850 s (12 allocations: 20.23 KiB)

It seems to even make a much bigger difference if the lines are short, if we change txt = "Wow\nYe\n" and query to “Ye” it is:

  14.011 s (200015269 allocations: 4.47 GiB)
  1.219 s (12 allocations: 20.23 KiB)

This still has two “bugs”

Will fail for the last line if the file does not end with \n
Will fail if the buffer is set bigger than the file content (i.e. line of 10 chars but buffer = 30)

For me code like this has some advantages aside from being faster as I can directly use indexing/views on the lines (them being UInt8 arrays instead of Strings) to get substrings and parse numbers without allocating

I couldn’t find code to do this elsewhere but since it is pretty straightforward I wonder if this is somewhere, in some package already before I spent time debugging it further? Other tips and improvements are also welcome

CodeGodz · August 19, 2022, 11:15pm

Adjusted the code to solve the two bugs. Will edit it more here if I find another mistake. Would also be possible to check if the buffer size was chosen correctly as each iter (except the last one in case of missing \n) should yield a “sentence”.

Would be even nicer if this could be wrapped in a Generator/Iterator such that it would function in exactly the same way as eachline but then without excessive allocs . Not familiar with how to implement that though…

function bytes_read(file_path::String, buffer::Int , query::String)
    newline = 0x0a
    empty = 0x00

    # Allocate buffer, twice as big as "buffer" argument
    tot_alloc::Int64 = buffer * 2
    arr = Vector{UInt8}(undef, tot_alloc)
    fill!(arr, empty)
    io =  open(file_path, "r")   
   
    # Some bs match count test
    matches = 0
    
    # Keep track of bytes read (below) so we can 
    # strip old data when bytes_read < buffer size
    bytes_read = 0
    
    # Keep track of where the newline characters were
    # in the current iter and where they will be in the 
    # next iter (after moving the block)
    # Note, buffer + 1 as in the 1st iter only the last
    # half of the array is filled
    from = cur_stop = buffer + 1
     
    # Keep reading chunks until we reach the EOF
    while !eof(io)        
        # Move last read chunk to front of the array
        # (useless in first iter)
        @inbounds for i in 1:buffer
            arr[i] = arr[i+buffer]
        end
        
        # Store new chunk in second part of the array
        bytes_read = readbytes!(io, view(arr, buffer+1:tot_alloc), buffer)  
                
        # If we read less than the buffer size we have to reset the array
        # values after "bytes_read" as this is old data (previous read)
        if bytes_read < buffer
            @inbounds for i in buffer+bytes_read+1:tot_alloc
                arr[i] = empty
            end   
        end   
        
        cur_stop = from
        
        # Search for newline chars and generate StringView when found
        @inbounds for i in from:tot_alloc            
            if arr[i] == newline # newline
                line = StringView(view(arr, cur_stop:i-1))
                # Just for testing:
                matches += Int(line == query)
                # Update newline location in current iter
                cur_stop = i + 1 
                # Update newline location for next iter
                from = i - buffer + 1
            end
        end
    end
    
    # We missed the last line when:
    # - there was a missing \n + we read less than the buffer size
    @inbounds if arr[buffer+bytes_read] != newline
        line = StringView(view(arr, cur_stop:buffer+bytes_read))
        matches += Int(line == query) # bs test again
    end
        
    close(io) 
    return matches
end

Topic		Replies	Views
Package to read/process lines without new allocations Package Announcements question , package , announcement	13	985	May 5, 2023
Nonallocating readuntil functionality? General Usage	1	364	August 29, 2020
Performance: read data from ascii file, replace `split` General Usage performance	13	265	November 12, 2024
Using do with file handle somehow causes allocations? Performance question , debugging	4	323	January 9, 2023
Pre-allocated, thread-save buffer arrays General Usage	0	361	March 8, 2021

Read lines from file without new allocations

Related topics