I have to read a 7TB file and noticed that each line, when read with eachline(..), comes with a new allocation. Since I know that a line can never exceed length X (but can be smaller) I thought I could pre-allocate a Vector of length 2*X and keep streaming chunks through it to find the lines, then use StringView on them to compare the string, or substrings, without allocations. I came up with the following base (still buggy) implementation
using BenchmarkTools
using StringViews
"""
    line_read(file_path::String, query::String) -> Int

Count the number of lines in `file_path` exactly equal to `query`.

Baseline implementation: `eachline` allocates a fresh `String` per line,
which is what the buffered variant below tries to avoid.
"""
function line_read(file_path::String, query::String)
    matches = 0
    # The do-block form guarantees the handle is closed even if
    # iteration throws (the original leaked the handle on error).
    open(file_path, "r") do handle
        for line in eachline(handle)
            if line == query
                matches += 1
            end
        end
    end
    return matches
end
"""
    bytes_read(file_path::String, buffer::Int, query::String) -> Int

Count the number of lines in `file_path` exactly equal to `query`, without
allocating per line. Streams the file in `buffer`-byte chunks through one
reusable `2 * buffer` byte array: new data always lands in the back half,
and the unfinished tail of the previous chunk is carried in the front half.

`buffer` must be at least as long as the longest line in the file;
an `ArgumentError` is thrown if a line does not fit.
"""
function bytes_read(file_path::String, buffer::Int, query::String)
    tot_alloc = 2 * buffer
    arr = Vector{UInt8}(undef, tot_alloc)
    # String equality in Julia is codeunit (byte) equality, so comparing
    # the raw byte view against codeunits(query) needs no StringView and
    # allocates nothing.
    q = codeunits(query)
    matches = 0
    # First chunk goes into the back half; nothing is carried over yet,
    # so scanning starts there (starting at 1 would prepend the
    # uninitialized front half to the first line).
    from = buffer + 1
    io = open(file_path, "r")
    try
        while !eof(io)
            # readbytes! may return fewer than `buffer` bytes at EOF:
            # scan only up to the last byte actually read, never over
            # stale data from a previous chunk.
            nread = readbytes!(io, view(arr, buffer+1:tot_alloc), buffer)
            stop = buffer + nread
            # Start of the line currently being assembled.
            cur_stop = from
            @inbounds for i in from:stop
                if arr[i] == 0x0a # newline
                    if view(arr, cur_stop:i-1) == q
                        matches += 1
                    end
                    cur_stop = i + 1
                end
            end
            if eof(io)
                # Final line with no trailing newline.
                if cur_stop <= stop && view(arr, cur_stop:stop) == q
                    matches += 1
                end
            else
                # The unfinished line spans cur_stop:stop. After shifting
                # the back half forward it will start at cur_stop - buffer;
                # if that index is not positive the line is longer than
                # `buffer` and cannot be handled.
                cur_stop > buffer ||
                    throw(ArgumentError("encountered a line longer than buffer = $buffer bytes"))
                from = cur_stop - buffer
                # Move the freshly read block to the front of the array.
                copyto!(arr, 1, arr, buffer + 1, buffer)
            end
        end
    finally
        close(io)
    end
    return matches
end
    
"""
    main(copies::Int)

Write a throwaway corpus of `copies` repetitions of a fixed five-line
snippet to `bs.txt`, then benchmark both readers on it and print their
(identical) match counts.
"""
function main(copies::Int)
    # Write some bs to a file
    open("bs.txt", "w") do handle
        txt = "This is some sentence\nCool line\nThis is another sentence\nNot sure what to write\nbuy yeah\n"
        write(handle, txt^copies)
    end

    # @btime returns the value of the benchmarked expression, so the
    # results can be captured and cross-checked (the original printed
    # variables that were never assigned).
    line_res = @btime line_read("bs.txt", "Cool line")
    byte_res = @btime bytes_read("bs.txt", 10_000, "Cool line")

    # Sanity check: both implementations must agree on the count.
    line_res == byte_res || @warn "implementations disagree" line_res byte_res
    println(line_res)
    println(byte_res)
    return nothing
end
main(1_000_000)
If I call main(100_000_000) my @btime is the following:
  41.016 s (500256358 allocations: 17.92 GiB)
  13.850 s (12 allocations: 20.23 KiB)
It seems to even make a much bigger difference if the lines are short, if we change txt = "Wow\nYe\n" and query to “Ye” it is:
  14.011 s (200015269 allocations: 4.47 GiB)
  1.219 s (12 allocations: 20.23 KiB)
This still has two “bugs”
- Will fail for the last line if the file does not end with \n
- Will fail if the buffer is set bigger than the file content (i.e. a line of 10 chars but buffer = 30)
For me code like this has some advantages aside from being faster as I can directly use indexing/views on the lines (them being UInt8 arrays instead of Strings) to get substrings and parse numbers without allocating
I couldn’t find code to do this elsewhere, but since it is pretty straightforward I wonder whether it already exists in some package before I spend time debugging it further. Other tips and improvements are also welcome.