Avoiding small strings and using IOBuffer

Recently I’ve been using this bit of code to basically conditionally concatenate a bunch of lines in a file:

# Copy every line of input.txt that does not start with '#' into `buf`,
# then turn the accumulated bytes into a single String.
buf = IOBuffer()
open("input.txt", "r") do f
    # eachline streams the file; readlines would load every line up front.
    for line in eachline(f)
        # startswith is safe on empty lines (line[1] would throw) and compares
        # against the Char '#'. `line[1] != "#"` compares a Char with a String,
        # which are never equal, so that condition kept every line.
        if !startswith(line, '#')
            println(buf, line)
        end
    end
end
out = String(take!(buf))

I’m itching to remove that line variable, whether by seeking until I find a hashtag or similar, but I can’t seem to find any way to read in from a file directly into an IOBuffer. There always seems to be a String return.

Am I overlooking something under the hood that means I shouldn’t care about this? Or am I overlooking something really obvious having to do with the fact that the file itself is already an IO object? It really feels like I should be using that somehow…

Does

IOBuffer(read("input.txt"))

do what you want?

You can also make the code you’ve written somewhat shorter and more efficient with

# Stream the file one line at a time instead of reading all lines first.
for line in eachline("input.txt")
    # Compare against the Char '#' via startswith: `line[1] != "#"` is always
    # true (Char vs. String), and line[1] throws on an empty line.
    if !startswith(line, '#')
        println(buf, line)
    end
end

I don’t know how it matches your real needs, but you can also get your end result with a one-liner:

# Lazily drop the lines that start with "#" and glue the rest (newlines kept) together.
out = join(Iterators.filter(!startswith("#"), eachline(filename; keep=true)))

@GunnarFarneback That’s a nice one-liner! But unfortunately none of that avoids the problem of allocating the line string, and seems to introduce some overhead besides.

To do some tests, I created a file of random strings on 100,000 lines

using Random: randstring

# Write 100,000 lines, each a random 5-character string, to testout.txt.
open("testout.txt", "w") do f
    for _ in 1:100_000
        println(f, randstring(5))
    end
end

"""
    test_eachline()

Return the lines of `testout.txt` that start with `'K'`, each followed by a
newline, concatenated into one `String`. The file is streamed with `eachline`.
"""
function test_eachline()
    buf = IOBuffer()
    for line in eachline("testout.txt")
        # startswith rather than line[1]: indexing an empty line throws a BoundsError.
        if startswith(line, 'K')
            println(buf, line)
        end
    end
    return String(take!(buf))
end

"""
    test_eachline_IOBuffer()

Read all of `testout.txt` into memory, wrap the bytes in an `IOBuffer`, and
return the lines starting with `'K'` (each followed by a newline) as one `String`.
"""
function test_eachline_IOBuffer()
    buf = IOBuffer()
    for line in eachline(IOBuffer(read("testout.txt")))
        # startswith rather than line[1]: indexing an empty line throws a BoundsError.
        if startswith(line, 'K')
            println(buf, line)
        end
    end
    return String(take!(buf))
end

"""
    test_oneliner()

Concatenate the lines of `testout.txt` that start with "K", keeping each
line's own line ending.
"""
function test_oneliner()
    kept = Iterators.filter(startswith("K"), eachline("testout.txt"; keep=true))
    return join(kept)
end

"""
    test_seekpeek()

Scan `testout.txt` directly from the file stream: peek at the first character
of each line, copy the line (terminator included) when it is `'K'`, otherwise
skip ahead past the next newline. Returns the copied lines as one `String`.
"""
function test_seekpeek()
    out = IOBuffer()
    open("testout.txt", "r") do io
        while !eof(io)
            if peek(io, Char) != 'K'
                # Not a match: move to the newline, then step over it.
                skipchars(c -> c != '\n', io)
                skip(io, 1)
                continue
            end
            print(out, readuntil(io, '\n'; keep=true))
        end
    end
    return String(take!(out))
end

"""
    test_seekpeek_IOBuffer()

Same peek-and-skip scan as the file-stream variant, but over an in-memory
`IOBuffer` holding the full contents of `testout.txt`. Returns the lines that
start with `'K'` (terminators included) as one `String`.
"""
function test_seekpeek_IOBuffer()
    out = IOBuffer()
    src = IOBuffer(read("testout.txt"))
    while !eof(src)
        if peek(src, Char) != 'K'
            # Not a match: move to the newline, then step over it.
            skipchars(c -> c != '\n', src)
            skip(src, 1)
            continue
        end
        print(out, readuntil(src, '\n'; keep=true))
    end
    return String(take!(out))
end

"""
    test_seekpeek_eachchar()

Scan an in-memory copy of `testout.txt` and return every line that starts with
`'K'`, including its terminating newline when the file has one, as one `String`.
Matching lines are copied one `Char` at a time, so no per-line `String` is built.
"""
function test_seekpeek_eachchar()
    buf = IOBuffer()
    f = IOBuffer(read("testout.txt"))
    while !eof(f)
        if peek(f, Char) == 'K'
            # Copy up to and including the newline (or to end of input).
            # Writing before testing for the terminator ensures the final
            # '\n' is kept even when it is the very last byte of the file.
            while true
                ch = read(f, Char)
                write(buf, ch)
                (ch == '\n' || eof(f)) && break
            end
        else
            # Not a match: move to the newline, then step over it.
            skipchars(!=('\n'), f)
            skip(f, 1)
        end
    end
    return String(take!(buf))
end
# Benchmark each variant with @btime (BenchmarkTools); the comment under each
# call records the time and allocations that were reported for it.
@btime test_eachline()
# 3.258 ms (100033 allocations: 2.32 MiB)
@btime test_eachline_IOBuffer()
# 3.736 ms (200024 allocations: 15.10 MiB)
@btime test_oneliner()
# 3.604 ms (200035 allocations: 3.85 MiB)
@btime test_seekpeek()
# 226.700 ms (3478 allocations: 312.97 KiB)
@btime test_seekpeek_IOBuffer()
# 2.644 ms (3480 allocations: 872.14 KiB)
@btime test_seekpeek_eachchar()
# 2.648 ms (24 allocations: 615.64 KiB)

The last one finally avoided the sheer number of allocations I wanted to avoid (by avoiding allocating line on every loop), but maybe proved that there’s something under the hood going on that means I shouldn’t care about all those allocations: the total allocated space actually increased over the seek-peek using readuntil.

(edited to hide my use of @time and the irrelevant numbers that generated; and again to fix one-liner)

Most likely because it makes a new generator each time you run it. If you place the code inside a function it won’t, and that’s generally a good idea when you benchmark code, as is using the BenchmarkTools package.

Ah whoops, I should really know better by now; I thought that was just a matter of running it twice. Didn’t realize the generated code could cause problems even after the second time.

I’ve edited the post since I don’t think anyone will actually learn anything from my mis-use of @time.

at least now the char-by-char reading generates the fastest code, barely, but it seems the allocations really have nothing to do with it.

edit: sorry, it’s 3am here so I’m editing and deleting all over the place.

edit2: though discourse’s choice to quote my deleted post was definitely not my exhaustion’s fault…

In Julia 1.11 there will be a copyuntil function that supports this (julia#48273). Until then, you could also use the eachlineV function in the ViewReader package to read lines into a pre-allocated buffer.

Another option would be to mmap the file into an array of bytes, construct a StringView from it (via StringViews.jl), and then use eachsplit to iterate over (copy-free) views of the lines, à la:

using StringViews, Mmap
# Memory-map input.txt, view the bytes as a string, and copy every line that
# does not start with "#" into `buf`.
buf = IOBuffer()
open("input.txt", "r") do f
    s = StringView(mmap(f))
    for line in eachsplit(s, '\n')
        # Test the current line, not the whole mapped text `s`.
        if !startswith(line, "#")
            println(buf, line)
        end
    end
end
out = String(take!(buf))

(Obviously, don’t do this in global scope if you care about performance — put it into a function.)

PS. Note that in your original code, line[1] != "#" will always be true because you are comparing a character line[1] to a string "#", and those are never equal. It will also throw an error if the line is empty. Use !startswith(line, "#") or !isempty(line) && line[1] != '#'.

1 Like

Fantastic, thank you! ViewReader is exactly what I was looking for and really takes the cake for speed:

using StringViews, Mmap
"""
    test_StringViews()

Memory-map `testout.txt`, wrap the bytes in a `StringView`, and return the
pieces between newlines that start with "K", each followed by a newline,
as one `String`.
"""
function test_StringViews()
    out = IOBuffer()
    open("testout.txt", "r") do io
        text = StringView(mmap(io))
        for piece in eachsplit(text, '\n')
            startswith(piece, "K") || continue
            println(out, piece)
        end
    end
    return String(take!(out))
end

using ViewReader
"""
    test_ViewReader()

Iterate `testout.txt` with ViewReader's `eachlineV` and return the lines that
start with "K", each followed by a newline, as one `String`.
"""
function test_ViewReader()
    out = IOBuffer()
    for ln in eachlineV("testout.txt")
        startswith(ln, "K") || continue
        println(out, ln)
    end
    return String(take!(out))
end
# Benchmark the two view-based variants with @btime (BenchmarkTools); the
# comment under each call records the reported time and allocations.
@btime test_StringViews()
# 2.416 ms (3407 allocations: 161.67 KiB)
@btime test_ViewReader()
# 776.402 μs (3405 allocations: 234.02 KiB)