Progress bar on reading a compressed file

I’m trying to show a progress bar (e.g. with ProgressMeter.jl) while reading a file. The file may or may not be compressed, and the only thing I know a priori is the total number of bytes in the file.

Right now I have

# Read the file line by line, wrapping the handle in a zstd decompressor when requested.
open(fn) do raw
    io = if compression == :zstd
        CodecZstd.ZstdDecompressorStream(raw)
    else
        raw
    end
    for line in eachline(io)
        ( ... do stuff ... )
    end
end

I would like to add a progress bar. However, for it to be useful in telling me how much longer I have to wait, I need to pass it the total size of the thing I’m reading. Since the file might be compressed, I can’t pass it the number of lines, so my idea is to pass it the total number of bytes and update it every time I read some bytes. Something like:

import ProgressMeter
# Size the progress bar by the on-disk size of the file, in bytes.
fs = filesize(fn)
progress = ProgressMeter.Progress(fs)
open(fn) do io
    io = compression == :zstd ? CodecZstd.ZstdDecompressorStream(io) : io
    while !eof(io)
        # Placeholder helper (not defined here): yields the line and a byte count for it.
        line, nbytes = read_line_and_return_bytes(io)
        ( ... do stuff ... )
        # `nbytes` is an increment, so advance with `next!`; `update!` expects the
        # absolute counter value instead.
        ProgressMeter.next!(progress; step = nbytes)
    end
end

ProgressMeter almost has an example [1] of how to do this:

using ProgressMeter

"""
    readFileLines(fileName::String)

Read `fileName` line by line while showing a progress bar whose total is the file size in
bytes. After each line the bar is set to the current read position in the file.
"""
function readFileLines(fileName::String)
    fileSize = filesize(fileName)
    p = Progress(fileSize; dt=1.0)   # minimum update interval: 1 second

    # The do-block form closes the file on exit, including when processing a line throws.
    open(fileName, "r") do file
        while !eof(file)
            line = readline(file)
            # Here's where you do all the hard, slow work

            update!(p, position(file))
        end
    end
    return nothing
end

except that this only works for plain files (as far as I can tell; correct me if I’m wrong). The reason is that position(io) is implemented for IOStream but not for TranscodingStream{CodecZstd.ZstdDecompressor, IOStream}.

I can access the thing I want by examining a TranscodingStream’s internals, namely its stream field [2].

So, this works:

import ProgressMeter
import TranscodingStreams

# Position, in bytes, within the underlying file for a plain file handle.
function positionbytes(x::IOStream)
    return position(x)
end

# Same for a transcoding stream wrapping a file handle: report the position of the
# wrapped handle (read from the `stream` field) rather than of the transcoded data.
function positionbytes(x::TranscodingStreams.TranscodingStream{C,IOStream}) where {C}
    inner = x.stream
    return position(inner)
end

# Size the progress bar by the on-disk size of the file, in bytes.
fs = filesize(fn)
progress = ProgressMeter.Progress(fs)
open(fn) do io
    io = compression == :zstd ? CodecZstd.ZstdDecompressorStream(io) : io
    prev_bytes = 0
    while !eof(io)
        line = readline(io)
        ( ... do stuff ... )
        # Query the position once per line and only touch the bar when it has advanced.
        nbytes = positionbytes(io)
        if nbytes > prev_bytes
            ProgressMeter.update!(progress, nbytes)
            prev_bytes = nbytes
        end
    end
end

This works, but it seems a very clunky way of doing something that I would imagine is a common use case: getting an estimate of how much longer it’s gonna take to process a file.

Any cleaner approaches?

[1] GitHub - timholy/ProgressMeter.jl: Progress meter for long-running computations
[2] TranscodingStreams.jl/src/stream.jl at 76543edbc3e2433d5a83d252d021d352675bf931 · JuliaIO/TranscodingStreams.jl · GitHub

One approach is to keep a reference to the inner IO:

import ProgressMeter
import CodecZstd

# Size the progress bar by the on-disk size of the file, in bytes.
fs = filesize(fn)
progress = ProgressMeter.Progress(fs)
open(fn) do file_io
    # Keep `file_io` around so its position can be queried even when `io` wraps it.
    io = compression == :zstd ? CodecZstd.ZstdDecompressorStream(file_io) : file_io
    try
        prev_bytes = 0
        while !eof(io)
            line = readline(io)
            ( ... do stuff ... )
            # Query the position once per line; update the bar only when it has advanced.
            nbytes = position(file_io)
            if nbytes > prev_bytes
                ProgressMeter.update!(progress, nbytes)
                prev_bytes = nbytes
            end
        end
    finally
        close(io) # This prevents a memory leak by freeing memory used by zstd
    end
end

Another way is by using TranscodingStreams.stats

# Report progress through a transcoding stream as the `in` counter of its stats.
function positionbytes(x::TranscodingStreams.TranscodingStream)
    streamstats = TranscodingStreams.stats(x)
    return streamstats.in
end

Oh I didn’t know about TranscodingStreams.stats. Nice, thank you :slight_smile: