How to download, extract and import a zipped or tgz CSV file from the internet?

Here is one example of a compressed CSV file (if you have single files there isn’t really a point in using Tar, but the example can be modified to put a Tar.jl pipeline in there too):

Testfile:

$ gzip -dc file.csv.gz 
a,b
1,a
2,b
3,c

Example 1: download → decompress → CSV → DataFrame:

import Downloads, SimpleBufferStream, CodecZlib, CSV, DataFrames

url = "file://localhost$(pwd())/file.csv.gz";

# Stream the download through a gzip decompressor and into the CSV parser.
# The downloader runs as a separate task and hands decompressed bytes to the
# parser through a BufferStream.
df = @sync begin
    # BufferStream for in-flight (already decompressed) bytes
    bs = SimpleBufferStream.BufferStream()
    # Producer: download compressed bytes into a decompressor stream that
    # writes its decompressed output into bs
    @async begin
        io = CodecZlib.GzipDecompressorStream(bs)
        try
            Downloads.download(url, io)
        finally
            # Close even when the download throws: closing io is what signals
            # end-of-stream to the reader of bs. Without it a failed download
            # would leave the parser below blocked forever.
            close(io)
        end
    end
    # Consumer: read the decompressed bytes from bs into a DataFrame. Reading
    # from bs blocks until data arrives, which lets the producer task run.
    DataFrames.DataFrame(CSV.File(bs))
end

This gives:

julia> df
3×2 DataFrame
 Row │ a      b       
     │ Int64  String1 
─────┼────────────────
   1 │     1  a
   2 │     2  b
   3 │     3  c

Example 2: download → decompress → modify some bytes → CSV → DataFrame

import Downloads, SimpleBufferStream, CodecZlib, CSV, DataFrames

url = "file://localhost$(pwd())/file.csv.gz";

# Three-stage streaming pipeline: download + decompress -> rewrite bytes ->
# parse CSV. Each stage hands its output to the next through a BufferStream.
df = @sync begin
    # bs1 carries decompressed bytes, bs2 carries the rewritten bytes
    bs1 = SimpleBufferStream.BufferStream()
    bs2 = SimpleBufferStream.BufferStream()
    # Stage 1: download compressed bytes into a decompressor stream that
    # writes its decompressed output into bs1
    @async begin
        io = CodecZlib.GzipDecompressorStream(bs1)
        try
            Downloads.download(url, io)
        finally
            # Close even when the download throws: closing io is what signals
            # end-of-stream to the reader of bs1. Without it a failed download
            # would leave the later stages blocked forever.
            close(io)
        end
    end
    # Stage 2: rewrite 'a'-bytes to 'z'-bytes
    @async begin
        try
            while !eof(bs1)
                bytes = readavailable(bs1)
                replace!(bytes, UInt8('a') => UInt8('z'))
                write(bs2, bytes)
            end
        finally
            # Close even on error so the CSV parser reading bs2 sees
            # end-of-stream instead of waiting indefinitely.
            close(bs2)
        end
    end
    # Stage 3: read the modified bytes from bs2 into a DataFrame. Reading
    # from bs2 blocks until data arrives, which lets the other tasks run.
    DataFrames.DataFrame(CSV.File(bs2))
end

This gives:

julia> df
3×2 DataFrame
 Row │ z      b       
     │ Int64  String1 
─────┼────────────────
   1 │     1  z
   2 │     2  b
   3 │     3  c

Note that you can use HTTP.get(url; response_io = io) instead of Downloads.download(url, io), but HTTP.jl doesn’t support file:// URLs, so I used Downloads.jl in this example.

2 Likes