Here is an example that uses a gzip-compressed CSV file (if you only have a single file there isn’t really a point in using Tar, but the example can be modified to put a Tar.jl stage in the pipeline too):
Test file:
$ gzip -dc file.csv.gz
a,b
1,a
2,b
3,c
Example 1: download → decompress → CSV → DataFrame:
import Downloads, SimpleBufferStream, CodecZlib, CSV, DataFrames
# file:// URL of the gzip-compressed CSV located in the current working directory.
url = "file://localhost$(pwd())/file.csv.gz";

# Pipeline: download → gzip-decompress → CSV → DataFrame.
# The producer and consumer run as concurrent tasks connected by a BufferStream;
# the value of the `@sync` block (the DataFrame) is what gets bound to `df`.
df = @sync begin
    # BufferStream for in-flight bytes
    bs = SimpleBufferStream.BufferStream()

    # Producer: download bytes into a decompressor stream that writes into `bs`
    @async begin
        io = CodecZlib.GzipDecompressorStream(bs)
        try
            Downloads.download(url, io)
        finally
            # Close unconditionally: closing is what signals "no more data" to the
            # reader. If the download throws and the stream stays open, the CSV
            # task below would wait for more bytes forever.
            close(io)
        end
    end

    # Consumer: read decompressed bytes from `bs` into a DataFrame
    csv_task = @async begin
        f = CSV.File(bs)
        DataFrames.DataFrame(f)
    end

    # Result of the consumer task becomes the value of the whole block.
    fetch(csv_task)
end
This gives:
julia> df
3×2 DataFrame
Row │ a b
│ Int64 String1
─────┼────────────────
1 │ 1 a
2 │ 2 b
3 │ 3 c
Example 2: download → decompress → modify some bytes → CSV → DataFrame:
import Downloads, SimpleBufferStream, CodecZlib, CSV, DataFrames
# file:// URL of the gzip-compressed CSV located in the current working directory.
url = "file://localhost$(pwd())/file.csv.gz";

# Pipeline: download → gzip-decompress → rewrite bytes → CSV → DataFrame.
# Three concurrent tasks are chained through two BufferStreams; the value of the
# `@sync` block (the DataFrame) is what gets bound to `df`.
df = @sync begin
    # BufferStreams for in-flight bytes:
    #   bs1 carries decompressed bytes, bs2 carries the modified bytes.
    bs1 = SimpleBufferStream.BufferStream()
    bs2 = SimpleBufferStream.BufferStream()

    # Stage 1: download bytes into a decompressor stream that writes into `bs1`
    @async begin
        io = CodecZlib.GzipDecompressorStream(bs1)
        try
            Downloads.download(url, io)
        finally
            # Close unconditionally: closing is what signals "no more data" to the
            # next stage. A failed download must not leave it waiting forever.
            close(io)
        end
    end

    # Stage 2: rewrite 'a'-bytes to 'z'-bytes while copying from `bs1` to `bs2`
    @async begin
        try
            while !eof(bs1)
                bytes = readavailable(bs1)
                # In-place replacement of every 'a' byte in this chunk.
                replace!(bytes, UInt8('a') => UInt8('z'))
                write(bs2, bytes)
            end
        finally
            # Close even on error so the CSV reader below is always released.
            close(bs2)
        end
    end

    # Stage 3: read modified bytes from `bs2` into a DataFrame
    csv_task = @async begin
        f = CSV.File(bs2)
        DataFrames.DataFrame(f)
    end

    # Result of the final stage becomes the value of the whole block.
    fetch(csv_task)
end
This gives:
julia> df
3×2 DataFrame
Row │ z b
│ Int64 String1
─────┼────────────────
1 │ 1 z
2 │ 2 b
3 │ 3 c
Note that you can use HTTP.get(url; response_io = io) instead of Downloads.download(url, io), but HTTP.jl doesn’t support file:// URLs, so I used Downloads.jl in this example.