An update on this.
With ZipArchives.jl v2.5.0 and InputBuffers.jl v1.1.1 there should now be reasonable performance, as long as the wrapped array has fast five-argument `copyto!` and `view` methods.
Here is an example from `ZipArchives.jl/test/test_file-array.jl` at v2.5.0 (JuliaIO/ZipArchives.jl on GitHub).
It is based on the suggestion in "Struggling to use Mmap with ZipArchives" (post #19 by fabiangans):
"""
    FileArray(filename, offset, len)

An `AbstractVector{UInt8}` described by a file name, a byte offset into that
file, and a length. Only this description is stored in the struct.
"""
struct FileArray <: AbstractVector{UInt8}
    # Path of the file that holds the bytes.
    filename::String
    # Number of bytes in the file that precede element 1 of this vector.
    offset::Int64
    # Number of elements (bytes) in this vector.
    len::Int64
end
"""
    FileArray(filename::String, offset::Int64=Int64(0))

Build a `FileArray` covering `filename` from byte `offset` to the end of the
file, using the size reported by `filesize(filename)` at construction time.

Raises an error (via `error`) when the reported file size is negative, when
`offset` is negative, or when `offset` exceeds the file size.
"""
function FileArray(filename::String, offset::Int64=Int64(0))
    len = filesize(filename)
    if len < 0
        error("filesize of $(repr(filename)) is negative")
    end
    if offset < 0
        error("offset $(offset) is negative")
    end
    if offset > len
        error("offset $(offset) is larger than the filesize $(len)")
    end
    # Everything after `offset` belongs to the array.
    return FileArray(filename, offset, len - offset)
end
# A `FileArray` is one-dimensional; its only extent is the stored `len`.
function Base.size(s::FileArray)
    return (s.len,)
end
# Scalar indexing: fetch element `i` by copying exactly one byte into a
# one-element scratch buffer through the five-argument `copyto!`.
function Base.getindex(s::FileArray, i::Int)::UInt8
    scratch = zeros(UInt8, 1)
    copyto!(scratch, Int64(1), s, Int64(i), Int64(1))
    return scratch[1]
end
# A contiguous view is just another `FileArray` on the same file, with the
# offset advanced to the first selected element and the length of `inds`.
function Base.view(s::FileArray, inds::UnitRange{Int64})::FileArray
    checkbounds(s, inds)
    shifted = s.offset + first(inds) - Int64(1)
    return FileArray(s.filename, shifted, length(inds))
end
# Destination container types that get a specialised `copyto!` method.
# `Memory{UInt8}` is only listed when running on Julia 1.11 or newer.
dest_types = VERSION ≥ v"1.11" ? (Vector{UInt8}, Memory{UInt8},) : (Vector{UInt8},)
# Define the five-argument `copyto!` from a `FileArray` once per destination
# type in `dest_types`.
#
# Each generated method copies `n` bytes of `src`, starting at element
# `sstart`, into `dest` starting at index `dstart`, and returns `dest`.
# - `n == 0` returns immediately without touching the file.
# - A negative `n` throws `ArgumentError`.
# - The first and last index on both sides are bounds-checked before any I/O.
# - The file is opened for the duration of the call and closed afterwards;
#   reading fewer than `n` bytes raises an error.
for dest_type in dest_types
    @eval function Base.copyto!(
        dest::$dest_type, dstart::Int64, src::FileArray, sstart::Int64, n::Int64
    )
        if iszero(n)
            return dest
        end
        if n < 0
            throw(ArgumentError("tried to copy n=$(n) elements, but n should be non-negative"))
        end
        dlast = dstart + n - Int64(1)
        slast = sstart + n - Int64(1)
        checkbounds(dest, dstart)
        checkbounds(src, sstart)
        checkbounds(dest, dlast)
        checkbounds(src, slast)
        file = open(src.filename)
        try
            # Element `sstart` lives `src.offset + sstart - 1` bytes into the file.
            seek(file, src.offset + sstart - Int64(1))
            nread = readbytes!(file, view(dest, dstart:dlast))
            if nread != n
                error("short read")
            end
        finally
            close(file)
        end
        return dest
    end
end
Here is a benchmark of checksumming all the files in the Julia source code, showing that all methods have roughly similar timings.
I’m running this on a Linux machine with a lot of memory, so this is a best-case scenario.
julia> using ZipArchives, Downloads, p7zip_jll, ZipStreams
julia> fname = Downloads.download("https://github.com/JuliaLang/julia/archive/refs/tags/v1.11.5.zip")
"/tmp/jl_yTIrEDCdMv"
julia> run(`$(p7zip()) t -tZIP $(fname)`);
julia> zip_test(ZipReader(FileArray(fname)))
julia> zip_test(ZipReader(read(fname)))
julia> ZipStreams.zipsource(ZipStreams.is_valid!, fname);
julia> @time run(`$(p7zip()) t -tZIP $(fname)`);
7-Zip (a) [64] 17.04 : Copyright (c) 1999-2021 Igor Pavlov : 2017-08-28
p7zip Version 17.04 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,16 CPUs Intel(R) Core(TM) i7-10700KF CPU @ 3.80GHz (A0655),ASM,AES-NI)
Scanning the drive for archives:
1 file, 8095624 bytes (7906 KiB)
Testing archive: /tmp/jl_yTIrEDCdMv
--
Path = /tmp/jl_yTIrEDCdMv
Type = zip
Physical Size = 8095624
Comment = 760b2e5b7396f9cc0da5efce0cadd5d1974c4069
Everything is Ok
Folders: 376
Files: 1632
Size: 26072257
Compressed: 8095624
0.112718 seconds (625 allocations: 37.750 KiB)
julia> @time zip_test(ZipReader(read(fname)))
0.080154 seconds (46.31 k allocations: 151.145 MiB, 12.01% gc time)
julia> @time zip_test(ZipReader(FileArray(fname)))
0.093854 seconds (94.06 k allocations: 154.012 MiB, 8.04% gc time)
julia> @time ZipStreams.zipsource(ZipStreams.is_valid!, fname)
0.115305 seconds (371.25 k allocations: 230.358 MiB, 11.11% gc time)
true