Struggling to use Mmap with ZipArchives

I’m struggling to get ZipArchives.jl and Mmap.jl to work together in the way I want - basically just following the docs.

MWE looks like this:

using ZipArchives
using Mmap

io=open("simpletest.zip")  # MWE step 1: open the archive with `open`'s default (read) mode
mm=Mmap.mmap(io)  # memory-map the file contents
q = ZipArchives.ZipReader(mm)  # reader built directly on the mapped bytes
readme_n_lines = zip_openentry(q, "xl/worksheets/sheet1.xml") do z
    countlines(z)  # number of lines in the worksheet entry
end
close(io)  # closes the file handle; `mm` and `q` remain bound as top-level variables
#GC.gc()
ZipArchives.ZipWriter("simpletest.zip") do w  # opens the same path for writing
    zip_newfile(w, "test/test2.txt")
    write(w, "I am data inside test2.txt in the zip file")
end

Which produces the following:

ERROR: LoadError: SystemError: opening file "simpletest.zip": Invalid argument
Stacktrace:
  [1] systemerror(p::String, errno::Int32; extrainfo::Nothing)
    @ Base .\error.jl:176
  [2] systemerror
    @ .\error.jl:176
  [3] systemerror
    @ .\error.jl:175 [inlined]
  [4] open(fname::String; lock::Bool, read::Nothing, write::Bool, create::Nothing, truncate::Nothing, append::Nothing)
    @ Base .\iostream.jl:295
  [5] open
    @ .\iostream.jl:277 [inlined]
  [6] #ZipWriter#17
    @ C:\Users\tim\.julia\packages\ZipArchives\5fdTS\src\writer.jl:65 [inlined]
  [7] ZipWriter(f::Function, filename::String)
    @ ZipArchives C:\Users\tim\.julia\packages\ZipArchives\5fdTS\src\writer.jl:64
  [8] top-level scope
    @ c:\Users\tim\OneDrive\Documents\Julia\ZipArchives\TestMmap.jl:13
  [9] include(fname::String)
    @ Main .\sysimg.jl:38
 [10] run(debug_session::VSCodeDebugger.DebugAdapter.DebugSession, error_handler::VSCodeDebugger.var"#3#4"{String})
    @ VSCodeDebugger.DebugAdapter c:\Users\tim\.vscode\extensions\julialang.language-julia-1.144.2\scripts\packages\DebugAdapter\src\packagedef.jl:123
 [11] startdebugger()
    @ VSCodeDebugger c:\Users\tim\.vscode\extensions\julialang.language-julia-1.144.2\scripts\packages\VSCodeDebugger\src\VSCodeDebugger.jl:47
 [12] top-level scope
    @ c:\Users\tim\.vscode\extensions\julialang.language-julia-1.144.2\scripts\debugger\run_debugger.jl:12
 [13] include(mod::Module, _path::String)
    @ Base .\Base.jl:557
 [14] exec_options(opts::Base.JLOptions)
    @ Base .\client.jl:323
 [15] _start()
    @ Base .\client.jl:531
in expression starting at c:\Users\tim\OneDrive\Documents\Julia\ZipArchives\TestMmap.jl:13

I found this old thread, so I tried swapping the close for a GC.gc() or doing both (close first) or doing neither. Every case resulted in the same error.

In this case, the zip file is a renamed Excel file but, to be honest, I don’t think the file content is causing this issue.

Am I doing something daft, or is there something awry in ZipArchives.jl?

Edit: to add:

pkg> st
Status `C:\Users\tim\OneDrive\Documents\Julia\ZipArchives\Project.toml`
  [72c71f33] XML v0.3.5
  [49080126] ZipArchives v2.4.1
  [a63ad114] Mmap v1.11.0

julia> versioninfo()
Julia Version 1.11.5
Commit 760b2e5b73 (2025-04-14 06:53 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Windows (x86_64-w64-mingw32)
  CPU: 24 × AMD Ryzen 9 9900X 12-Core Processor
  WORD_SIZE: 64
  LLVM: libLLVM-16.0.6 (ORCJIT, generic)
Threads: 8 default, 0 interactive, 4 GC (on 24 virtual cores)
Environment:
  JULIA_EDITOR = code
  JULIA_VSCODE_REPL = 1
  JULIA_NUM_THREADS = 8

Your code as written (with a random xlsx file) runs with no errors for me.

julia> versioninfo()
Julia Version 1.11.5
Commit 760b2e5b739 (2025-04-14 06:53 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 32 × AMD Ryzen 9 9950X 16-Core Processor
  WORD_SIZE: 64
  LLVM: libLLVM-16.0.6 (ORCJIT, generic)
Threads: 32 default, 1 interactive, 16 GC (on 32 virtual cores)
Environment:
  JULIA_NUM_THREADS = auto,auto
  JULIA_REVISE_INCLUDE = 1

(jl_sgw3Xt) pkg> st
Status `/tmp/jl_sgw3Xt/Project.toml`
  [49080126] ZipArchives v2.4.1

From the stack trace, it looks like Mmap has nothing to do with the error, the error happens in a call to ZipWriter which is not using the mapped io. I wonder if the Windows version does not allow writing to an existing file without specifying some additional info? The ZipArchives documentation is not as clear about that as I would like.

1 Like

Well, if I take the mmap out of the code, like this:

using ZipArchives
using Mmap

# Variant of the MWE with the mmap path commented out: the archive is loaded with `read` instead.
#io=open("simpletest.zip")
#mm=Mmap.mmap(io)
#q = ZipArchives.ZipReader(mm)
q = ZipArchives.ZipReader(read("simpleTest.zip"))  # reader over bytes returned by `read`; no mmap
readme_n_lines = zip_openentry(q, "xl/worksheets/sheet1.xml") do z
    countlines(z)  # number of lines in the worksheet entry
end
#close(io)
#GC.gc()
ZipArchives.ZipWriter("simpleTest.zip") do w  # opens the same path for writing
    zip_newfile(w, "test/test2.txt")
    write(w, "I am data inside test2.txt in the zip file")
end

I don’t see the error and I do get a revised file, so there does seem to be some dependency on mmap.

Perhaps the mmap is still holding some resource? Try putting each block in a function to see if that causes the mmap to be released.

# Count the lines of the "xl/worksheets/sheet1.xml" entry of simpletest.zip,
# accessing the archive through a memory map. The file handle is closed before
# returning, whether or not an error is thrown.
# NOTE(review): unless `Base.read` has been imported, this defines a separate
# `read` function in the current module rather than a method of `Base.read`.
function read()
    handle = open("simpletest.zip")
    try
        mapped = Mmap.mmap(handle)
        reader = ZipArchives.ZipReader(mapped)
        return zip_openentry(countlines, reader, "xl/worksheets/sheet1.xml")
    finally
        close(handle)
    end
end

# Open simpletest.zip with `ZipWriter` and add a single text entry,
# "test/test2.txt", to it.
function write()
    ZipArchives.ZipWriter("simpletest.zip") do w
        zip_newfile(w, "test/test2.txt")
        # Qualify the call: inside this definition the bare name `write` refers to
        # the zero-argument function being defined here, which has no method for
        # `(w, String)`. `Base.write` is the function that actually writes the data.
        Base.write(w, "I am data inside test2.txt in the zip file")
    end
end

readme_n_lines = read()
write()
1 Like

In my real life use case, the ZipReader and ZipWriter calls are distantly separated in different sets of nested calls. Trying the same in my MWE yields the same error as in the OP.

This example points the finger even more clearly at mmap, I think, although it doesn’t help me with a solution:

using ZipArchives
using Mmap

# Reproducer: read one entry of the archive through a memory map, then close the
# file handle and force a garbage collection before returning.
# The line count is assigned to a local but not returned; the function returns
# whatever `GC.gc()` returns.
function reading()
    io=open("simpleTest.zip")
    mm=Mmap.mmap(io)  # memory-map the archive
    q = ZipArchives.ZipReader(mm)  # reader built on top of the mapped bytes
    #q = ZipArchives.ZipReader(read("simpleTest.zip"))
    readme_n_lines = zip_openentry(q, "xl/worksheets/sheet1.xml") do z
        countlines(z)
    end
    close(io)  # closes the file handle only
    # NOTE(review): `mm` and `q` are still locals of this function when the
    # collection runs — presumably why the mapping is not released here; confirm.
    GC.gc()
end
#function writing()
#    ZipArchives.ZipWriter("simpleTest.zip") do w 
#        zip_newfile(w, "test/test2.txt")
#        write(w, "I am data inside test2.txt in the zip file")
#    end
#end

reading()
isfile("simpleTest.zip") && rm("simpleTest.zip")
#writing()

This generates the following:

ERROR: LoadError: IOError: unlink("simpleTest.zip"): permission denied (EACCES)
Stacktrace:
  [1] uv_error
    @ .\libuv.jl:106 [inlined]
  [2] unlink(p::String)
    @ Base.Filesystem .\file.jl:1105
  [3] rm(path::String; force::Bool, recursive::Bool)
    @ Base.Filesystem .\file.jl:283
  [4] rm(path::String)
    @ Base.Filesystem .\file.jl:273
  [5] top-level scope
    @ c:\Users\tim\OneDrive\Documents\Julia\ZipArchives\TestMmap.jl:24
  [6] include(fname::String)
    @ Main .\sysimg.jl:38
  [7] run(debug_session::VSCodeDebugger.DebugAdapter.DebugSession, error_handler::VSCodeDebugger.var"#3#4"{String})
    @ VSCodeDebugger.DebugAdapter c:\Users\tim\.vscode\extensions\julialang.language-julia-1.144.2\scripts\packages\DebugAdapter\src\packagedef.jl:123
  [8] startdebugger()
    @ VSCodeDebugger c:\Users\tim\.vscode\extensions\julialang.language-julia-1.144.2\scripts\packages\VSCodeDebugger\src\VSCodeDebugger.jl:47
  [9] top-level scope
    @ c:\Users\tim\.vscode\extensions\julialang.language-julia-1.144.2\scripts\debugger\run_debugger.jl:12
 [10] include(mod::Module, _path::String)
    @ Base .\Base.jl:557
 [11] exec_options(opts::Base.JLOptions)
    @ Base .\client.jl:323
 [12] _start()
    @ Base .\client.jl:531
in expression starting at c:\Users\tim\OneDrive\Documents\Julia\ZipArchives\TestMmap.jl:24

EDIT: If I take out all reference to ZipReader and simply mmap the file and then close it again, the file is successfully removed. This suggests it is, in fact, the interaction between ZipReader and mmap that is the problem

That version also runs without error for me. This seems like a windows-specific problem and is likely worth reporting to ZipArchives.

Issue opened here.

ZipReader acts like view here because there is no special mmap logic in ZipArchives.jl.

For example:

using Mmap

fname = tempname()
write(fname, "bar")

# Reproducer without ZipArchives: memory-map `fname`, take a view of the first
# two mapped bytes, close the file handle and force a garbage collection.
function reading()
    io = open(fname)
    mm = Mmap.mmap(io)
    q = view(mm, 1:2)  # stands in for ZipReader: an object that references `mm`
    close(io)  # closes the file handle only
    # NOTE(review): `mm` and `q` are still locals here — presumably why this
    # collection does not release the mapping; confirm.
    GC.gc()
end

reading()

isfile(fname) && rm(fname)

Errors with:

ERROR: LoadError: IOError: unlink("C:\\Users\\nzimm\\AppData\\Local\\Temp\\jl_g1GveD8R03"): permission denied (EACCES)
Stacktrace:
 [1] uv_error
   @ .\libuv.jl:106 [inlined]
 [2] unlink(p::String)
   @ Base.Filesystem .\file.jl:1105
 [3] rm(path::String; force::Bool, recursive::Bool)
   @ Base.Filesystem .\file.jl:283
 [4] rm(path::String)
   @ Base.Filesystem .\file.jl:273
 [5] top-level scope
   @ C:\Users\nzimm\github\ZipArchives.jl\mmap.jl:16
in expression starting at C:\Users\nzimm\github\ZipArchives.jl\mmap.jl:16

You can get it to work if you move the GC.gc().

using Mmap

fname = tempname()
write(fname, "bar")

# Same reproducer as before but without the in-function `GC.gc()`: memory-map
# `fname`, take a view of the first two mapped bytes, and close the file handle.
# Garbage collection is left to the caller, after this function has returned.
function reading()
    io=open(fname)
    mm=Mmap.mmap(io)
    q = view(mm, 1:2)  # an object that references `mm`
    close(io)  # closes the file handle only
end

reading()
GC.gc()
GC.gc()
GC.gc()
GC.gc()

isfile(fname) && rm(fname)
1 Like

Mmap can be very tricky, especially when the file is not read only.
ZipReader currently works with any AbstractArray but it could be updated to also support a nicer way to read from files: possibly using DiskArrays.jl

1 Like

So this works after a fashion, but with two caveats:

The XLSX docs indicate that the user should use openxlsx with do-syntax to support reading larger-than-memory files. However, putting the GC.gc call into the do-block doesn’t work - it has to be outside the function. This pushes the GC.gc() call into the user’s code, which seems pretty undesirable. (edit: I guess I could wrap it in another function)

My second caveat is more a reflection on the ironies of life in general.

When I first started using XLSX.jl it was generally considered slow, and the reason for this was identified as a GC.gc call on every read and every write. See here, for instance.

Variously, it was suggested that a solution to this may be found by first, replacing ZipFiles.jl with ZipArchives.jl as a dependency of XLSX.jl, and then by replacing EzXML.jl with XML.jl. I made these switches, which was a bit of a magical mystery tour for me and took me quite some time. Now I’m finished I find I’m back where I started from, depending on GC.gc() to make the code run!

At least I gained a powerful learning experience!

It would be good to escape from the need to invoke garbage collection!

Should this be raised as an issue on Mmap on Windows, then?

Interestingly numpy has the same issue.

I’m not sure there is a way to generally “safely” eagerly close a mmap that avoids the chance of use after free.

I created a new thread to discuss adding this feature to the Mmap standard library:

Here are some ideas for avoiding the need for mmap.

  1. Does XLSX.jl need to support larger-than-memory files?

Looking at Excel specifications and limits - Microsoft Support it seems like the format is designed to be fully loaded into memory. Maybe it is okay to do this in XLSX.jl as well.

  2. Using ZipStreams.jl instead of ZipArchives.jl to read larger than memory xlsx files.

ZipStreams.jl can read a zip archive from a file without mmap, however, there might be some xlsx files that are not stream readable: see the warning in

Also, there is still an issue with XML.jl. It needs a Vector{UInt8} for parsing XML, so each entry in the archive still needs to be loaded into memory. XML.jl could possibly be updated to support parsing directly from a ZipStreams.jl entry IO object.

  3. Adding support for the DiskArrays.jl interface.

This interface is designed for reading larger-than-memory data that cannot be easily mmapped.
I’m not sure how difficult this would be to support, and it might require large changes to ZipArchives.jl, XML.jl, and XLSX.jl. I think there is a lot of glue code needed to get this working well, and not falling back to slow paths that read only one byte at a time.

I will try and add this to ZipArchives.jl at some point as a way to also solve

1 Like

Thank you for some thoughtful ideas and suggestions!

  1. I think this is probably a question for the package owner but here are some thoughts from me. Firstly, as a contributor, I feel like I should respect the documented functionality of the package I’m contributing to. The XLSX.jl docs say

If enable_cache=false , worksheet cells will always be read from disk. This is useful when you want to read a spreadsheet that doesn’t fit into memory.

So I take a presumption that anything I do needs to maintain the ability to read files larger than memory. Indeed, when I made an earlier PR, it received a comment saying

One thing to look into is memory usage when reading a large spreadsheet.

:wink:
and I took this to be a reminder of the need to maintain support for larger than memory files.

In terms of usage scenarios, I think an important use of XLSX.jl is to read data from spreadsheets published or shared by third parties. A very large dataset, or several over several worksheets in a single file, may easily be bigger than memory of some computers. XLSX.jl offers the ability to extract a subset of this data without having to load the whole file into memory. I have no idea how often such a situation actually occurs but XLSX.jl has thoughtfully provided functionality to handle it if/when it does and I wouldn’t want to break it without discussion.

  2. This is something I can look at but it would likely be a major change. It would seem undesirable to have XLSX.jl depend on two separate zip packages, though, if it could be avoided. The need is less than before because in my latest PR, there is a GC call only when reading a file with enable_cache=false whereas originally it was called on every read. For many uses the original issue has now gone away.

As far as XML.jl needing a Vector{UInt8}, I’m afraid my understanding is somewhat sketchy. In my mind, the fact that mmap can be used to treat the file as a Vector{UInt8} in ZipArchives.jl means the vector is accessed directly in the file rather than being copied into memory. Then, in XLSX.jl, I use LazyNode to access sheet rows, which I understand to mean that only the elements we actually want to read get materialised in memory. I realise I don’t know how to verify my understanding and that it could easily be wrong.

  3. Interesting. Will be curious to see how XLSX.jl could benefit from this, too.

Using zip_readentry will return an uncompressed copy of the entry as a Vector{UInt8}.

To read without needing to load the whole entry into memory, zip_openentry can be used to get an IO stream that can lazily read from the underlying AbstractVector, but XML.jl doesn’t yet support parsing from this.

1 Like

Maybe there would not be too many changes necessary to get option 3) running. I did a few small tests and the following seems to work at least for reading array without relying on Mmap or reading the whole archive, in case someone wants to experiment.

import DiskArrays: AbstractDiskArray, DiskArrays, Unchunked, Chunked, GridChunks

# One-dimensional `AbstractDiskArray` of `UInt8` backed by a file on disk.
# The type parameter `C` is the type of `chunksize`: either `Int` or `Nothing`.
struct SimpleFileDiskArray{C<:Union{Int,Nothing}} <: AbstractDiskArray{UInt8,1}
    file::String   # path of the backing file
    s::Int         # length reported by `size`; the keyword constructor sets it to `filesize(file)`
    chunksize::C   # chunk length, or `nothing` when no chunking was requested
end
# The array is a vector whose length is the stored `s`.
Base.size(s::SimpleFileDiskArray) = (s.s,)
# Build a `SimpleFileDiskArray` for an existing file.
#
# Arguments:
#   filename  - path of the file to expose as a byte vector
#   chunksize - chunk length (`Int`), or `nothing` (default) for no explicit chunking
#
# Throws `ArgumentError` when `filename` is not an existing file.
function SimpleFileDiskArray(filename; chunksize=nothing)
    isfile(filename) || throw(ArgumentError("File $filename does not exist"))
    nbytes = filesize(filename)
    # The implicit constructor of a parametric struct does not convert its
    # arguments, so a non-`String` path (e.g. a `SubString`) would raise a
    # `MethodError`. Convert explicitly; this is a no-op for a `String`.
    return SimpleFileDiskArray(String(filename), nbytes, chunksize)
end
# Fill `aout` with bytes from the backing file, starting at the (1-based) first
# index of `i`. The file is opened for this call only and closed afterwards.
# Returns the value of `read!`.
function DiskArrays.readblock!(a::SimpleFileDiskArray, aout, i::AbstractUnitRange)
    byteoffset = first(i) - 1  # `seek` positions are 0-based
    return open(a.file) do stream
        seek(stream, byteoffset)
        read!(stream, aout)
    end
end
# Chunking trait: `Unchunked()` when no chunk size is stored, `Chunked()` otherwise.
function DiskArrays.haschunks(a::SimpleFileDiskArray)
    if isnothing(a.chunksize)
        return Unchunked()
    end
    return Chunked()
end
# Chunk layout of the array: a `GridChunks` built from the stored chunk size, or
# the result of `DiskArrays.estimate_chunksize` when no chunk size is stored.
function DiskArrays.eachchunk(a::SimpleFileDiskArray)
    csize = a.chunksize
    isnothing(csize) && return DiskArrays.estimate_chunksize(a)
    return GridChunks((a.s,), (csize,))
end

# Copy `N` bytes of `src`, starting at linear index `so`, into `dest` starting at
# linear index `desto`, by reading the range straight from the backing file.
#
# Returns `dest`, as `Base.copyto!` is documented to do (the previous version
# returned the temporary view handed to `readblock!`).
function Base.copyto!(dest::AbstractArray, desto::Int, src::SimpleFileDiskArray, so::Int, N::Int)
    # Nothing to copy: avoid opening the file at all.
    N == 0 && return dest
    destv = view(dest, range(desto, length=N))
    DiskArrays.readblock!(src, destv, range(so, length=N))
    return dest
end

using ZipArchives
file = "../testzip.zip"
r = ZipReader(SimpleFileDiskArray(file))

entries = zip_names(r)
@time data = zip_readentry(r,entries[2])
Char.(data)

Maybe the SimpleFileDiskArray could be added to DiskArrays.jl in case it proves to be useful.

3 Likes

Ok. Thanks for clarifying @nhz2.

So mmap means it is not necessary to read the whole zip archive into memory but it is still necessary to read the individual entry in. That’s still quite helpful if an Excel file has several very large sheets, but less so if it is only one sheet.

1 Like