Skipping a lot of lines in CSV.read() allocates too much memory

aris · February 20, 2024, 1:05pm

Here is a summary of some of the solutions presented so far in this thread.

Most refer to the lammps “dump file” format, so here is a code that generates an identical file, for reproducibility.

Creating file

# CREATING FILE
filename = "generated_file.lammpstrj"

steps::Int64 = 1e2
natoms::Int64 = 1e3
open(filename, "w") do f
    for i in 1:steps
        println(f, "ITEM: TIMESTEP")
        println(f, "$i")
        println(f, "ITEM: NUMBER OF ATOMS")
        println(f, "$natoms")
        println(f, "ITEM: BOX BOUNDS pp pp pp")
        println(f, "$(rand()) $(rand())")
        println(f, "$(rand()) $(rand())")
        println(f, "$(rand()) $(rand())")
        println(f, "ITEM: ATOMS id type xs ys zs")
        for j in 1:natoms
            println(f, "$j 1 $(rand()) $(rand()) $(rand())")
        end
    end
end

Here are some ways to read the data from the file.

CSV skiplines loop


#Too slow, dont even bother!

for i in 1:steps
    boxlengths = CSV.read(filename, DataFrame, skipto=floor(Int, (i - 1) * (natoms + 9) + 6), limit=natoms, header=false, delim=' ')
    positions = CSV.read(filename, DataFrame, skipto=floor(Int, (i - 1) * (natoms + 9) + 10), limit=natoms, header=false, delim=' ')
end

Numpy.loadtxt

np = pyimport("numpy")

@benchmark for i in 1:steps
    boxlengths = pyconvert(Matrix, np.loadtxt(filename, skiprows=floor(Int,(i - 1) * (natoms + 9) + 5), max_rows=3, delimiter=' '))
    positions = pyconvert(Matrix, np.loadtxt(filename, skiprows=floor(Int, (i - 1) * (natoms + 9) + 9), max_rows=natoms, delimiter=' '))
end



BenchmarkTools.Trial: 3 samples with 1 evaluation.
 Range (min … max):  1.958 s …  1.970 s  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     1.963 s             ┊ GC (median):    0.00%
 Time  (mean ± σ):   1.964 s ± 5.719 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

  █                       █                              █  
  █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
  1.96 s        Histogram: frequency by time        1.97 s <

 Memory estimate: 4.12 MiB, allocs estimate: 9196.

read line by line, parse to Vector{Vector{Float}}

boxlengthsv::Vector{Float64} = zeros(3)
positionsv::Vector{Vector{Float64}} = [zeros(3) for _ in 1:natoms]

@benchmark open(filename) do io

    for s in 1:steps

        # Skip headers
        for _ in 1:5
            readline(io)
        end

        # Read box bounds
        for i in 1:3
            boxlengthsv[i] = sum(abs.(parse.(Float64, split(readline(io), ' '))))
        end

        # Skip header
        readline(io)

        # Read positions
        for i in 1:natoms
            positionsv[i] = parse.(Float64, split(readline(io), ' '))[3:5]
        end

        #Do calculations here

    end

end

BenchmarkTools.Trial: 21 samples with 1 evaluation.
 Range (min … max):  238.321 ms … 251.300 ms  ┊ GC (min … max): 1.62% … 2.71%
 Time  (median):     243.535 ms               ┊ GC (median):    1.48%
 Time  (mean ± σ):   243.903 ms ±   3.424 ms  ┊ GC (mean ± σ):  1.88% ± 0.78%

  ▁  ▁     ▁ ▁▁ ▁▁  ▁▁   ▁▁ ▁  ▁  █  ▁   ▁    ▁▁     ▁        ▁
  █▁▁█▁▁▁▁▁█▁██▁██▁▁██▁▁▁██▁█▁▁█▁▁█▁▁█▁▁▁█▁▁▁▁██▁▁▁▁▁█▁▁▁▁▁▁▁▁█ ▁
  238 ms           Histogram: frequency by time          251 ms <

 Memory estimate: 50.70 MiB, allocs estimate: 502007.

@SteffenPL method

@benchmark open(filename) do f

    while !eof(f)
        readuntil(f, "ITEM: BOX BOUNDS")
        boxsize = CSV.read(IOBuffer(readuntil(f, "ITEM: ATOMS")), DataFrame, header=false, skipto=2)
        positions = CSV.read(IOBuffer(readuntil(f, "ITEM: ")), DataFrame, header=false, skipto=2)

    end
end

BenchmarkTools.Trial: 12 samples with 1 evaluation.
 Range (min … max):  297.871 ms … 617.403 ms  ┊ GC (min … max): 0.00% … 1.59%
 Time  (median):     423.428 ms               ┊ GC (median):    0.69%
 Time  (mean ± σ):   423.796 ms ±  88.632 ms  ┊ GC (mean ± σ):  0.84% ± 0.77%

  █ █    █          █ █ █  ███      █      █                  █
  █▁█▁▁▁▁█▁▁▁▁▁▁▁▁▁▁█▁█▁█▁▁███▁▁▁▁▁▁█▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
  298 ms           Histogram: frequency by time          617 ms <

 Memory estimate: 26.47 MiB, allocs estimate: 142004.

@rafael.guerra solution using a structure

Base.@kwdef mutable struct MyData
    timestep::Vector{Int} = Int[]
    natoms::Int = 0
    boxsize::Matrix{Float64} = Array{Float64}(undef, 0, 0)
    positions::DataFrame = DataFrame()
end

M = MyData()

# Read repeated items only once:
open(filename) do f
    readuntil(f, "ITEM: NUMBER OF ATOMS")
    M.natoms = parse(Int, readuntil(f, "ITEM: BOX BOUNDS pp pp pp"))
    M.boxsize = Matrix{Float64}(CSV.read(IOBuffer(readuntil(f, "ITEM: ATOMS")), DataFrame, header=false))
end

# Read varying items:
@benchmark open(filename) do f
    readline(f)
    while !eof(f)
        push!(M.timestep, parse(Int, readuntil(f, "ITEM: NUMBER OF ATOMS")))
        readuntil(f, "ITEM: ATOMS")
        M.positions = vcat(M.positions, CSV.read(IOBuffer(readuntil(f, "ITEM: TIMESTEP")), DataFrame; delim=' ', ignorerepeated=true))
    end
end

# Access loaded data with:

M.timestep
M.natoms
M.boxsize
M.positions
M.positions.zs

BenchmarkTools.Trial: 3 samples with 1 evaluation.
 Range (min … max):  1.586 s …    1.973 s  ┊ GC (min … max): 16.45% … 28.38%
 Time  (median):     1.832 s               ┊ GC (median):    16.99%
 Time  (mean ± σ):   1.797 s ± 195.888 ms  ┊ GC (mean ± σ):  21.00% ±  6.74%

  █                                   █                    █
  █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
  1.59 s         Histogram: frequency by time         1.97 s <

 Memory estimate: 3.19 GiB, allocs estimate: 128727.

@rocco_sprmnt21 solutions

Using tuples

np = Vector{NTuple{3,Float64}}()
@benchmark open(file) do f
    while !eof(f)
        readuntil(f, "ITEM: ATOMS id type xs ys zs\n")
        m = CSV.read(IOBuffer(readuntil(f, "ITEM: TIMESTEP")), CSV.Tables.matrix; delim=' ', header=false, ignorerepeated=true)
        np = [(r[3], r[4], r[5]) for r in eachrow(m)]
        # append!(positions,np )
    end
end


nchmarkTools.Trial: 8 samples with 1 evaluation.
 Range (min … max):  638.898 ms … 704.015 ms  ┊ GC (min … max): 0.40% … 0.71%
 Time  (median):     656.561 ms               ┊ GC (median):    0.65%
 Time  (mean ± σ):   661.835 ms ±  20.798 ms  ┊ GC (mean ± σ):  0.63% ± 0.19%

  ▁ ▁            ▁█        ▁        ▁                         ▁
  █▁█▁▁▁▁▁▁▁▁▁▁▁▁██▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
  639 ms           Histogram: frequency by time          704 ms <

 Memory estimate: 77.66 MiB, allocs estimate: 154600.

Using SVectors

using StaticArrays


currblock = Vector{SVector{3,Float64}}()
@benchmark open(filename) do f
    readuntil(f, "ITEM: ATOMS")
    while !eof(f)
        currblock = [SA[r[3], r[4], r[5]] for r in eachrow(CSV.read(IOBuffer(readuntil(f, "ITEM: TIMESTEP")), CSV.Tables.matrix; delim=' ', ignorerepeated=true))]
        # append!(positions,currblock )
        readuntil(f, "ITEM: ATOMS")
    end
end



BenchmarkTools.Trial: 18 samples with 1 evaluation.
 Range (min … max):  282.862 ms … 298.363 ms  ┊ GC (min … max): 0.00% … 0.57%
 Time  (median):     291.831 ms               ┊ GC (median):    0.56%
 Time  (mean ± σ):   291.904 ms ±   4.664 ms  ┊ GC (mean ± σ):  0.39% ± 0.33%

  ▁         ▁     ▁▁  ▁   ▁  █     ▁   ▁       ▁ ▁▁    █ ▁  ▁ ▁
  █▁▁▁▁▁▁▁▁▁█▁▁▁▁▁██▁▁█▁▁▁█▁▁█▁▁▁▁▁█▁▁▁█▁▁▁▁▁▁▁█▁██▁▁▁▁█▁█▁▁█▁█ ▁
  283 ms           Histogram: frequency by time          298 ms <

 Memory estimate: 29.11 MiB, allocs estimate: 109039.

@djholiver Arrow example

using Arrow, CSV, DataFrames, BenchmarkTools

arrowFile = "lammpDump.arrow"

Base.@kwdef struct Lammps
    timestep = 0
    natoms = 0
    boxsize#::Matrix{Float64} = Array{Float64}(undef, 0, 0)
    positions#::Matrix{Float64} = Array{Float64}(undef, 0, 0)
end
ArrowTypes.arrowname(::Type{Lammps}) = :Lammps
ArrowTypes.JuliaType(::Val{:Lammps}) = Lammps
atomColumnMappings = Dict(("id" => 1), ("type" => 2), ("xs" => 3), ("ys" => 4), ("zs" => 5))


function getAtomItem(itemName, atomColumnMappings, positionColumn, numberOfAtoms)

    columnIndex = atomColumnMappings[itemName]
    indexStart = ((columnIndex - 1) * numberOfAtoms) + 1
    indexEnd = columnIndex * numberOfAtoms
    @view positionColumn[indexStart:indexEnd]

end
struct LammpsResolved
    timestep 
    natoms 
    boxsize 
    positions_id 
    positions_type 
    positions_xs 
    positions_ys 
    positions_zs 
    
     LammpsResolved(lammp , atomColumnMappings) = new(
        lammp.timestep,
        lammp.natoms,
        lammp.boxsize,
        getAtomItem("id", atomColumnMappings, lammp.positions, lammp.natoms),
        getAtomItem("type", atomColumnMappings, lammp.positions, lammp.natoms),
        getAtomItem("xs", atomColumnMappings, lammp.positions, lammp.natoms),
        getAtomItem("ys", atomColumnMappings, lammp.positions, lammp.natoms),
        getAtomItem("zs", atomColumnMappings, lammp.positions, lammp.natoms)
        )
end

function parseLammpToArrow(file, streamBatchSize=50)
    open(file) do f
        let writer = open(Arrow.Writer, arrowFile)
            local batchSize = streamBatchSize
            local batch = 0
            local firstBatch = true
            local timesteps = Vector{Int}()
            local lammpsCollection = Vector{Lammps}()

            while !eof(f)

                timeStep = parse(Int, split(readuntil(f, "ITEM: NUMBER OF ATOMS"), "\n")[2])
                natoms = parse(Int, split(readuntil(f, "ITEM: BOX BOUNDS pp pp pp"), "\n")[2])
                boxsize = Matrix{Float64}(CSV.read(IOBuffer(readuntil(f, "ITEM: ATOMS")), DataFrame, header=false))
                positions = Matrix{Float64}(CSV.read(IOBuffer(readuntil(f, "ITEM: TIMESTEP")), DataFrame; delim=' ', ignorerepeated=true))

                l = Lammps(timeStep, natoms, boxsize, positions)
                push!(timesteps, timeStep)
                push!(lammpsCollection, l)
                batch += 1
                if batch == batchSize
                    table = (timeStep=timesteps, lammps=lammpsCollection)
                    Arrow.write(writer, table)#,ntasks=1;maxdepth=100)
                    batch = 0
                    timesteps = Vector{Int}()
                    lammpsCollection = Vector{Lammps}()
                end
            end
            if length(timesteps) > 0
                table = (timeStep=timesteps, lammps=lammpsCollection)
                Arrow.write(writer, table)
            end
            close(writer)
        end
    end
end

parseLammpToArrow(filename)

a = Arrow.Table(arrowFile)

#can now access by column and index in < ms 
b = a[2]


#@code_warntype getAtomItem("id", atomColumnMappings, b[1].positions, b[1].natoms) 
function iterateB(b)
    atomColumnMappings :: Dict{String, Int} = Dict(("id" => 1), ("type" => 2), ("xs" => 3), ("ys" => 4), ("zs" => 5))
   
    for i in 1:length(b)
     l = LammpsResolved(b[i],atomColumnMappings)
    end
end
iterateB(a[2])
@benchmark iterateB(a[2])

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  20.500 μs …  2.437 ms  ┊ GC (min … max): 0.00% … 91.02%
 Time  (median):     24.700 μs              ┊ GC (median):    0.00%
 Time  (mean ± σ):   38.492 μs ± 72.942 μs  ┊ GC (mean ± σ):  5.69% ±  3.76%

  █▇▆▄▃▂▂▂▃▂▁▂▁▂▂▂▂▂▁▁▁▁                                      ▁
  ███████████████████████▇▇▆▆▇▇▅▆▅▆▆▅▆▅▆▅▅▅▅▅▅▅▆▆▆▆▆▆▆▅▆▅▅▅▅▆ █
  20.5 μs      Histogram: log(frequency) by time       168 μs <

 Memory estimate: 78.64 KiB, allocs estimate: 804.



function iterateBThreaded(b)
    atomColumnMappings :: Dict{String, Int} = Dict(("id" => 1), ("type" => 2), ("xs" => 3), ("ys" => 4), ("zs" => 5))
    Base.Threads.@threads for i in 1:length(b)
        l = LammpsResolved(b[i] , atomColumnMappings)
    end
end

@show Base.Threads.nthreads()
@benchmark iterateBThreaded(a[2])
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  22.100 μs …  1.337 ms  ┊ GC (min … max): 0.00% … 94.74%
 Time  (median):     24.000 μs              ┊ GC (median):    0.00%
 Time  (mean ± σ):   27.120 μs ± 35.097 μs  ┊ GC (mean ± σ):  4.89% ±  3.85%

  ▇██▇▄▄▃▂▂▁▂▁                                                ▂
  ██████████████▇▆▆▆▅▆▅██▆▆▆▇▇▆▅▆▆▆▄▄▄▄▅▅▆▅▆▅▅▄▅▄▅▅▄▆▆▄▄▆▆▅▄▅ █
  22.1 μs      Histogram: log(frequency) by time      65.6 μs <

 Memory estimate: 79.33 KiB, allocs estimate: 810.

@jules parsers (very fast)

using Parsers
using BenchmarkTools
using Mmap

@inline function skipline(io)
    skipchars(!=('\n'), io)
    skip(io, 1)
end

@inline function _parse(T, io)
    v = Parsers.parse(T, io)
    skip(io, 1) # either space or newline
    return v
end

struct Atom
    id::Int
    type::Int
    x::Float64
    y::Float64
    z::Float64
end

function read_step(io)
    skipline(io)
    timestep = _parse(Int, io)
    skipline(io)
    natoms = _parse(Int, io)
    skipline(io)
    box_bounds = ntuple(Val(3)) do _
        x = _parse(Float64, io)
        y = _parse(Float64, io)
        (; x, y)
    end
    skipline(io)
    atoms = Vector{Atom}(undef, natoms)
    for i in eachindex(atoms)
        id = _parse(Int, io)
        type = _parse(Int, io)
        x = _parse(Float64, io)
        y = _parse(Float64, io)
        z = _parse(Float64, io)
        atoms[i] = Atom(id, type, x, y, z)
    end
    (; timestep, natoms, box_bounds, atoms)
end

function read_lammps(file)
    steps = 100
    open(file, "r") do handle
        m = Mmap.mmap(handle)
        io = IOBuffer(m)
        for i in 1:steps
            data = read_step(io)

        end
    end
end

@benchmark read_lammps("generated_file.lammpstrj")

BenchmarkTools.Trial: 305 samples with 1 evaluation.
 Range (min … max):  14.076 ms … 23.616 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     15.816 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   16.374 ms ±  1.896 ms  ┊ GC (mean ± σ):  1.16% ± 3.21%

       █▆▄▃▃▃▆▂ ▂▁
  ▄▅▆▇█████████████▄▆▇▄▇▄▃▃▄▁▄▃▃▄▄▃▁▃▁▃▁▁▄▁▃▃▁▁▁▄▄▁▁▁▁▁▃▁▃▁▄▃ ▄
  14.1 ms         Histogram: frequency by time        23.1 ms <

 Memory estimate: 3.82 MiB, allocs estimate: 215.

Methods not tested yet:
SQlite, recommended by @mnemnion
DuckDB, recommended by @rdavis120

Hope I didn’t miss anything!

Topic		Replies	Views
Why DataFrames v.0.21.2 (julia v1.4.2) requires more memory than the previous version Performance dataframes	22	2282	June 29, 2020
.csv number of rows Data csv	6	3284	September 13, 2022
Reading a few rows from a BIG CSV file General Usage dataframes , csv , big-data	39	4526	January 18, 2024
How can I split large data using a faster and more efficient function (data science)? New to Julia csv	9	805	October 27, 2022
CSV.Row very slow for reading files line by line Performance package , csv	0	282	May 9, 2023

Skipping a lot of lines in CSV.read() allocates too much memory

Related topics