Here is a summary of some of the solutions presented so far in this thread.
Most refer to the lammps “dump file” format, so here is a code that generates an identical file, for reproducibility.
Creating file
# CREATING FILE
filename = "generated_file.lammpstrj"
steps::Int64 = 1e2
natoms::Int64 = 1e3
open(filename, "w") do f
for i in 1:steps
println(f, "ITEM: TIMESTEP")
println(f, "$i")
println(f, "ITEM: NUMBER OF ATOMS")
println(f, "$natoms")
println(f, "ITEM: BOX BOUNDS pp pp pp")
println(f, "$(rand()) $(rand())")
println(f, "$(rand()) $(rand())")
println(f, "$(rand()) $(rand())")
println(f, "ITEM: ATOMS id type xs ys zs")
for j in 1:natoms
println(f, "$j 1 $(rand()) $(rand()) $(rand())")
end
end
end
Here are some ways to read the data from the file.
CSV skiplines loop
#Too slow, dont even bother!
for i in 1:steps
boxlengths = CSV.read(filename, DataFrame, skipto=floor(Int, (i - 1) * (natoms + 9) + 6), limit=natoms, header=false, delim=' ')
positions = CSV.read(filename, DataFrame, skipto=floor(Int, (i - 1) * (natoms + 9) + 10), limit=natoms, header=false, delim=' ')
end
Numpy.loadtxt
np = pyimport("numpy")
@benchmark for i in 1:steps
boxlengths = pyconvert(Matrix, np.loadtxt(filename, skiprows=floor(Int,(i - 1) * (natoms + 9) + 5), max_rows=3, delimiter=' '))
positions = pyconvert(Matrix, np.loadtxt(filename, skiprows=floor(Int, (i - 1) * (natoms + 9) + 9), max_rows=natoms, delimiter=' '))
end
BenchmarkTools.Trial: 3 samples with 1 evaluation.
Range (min … max): 1.958 s … 1.970 s ┊ GC (min … max): 0.00% … 0.00%
Time (median): 1.963 s ┊ GC (median): 0.00%
Time (mean ± σ): 1.964 s ± 5.719 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
█ █ █
█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
1.96 s Histogram: frequency by time 1.97 s <
Memory estimate: 4.12 MiB, allocs estimate: 9196.
read line by line, parse to Vector{Vector{Float}}
boxlengthsv::Vector{Float64} = zeros(3)
positionsv::Vector{Vector{Float64}} = [zeros(3) for _ in 1:natoms]
@benchmark open(filename) do io
for s in 1:steps
# Skip headers
for _ in 1:5
readline(io)
end
# Read box bounds
for i in 1:3
boxlengthsv[i] = sum(abs.(parse.(Float64, split(readline(io), ' '))))
end
# Skip header
readline(io)
# Read positions
for i in 1:natoms
positionsv[i] = parse.(Float64, split(readline(io), ' '))[3:5]
end
#Do calculations here
end
end
BenchmarkTools.Trial: 21 samples with 1 evaluation.
Range (min … max): 238.321 ms … 251.300 ms ┊ GC (min … max): 1.62% … 2.71%
Time (median): 243.535 ms ┊ GC (median): 1.48%
Time (mean ± σ): 243.903 ms ± 3.424 ms ┊ GC (mean ± σ): 1.88% ± 0.78%
▁ ▁ ▁ ▁▁ ▁▁ ▁▁ ▁▁ ▁ ▁ █ ▁ ▁ ▁▁ ▁ ▁
█▁▁█▁▁▁▁▁█▁██▁██▁▁██▁▁▁██▁█▁▁█▁▁█▁▁█▁▁▁█▁▁▁▁██▁▁▁▁▁█▁▁▁▁▁▁▁▁█ ▁
238 ms Histogram: frequency by time 251 ms <
Memory estimate: 50.70 MiB, allocs estimate: 502007.
@SteffenPL method
@benchmark open(filename) do f
while !eof(f)
readuntil(f, "ITEM: BOX BOUNDS")
boxsize = CSV.read(IOBuffer(readuntil(f, "ITEM: ATOMS")), DataFrame, header=false, skipto=2)
positions = CSV.read(IOBuffer(readuntil(f, "ITEM: ")), DataFrame, header=false, skipto=2)
end
end
BenchmarkTools.Trial: 12 samples with 1 evaluation.
Range (min … max): 297.871 ms … 617.403 ms ┊ GC (min … max): 0.00% … 1.59%
Time (median): 423.428 ms ┊ GC (median): 0.69%
Time (mean ± σ): 423.796 ms ± 88.632 ms ┊ GC (mean ± σ): 0.84% ± 0.77%
█ █ █ █ █ █ ███ █ █ █
█▁█▁▁▁▁█▁▁▁▁▁▁▁▁▁▁█▁█▁█▁▁███▁▁▁▁▁▁█▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
298 ms Histogram: frequency by time 617 ms <
Memory estimate: 26.47 MiB, allocs estimate: 142004.
@rafael.guerra solution using a structure
Base.@kwdef mutable struct MyData
timestep::Vector{Int} = Int[]
natoms::Int = 0
boxsize::Matrix{Float64} = Array{Float64}(undef, 0, 0)
positions::DataFrame = DataFrame()
end
M = MyData()
# Read repeated items only once:
open(filename) do f
readuntil(f, "ITEM: NUMBER OF ATOMS")
M.natoms = parse(Int, readuntil(f, "ITEM: BOX BOUNDS pp pp pp"))
M.boxsize = Matrix{Float64}(CSV.read(IOBuffer(readuntil(f, "ITEM: ATOMS")), DataFrame, header=false))
end
# Read varying items:
@benchmark open(filename) do f
readline(f)
while !eof(f)
push!(M.timestep, parse(Int, readuntil(f, "ITEM: NUMBER OF ATOMS")))
readuntil(f, "ITEM: ATOMS")
M.positions = vcat(M.positions, CSV.read(IOBuffer(readuntil(f, "ITEM: TIMESTEP")), DataFrame; delim=' ', ignorerepeated=true))
end
end
# Access loaded data with:
M.timestep
M.natoms
M.boxsize
M.positions
M.positions.zs
BenchmarkTools.Trial: 3 samples with 1 evaluation.
Range (min … max): 1.586 s … 1.973 s ┊ GC (min … max): 16.45% … 28.38%
Time (median): 1.832 s ┊ GC (median): 16.99%
Time (mean ± σ): 1.797 s ± 195.888 ms ┊ GC (mean ± σ): 21.00% ± 6.74%
█ █ █
█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
1.59 s Histogram: frequency by time 1.97 s <
Memory estimate: 3.19 GiB, allocs estimate: 128727.
@rocco_sprmnt21 solutions
Using tuples
np = Vector{NTuple{3,Float64}}()
@benchmark open(file) do f
while !eof(f)
readuntil(f, "ITEM: ATOMS id type xs ys zs\n")
m = CSV.read(IOBuffer(readuntil(f, "ITEM: TIMESTEP")), CSV.Tables.matrix; delim=' ', header=false, ignorerepeated=true)
np = [(r[3], r[4], r[5]) for r in eachrow(m)]
# append!(positions,np )
end
end
nchmarkTools.Trial: 8 samples with 1 evaluation.
Range (min … max): 638.898 ms … 704.015 ms ┊ GC (min … max): 0.40% … 0.71%
Time (median): 656.561 ms ┊ GC (median): 0.65%
Time (mean ± σ): 661.835 ms ± 20.798 ms ┊ GC (mean ± σ): 0.63% ± 0.19%
▁ ▁ ▁█ ▁ ▁ ▁
█▁█▁▁▁▁▁▁▁▁▁▁▁▁██▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
639 ms Histogram: frequency by time 704 ms <
Memory estimate: 77.66 MiB, allocs estimate: 154600.
Using SVectors
using StaticArrays
currblock = Vector{SVector{3,Float64}}()
@benchmark open(filename) do f
readuntil(f, "ITEM: ATOMS")
while !eof(f)
currblock = [SA[r[3], r[4], r[5]] for r in eachrow(CSV.read(IOBuffer(readuntil(f, "ITEM: TIMESTEP")), CSV.Tables.matrix; delim=' ', ignorerepeated=true))]
# append!(positions,currblock )
readuntil(f, "ITEM: ATOMS")
end
end
BenchmarkTools.Trial: 18 samples with 1 evaluation.
Range (min … max): 282.862 ms … 298.363 ms ┊ GC (min … max): 0.00% … 0.57%
Time (median): 291.831 ms ┊ GC (median): 0.56%
Time (mean ± σ): 291.904 ms ± 4.664 ms ┊ GC (mean ± σ): 0.39% ± 0.33%
▁ ▁ ▁▁ ▁ ▁ █ ▁ ▁ ▁ ▁▁ █ ▁ ▁ ▁
█▁▁▁▁▁▁▁▁▁█▁▁▁▁▁██▁▁█▁▁▁█▁▁█▁▁▁▁▁█▁▁▁█▁▁▁▁▁▁▁█▁██▁▁▁▁█▁█▁▁█▁█ ▁
283 ms Histogram: frequency by time 298 ms <
Memory estimate: 29.11 MiB, allocs estimate: 109039.
@djholiver Arrow example
using Arrow, CSV, DataFrames, BenchmarkTools
arrowFile = "lammpDump.arrow"
Base.@kwdef struct Lammps
timestep = 0
natoms = 0
boxsize#::Matrix{Float64} = Array{Float64}(undef, 0, 0)
positions#::Matrix{Float64} = Array{Float64}(undef, 0, 0)
end
ArrowTypes.arrowname(::Type{Lammps}) = :Lammps
ArrowTypes.JuliaType(::Val{:Lammps}) = Lammps
atomColumnMappings = Dict(("id" => 1), ("type" => 2), ("xs" => 3), ("ys" => 4), ("zs" => 5))
function getAtomItem(itemName, atomColumnMappings, positionColumn, numberOfAtoms)
columnIndex = atomColumnMappings[itemName]
indexStart = ((columnIndex - 1) * numberOfAtoms) + 1
indexEnd = columnIndex * numberOfAtoms
@view positionColumn[indexStart:indexEnd]
end
struct LammpsResolved
timestep
natoms
boxsize
positions_id
positions_type
positions_xs
positions_ys
positions_zs
LammpsResolved(lammp , atomColumnMappings) = new(
lammp.timestep,
lammp.natoms,
lammp.boxsize,
getAtomItem("id", atomColumnMappings, lammp.positions, lammp.natoms),
getAtomItem("type", atomColumnMappings, lammp.positions, lammp.natoms),
getAtomItem("xs", atomColumnMappings, lammp.positions, lammp.natoms),
getAtomItem("ys", atomColumnMappings, lammp.positions, lammp.natoms),
getAtomItem("zs", atomColumnMappings, lammp.positions, lammp.natoms)
)
end
function parseLammpToArrow(file, streamBatchSize=50)
open(file) do f
let writer = open(Arrow.Writer, arrowFile)
local batchSize = streamBatchSize
local batch = 0
local firstBatch = true
local timesteps = Vector{Int}()
local lammpsCollection = Vector{Lammps}()
while !eof(f)
timeStep = parse(Int, split(readuntil(f, "ITEM: NUMBER OF ATOMS"), "\n")[2])
natoms = parse(Int, split(readuntil(f, "ITEM: BOX BOUNDS pp pp pp"), "\n")[2])
boxsize = Matrix{Float64}(CSV.read(IOBuffer(readuntil(f, "ITEM: ATOMS")), DataFrame, header=false))
positions = Matrix{Float64}(CSV.read(IOBuffer(readuntil(f, "ITEM: TIMESTEP")), DataFrame; delim=' ', ignorerepeated=true))
l = Lammps(timeStep, natoms, boxsize, positions)
push!(timesteps, timeStep)
push!(lammpsCollection, l)
batch += 1
if batch == batchSize
table = (timeStep=timesteps, lammps=lammpsCollection)
Arrow.write(writer, table)#,ntasks=1;maxdepth=100)
batch = 0
timesteps = Vector{Int}()
lammpsCollection = Vector{Lammps}()
end
end
if length(timesteps) > 0
table = (timeStep=timesteps, lammps=lammpsCollection)
Arrow.write(writer, table)
end
close(writer)
end
end
end
parseLammpToArrow(filename)
a = Arrow.Table(arrowFile)
#can now access by column and index in < ms
b = a[2]
#@code_warntype getAtomItem("id", atomColumnMappings, b[1].positions, b[1].natoms)
function iterateB(b)
atomColumnMappings :: Dict{String, Int} = Dict(("id" => 1), ("type" => 2), ("xs" => 3), ("ys" => 4), ("zs" => 5))
for i in 1:length(b)
l = LammpsResolved(b[i],atomColumnMappings)
end
end
iterateB(a[2])
@benchmark iterateB(a[2])
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max): 20.500 μs … 2.437 ms ┊ GC (min … max): 0.00% … 91.02%
Time (median): 24.700 μs ┊ GC (median): 0.00%
Time (mean ± σ): 38.492 μs ± 72.942 μs ┊ GC (mean ± σ): 5.69% ± 3.76%
█▇▆▄▃▂▂▂▃▂▁▂▁▂▂▂▂▂▁▁▁▁ ▁
███████████████████████▇▇▆▆▇▇▅▆▅▆▆▅▆▅▆▅▅▅▅▅▅▅▆▆▆▆▆▆▆▅▆▅▅▅▅▆ █
20.5 μs Histogram: log(frequency) by time 168 μs <
Memory estimate: 78.64 KiB, allocs estimate: 804.
function iterateBThreaded(b)
atomColumnMappings :: Dict{String, Int} = Dict(("id" => 1), ("type" => 2), ("xs" => 3), ("ys" => 4), ("zs" => 5))
Base.Threads.@threads for i in 1:length(b)
l = LammpsResolved(b[i] , atomColumnMappings)
end
end
@show Base.Threads.nthreads()
@benchmark iterateBThreaded(a[2])
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max): 22.100 μs … 1.337 ms ┊ GC (min … max): 0.00% … 94.74%
Time (median): 24.000 μs ┊ GC (median): 0.00%
Time (mean ± σ): 27.120 μs ± 35.097 μs ┊ GC (mean ± σ): 4.89% ± 3.85%
▇██▇▄▄▃▂▂▁▂▁ ▂
██████████████▇▆▆▆▅▆▅██▆▆▆▇▇▆▅▆▆▆▄▄▄▄▅▅▆▅▆▅▅▄▅▄▅▅▄▆▆▄▄▆▆▅▄▅ █
22.1 μs Histogram: log(frequency) by time 65.6 μs <
Memory estimate: 79.33 KiB, allocs estimate: 810.
@jules parsers (very fast)
using Parsers
using BenchmarkTools
using Mmap
@inline function skipline(io)
skipchars(!=('\n'), io)
skip(io, 1)
end
@inline function _parse(T, io)
v = Parsers.parse(T, io)
skip(io, 1) # either space or newline
return v
end
struct Atom
id::Int
type::Int
x::Float64
y::Float64
z::Float64
end
function read_step(io)
skipline(io)
timestep = _parse(Int, io)
skipline(io)
natoms = _parse(Int, io)
skipline(io)
box_bounds = ntuple(Val(3)) do _
x = _parse(Float64, io)
y = _parse(Float64, io)
(; x, y)
end
skipline(io)
atoms = Vector{Atom}(undef, natoms)
for i in eachindex(atoms)
id = _parse(Int, io)
type = _parse(Int, io)
x = _parse(Float64, io)
y = _parse(Float64, io)
z = _parse(Float64, io)
atoms[i] = Atom(id, type, x, y, z)
end
(; timestep, natoms, box_bounds, atoms)
end
function read_lammps(file)
steps = 100
open(file, "r") do handle
m = Mmap.mmap(handle)
io = IOBuffer(m)
for i in 1:steps
data = read_step(io)
end
end
end
@benchmark read_lammps("generated_file.lammpstrj")
BenchmarkTools.Trial: 305 samples with 1 evaluation.
Range (min … max): 14.076 ms … 23.616 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 15.816 ms ┊ GC (median): 0.00%
Time (mean ± σ): 16.374 ms ± 1.896 ms ┊ GC (mean ± σ): 1.16% ± 3.21%
█▆▄▃▃▃▆▂ ▂▁
▄▅▆▇█████████████▄▆▇▄▇▄▃▃▄▁▄▃▃▄▄▃▁▃▁▃▁▁▄▁▃▃▁▁▁▄▄▁▁▁▁▁▃▁▃▁▄▃ ▄
14.1 ms Histogram: frequency by time 23.1 ms <
Memory estimate: 3.82 MiB, allocs estimate: 215.
Methods not tested yet:
SQlite, recommended by @mnemnion
DuckDB, recommended by @rdavis120
Hope I didn’t miss anything!