Skipping a lot of lines in CSV.read() allocates too much memory

rocco_sprmnt21 · February 23, 2024, 11:30am

I followed, as far as I could understand, the discussion on the possibilities and limitations of mmap.
For this reason and to gain experience with some aspects of IO, I tried to “play” with the following script which only uses the basic functions and the float parsing function as defined by @mkitti

no dependencies and seems very fast

julia> using BenchmarkTools

julia> file="IO_lammpstr.txt"
"IO_lammpstr.txt"

julia> function pmk(bytes::AbstractVector{UInt8}, _start::Int = 1, _end::Int = length(bytes))
           int_val = 0
           power = 0
           for i in _start:_end
               byte = bytes[i]
               if byte != 0x2e
                   int_val *= 10
                   int_val += byte - 0x30
                   power *= 10
               else
                   power = 1
               end
           end
           q = int_val / power 
           return q
       end
pmk (generic function with 3 methods)

julia> function fn(c,ch, s)
           for i in s:lastindex(ch)
               c==ch[i] && return i
           end
       end
fn (generic function with 1 method)

julia> function chnkpmk(ch)
           res=[(0.,0.,0.) for _ in 1:2000]
           ez=i=1
           while ez <length(ch)
               ei1=fn(0x20,ch,ez)
               ei2=fn(0x20,ch,ei1+1)
               ex=fn(0x20,ch,ei2+1)
               x=pmk(ch,ei2+1,ex-1)
               ey=fn(0x20,ch,ex+1)
               y=pmk(ch,ex+1,ey-1)
               ez=fn(0x0a,ch,ey)
               z=pmk(ch,ey+1,ez-1)
               res[i]=(x,y,z)
               i+=1
           end
           res
       end
chnkpmk (generic function with 1 method)

julia> function atomsb(buffer, L)
           from=codeunits("TEM: ATOMS id type xs ys zs")
           to=codeunits("ITEM: TIMESTEP")
           lto=length(to)
           append!(buffer,to)
           t=last(findfirst(to,buffer))
           f=last(findfirst(from,buffer))
           while true
               t=last(findnext(to,buffer,f))
               ch=@view buffer[f+2:t-lto]
               t>L&& (return f)
               chnkpmk(ch)
               f=last(findnext(from,buffer,t))
           end

       end
atomsb (generic function with 1 method)

julia> function fa(file, chunk=10^6)
           buffer=[0x0]
           io=open(file)
           pos=1
           while !eof(io)
           L=readbytes!(io, buffer,chunk)
           p=atomsb(buffer, L)
           pos+=p
           seek(io,pos)
           end
       end
fa (generic function with 2 methods)

julia>  @btime fa(file)
  24.145 ms (391 allocations: 10.73 MiB)

julia>  @btime fa(file,3*10^5)
  21.692 ms (318 allocations: 7.95 MiB)

julia>  @btime fa(file,10^5)
  8.434 ms (16 allocations: 247.91 KiB)

I’m not entirely sure that something isn’t lost in the transitions from one chunk to another.
But since it’s very fast and shows how performance can vary depending on the size of the chin k, maybe it’s worth testing it a bit

below are some tests to see how many steps per chunk are “processed”

tests


function atomsbtest(buffer, L)
    from=codeunits("TEM: ATOMS id type xs ys zs")
    to=codeunits("ITEM: TIMESTEP")
    lto=length(to)
    append!(buffer,to)
    t=last(findfirst(to,buffer))
    f=last(findfirst(from,buffer))
    i=1
    while true
        t=last(findnext(to,buffer,f))
        ch=@view buffer[f+2:t-lto]
        t>L&& (return (f,i))
        chnkpmk(ch)
        i+=1
        f=last(findnext(from,buffer,t))
    end

end
function fatest(file, chunk=10^6)
    buffer=[0x0]
    io=open(file)
    cn=[]
    pos=1
    while !eof(io)
    L=readbytes!(io, buffer,chunk)
    (p,i)=atomsb(buffer, L)
    pos+=p
    push!(cn,i)
    seek(io,pos)
    end
    cnjulia> function atomsbtest(buffer, L)
           from=codeunits("TEM: ATOMS id type xs ys zs")
           to=codeunits("ITEM: TIMESTEP")
           lto=length(to)
           append!(buffer,to)
           t=last(findfirst(to,buffer))
           f=last(findfirst(from,buffer))
           i=1
           while true
               t=last(findnext(to,buffer,f))
               ch=@view buffer[f+2:t-lto]
               t>L&& (return (f,i))
               chnkpmk(ch)
               i+=1
               f=last(findnext(from,buffer,t))
           end

       end
atomsbtest (generic function with 1 method)

julia> function fatest(file, chunk=10^6)
           buffer=[0x0]
           io=open(file)
           cn=[]
           pos=1
           while !eof(io)
           L=readbytes!(io, buffer,chunk)
           (p,i)=atomsbtest(buffer, L)
           pos+=p
           push!(cn,i)
           seek(io,pos)
           end
           cn
       end
fatest (generic function with 2 methods)

julia> fatest(file)
15-element Vector{Any}:
 16
 15
 15
 15
 15
 15
 15
 15
 15
 15
 15
 15
 15
  5
  1

julia> fatest(file,3*10^5)
51-element Vector{Any}:
 5
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 ⋮
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 1

julia> fatest(file,10^5)
201-element Vector{Any}:
 2
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 ⋮
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1

This text will be hidden

Topic		Replies	Views
Why DataFrames v.0.21.2 (julia v1.4.2) requires more memory than the previous version Performance dataframes	22	2277	June 29, 2020
.csv number of rows Data csv	6	3273	September 13, 2022
Reading a few rows from a BIG CSV file General Usage dataframes , csv , big-data	39	4508	January 18, 2024
How can I split large data using a faster and more efficient function (data science)? New to Julia csv	9	796	October 27, 2022
CSV.Row very slow for reading files line by line Performance package , csv	0	280	May 9, 2023

Skipping a lot of lines in CSV.read() allocates too much memory

Related topics