Skipping a lot of lines in CSV.read() allocates too much memory

,

I followed, as far as I could understand, the discussion on the possibilities and limitations of mmap.
For this reason and to gain experience with some aspects of IO, I tried to “play” with the following script which only uses the basic functions and the float parsing function as defined by @mkitti

It has no dependencies and seems very fast:
julia> using BenchmarkTools

julia> file="IO_lammpstr.txt"
"IO_lammpstr.txt"

julia> function pmk(bytes::AbstractVector{UInt8}, _start::Int = 1, _end::Int = length(bytes))
           # Parse the ASCII decimal number stored in bytes[_start:_end] into a Float64.
           #
           # Accepted: an optional leading '+' or '-', digits, and a '.' separator.
           # Not supported: exponent notation ("1e-5"); the bytes are not validated,
           # so anything that is not '.' is treated as a digit.
           # Returns NaN when the range holds no digit bytes at all.
           negative = false
           first_digit = _start
           if _start <= _end && (bytes[_start] == 0x2d || bytes[_start] == 0x2b)
               negative = bytes[_start] == 0x2d
               first_digit += 1
           end
           first_digit > _end && return NaN
           int_val = 0
           # Divisor for the fractional digits. It stays 1 until a '.' is seen, so a
           # number without a decimal point ("42") gives 42.0 rather than 42/0 == Inf.
           power = 1
           seen_dot = false
           for i in first_digit:_end
               byte = bytes[i]
               if byte == 0x2e
                   seen_dot = true
                   power = 1
               else
                   int_val = 10 * int_val + (byte - 0x30)
                   seen_dot && (power *= 10)
               end
           end
           q = int_val / power
           return negative ? -q : q
       end
pmk (generic function with 3 methods)

julia> function fn(c, ch, s)
           # Scan ch from index s to its last index and return the first position
           # whose element equals c; `nothing` is returned when there is none.
           for pos in s:lastindex(ch)
               if c == ch[pos]
                   return pos
               end
           end
           return nothing
       end
fn (generic function with 1 method)

julia> function chnkpmk(ch)
           # Parse the records of one byte chunk `ch` into (x, y, z) tuples.
           #
           # Every record is located purely by its separators: three blanks are
           # skipped/used to delimit the 3rd and 4th field, and the 5th field runs
           # up to the next newline, i.e. "<f1> <f2> <x> <y> <z>\n".  A record that
           # lacks one of these separators makes `fn` return `nothing`, and the
           # index arithmetic below then throws.
           #
           # The result always has at least 2000 entries; entries no record was
           # written to stay (0.0, 0.0, 0.0).  When the chunk holds more than 2000
           # records the vector grows instead of raising a BoundsError.
           res = fill((0.0, 0.0, 0.0), 2000)
           nrec = 0
           ez = 1
           while ez < length(ch)
               ei1 = fn(0x20, ch, ez)         # blank after the 1st field
               ei2 = fn(0x20, ch, ei1 + 1)    # blank after the 2nd field
               ex = fn(0x20, ch, ei2 + 1)     # blank after x
               x = pmk(ch, ei2 + 1, ex - 1)
               ey = fn(0x20, ch, ex + 1)      # blank after y
               y = pmk(ch, ex + 1, ey - 1)
               ez = fn(0x0a, ch, ey)          # newline that ends the record
               z = pmk(ch, ey + 1, ez - 1)
               nrec += 1
               if nrec <= length(res)
                   res[nrec] = (x, y, z)
               else
                   push!(res, (x, y, z))
               end
           end
           return res
       end
chnkpmk (generic function with 1 method)

julia> function atomsb(buffer, L)
           # Parse every complete ATOMS section found in the first L bytes of `buffer`.
           #
           # A section starts behind the header `from` and counts as complete only
           # when an "ITEM: TIMESTEP" line follows it within those L bytes.
           #
           # `buffer` is modified: it is cut back to L bytes, so bytes left over from
           # an earlier, longer read are never searched and the buffer does not grow
           # on every call; then one copy of `to` is appended as a sentinel, which
           # guarantees that the search for `to` below always succeeds.
           #
           # Returns the index of the last header byte of the first section that is
           # NOT complete, or L when the valid data holds no such header (instead of
           # failing inside `last(nothing)`).
           from = codeunits("TEM: ATOMS id type xs ys zs")
           to = codeunits("ITEM: TIMESTEP")
           lto = length(to)
           resize!(buffer, L)
           append!(buffer, to)
           header = findfirst(from, buffer)
           while header !== nothing
               f = last(header)
               t = last(findnext(to, buffer, f))   # the sentinel makes this a hit
               t > L && return f                   # section runs past the valid data
               # The view starts two bytes behind the header and stops at the last
               # byte in front of the "ITEM: TIMESTEP" line that closes the section.
               chnkpmk(@view buffer[f+2:t-lto])
               header = findnext(from, buffer, t)
           end
           return L
       end
atomsb (generic function with 1 method)

julia> function fa(file, chunk=10^6)
           # Read `file` in pieces of at most `chunk` bytes and let atomsb parse the
           # ATOMS sections that are complete inside each piece.
           #
           # atomsb returns the index of the last header byte of the first section it
           # could not finish.  The next piece starts at the FIRST byte of that header,
           # so the section cut by the chunk border is found again and parsed in full;
           # resuming behind the header would silently drop it.
           #
           # Throws ArgumentError when a piece cannot hold one complete section, since
           # no progress would be possible.  The file is closed on every exit path.
           #
           # NOTE(review): a section that is ended by end-of-file instead of a
           # following "ITEM: TIMESTEP" line (the last one in the file) is still not
           # parsed, because atomsb only accepts sections closed by that line.
           hdrlen = ncodeunits("TEM: ATOMS id type xs ys zs")   # same text as `from` in atomsb
           buffer = UInt8[]
           open(file) do io
               pos = 0                       # 0-based file offset of the current piece
               while !eof(io)
                   L = readbytes!(io, buffer, chunk)
                   p = atomsb(buffer, L)
                   eof(io) && break          # the rest of the file was in this piece
                   step = p - hdrlen         # 0-based offset of the header inside the piece
                   step > 0 || throw(ArgumentError(
                       "chunk=$chunk is too small to hold one complete ATOMS section"))
                   pos += step
                   seek(io, pos)
               end
           end
           return nothing
       end
fa (generic function with 2 methods)

julia>  @btime fa(file)
  24.145 ms (391 allocations: 10.73 MiB)

julia>  @btime fa(file,3*10^5)
  21.692 ms (318 allocations: 7.95 MiB)

julia>  @btime fa(file,10^5)
  8.434 ms (16 allocations: 247.91 KiB)

I’m not entirely sure that something isn’t lost in the transitions from one chunk to another. :laughing:
But since it’s very fast and shows how performance can vary depending on the size of the chunk, maybe it’s worth testing it a bit.

below are some tests to see how many steps per chunk are “processed”

tests

function atomsbtest(buffer, L)
    # Counting variant of the section parser: parse every complete ATOMS section in
    # the first L bytes of `buffer` and report how far it got.
    #
    # `buffer` is modified: it is cut back to L bytes (stale bytes from an earlier,
    # longer read are never searched and the buffer does not grow on every call) and
    # one copy of `to` is appended as a sentinel so the search for `to` always hits.
    #
    # Returns (index, i): `index` is the last header byte of the first section that
    # is not complete within the L bytes, or L when the valid data holds no such
    # header (instead of failing inside `last(nothing)`); `i` starts at 1 and is
    # raised once per parsed section, i.e. it is the number of parsed sections + 1.
    from = codeunits("TEM: ATOMS id type xs ys zs")
    to = codeunits("ITEM: TIMESTEP")
    lto = length(to)
    resize!(buffer, L)
    append!(buffer, to)
    i = 1
    header = findfirst(from, buffer)
    while header !== nothing
        f = last(header)
        t = last(findnext(to, buffer, f))   # the sentinel makes this a hit
        t > L && return (f, i)              # section runs past the valid data
        chnkpmk(@view buffer[f+2:t-lto])
        i += 1
        header = findnext(from, buffer, t)
    end
    return (L, i)
end
function fatest(file, chunk=10^6)
    # Diagnostic driver: read `file` in pieces of at most `chunk` bytes, run
    # atomsbtest on each piece and collect the counter it reports per piece.
    #
    # The next piece starts at the first byte of the header of the section that
    # atomsbtest could not finish, so a section cut by the chunk border is parsed
    # from the following piece instead of being skipped.  Throws ArgumentError when
    # a piece cannot hold one complete section.  The file is closed on every exit.
    hdrlen = ncodeunits("TEM: ATOMS id type xs ys zs")   # same text as `from` in atomsbtest
    buffer = UInt8[]
    cn = Int[]
    open(file) do io
        pos = 0                       # 0-based file offset of the current piece
        while !eof(io)
            L = readbytes!(io, buffer, chunk)
            (p, i) = atomsbtest(buffer, L)
            push!(cn, i)
            eof(io) && break          # the rest of the file was in this piece
            step = p - hdrlen
            step > 0 || throw(ArgumentError(
                "chunk=$chunk is too small to hold one complete ATOMS section"))
            pos += step
            seek(io, pos)
        end
    end
    return cn
end

julia> function atomsbtest(buffer, L)
           # Counting variant of the section parser: parse every complete ATOMS
           # section in the first L bytes of `buffer` and report how far it got.
           #
           # `buffer` is cut back to L bytes and one copy of `to` is appended as a
           # sentinel, so stale bytes are never searched and the search for `to`
           # always hits.
           #
           # Returns (index, i): `index` is the last header byte of the first section
           # that is not complete within the L bytes, or L when the valid data holds
           # no such header; `i` is the number of parsed sections + 1.
           from = codeunits("TEM: ATOMS id type xs ys zs")
           to = codeunits("ITEM: TIMESTEP")
           lto = length(to)
           resize!(buffer, L)
           append!(buffer, to)
           i = 1
           header = findfirst(from, buffer)
           while header !== nothing
               f = last(header)
               t = last(findnext(to, buffer, f))   # the sentinel makes this a hit
               t > L && return (f, i)              # section runs past the valid data
               chnkpmk(@view buffer[f+2:t-lto])
               i += 1
               header = findnext(from, buffer, t)
           end
           return (L, i)
       end
atomsbtest (generic function with 1 method)

julia> function fatest(file, chunk=10^6)
           # Diagnostic driver: read `file` in pieces of at most `chunk` bytes, run
           # atomsbtest on each piece and collect the counter it reports per piece.
           #
           # The next piece starts at the first byte of the header of the section
           # that atomsbtest could not finish, so a section cut by the chunk border is
           # parsed from the following piece instead of being skipped.  Throws
           # ArgumentError when a piece cannot hold one complete section.  The file
           # is closed on every exit path.
           hdrlen = ncodeunits("TEM: ATOMS id type xs ys zs")   # same text as `from` in atomsbtest
           buffer = UInt8[]
           cn = Int[]
           open(file) do io
               pos = 0                       # 0-based file offset of the current piece
               while !eof(io)
                   L = readbytes!(io, buffer, chunk)
                   (p, i) = atomsbtest(buffer, L)
                   push!(cn, i)
                   eof(io) && break          # the rest of the file was in this piece
                   step = p - hdrlen
                   step > 0 || throw(ArgumentError(
                       "chunk=$chunk is too small to hold one complete ATOMS section"))
                   pos += step
                   seek(io, pos)
               end
           end
           return cn
       end
fatest (generic function with 2 methods)

julia> fatest(file)
15-element Vector{Any}:
 16
 15
 15
 15
 15
 15
 15
 15
 15
 15
 15
 15
 15
  5
  1

julia> fatest(file,3*10^5)
51-element Vector{Any}:
 5
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 ⋮
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 1

julia> fatest(file,10^5)
201-element Vector{Any}:
 2
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 ⋮
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1

This text will be hidden