How to do custom allocators

In the everlasting quest to reduce allocations I stumbled upon problems with node-based data structures. Constructing a test case based on DataStructures.jl’s singly linked list I tried to eliminate allocations in construction and iteration. After understanding push!/pop! always does a ccall · Issue #24909 · JuliaLang/julia · GitHub I have the following code

import Base.iterate
import Base.getindex
import Base.empty!
using BenchmarkTools

abstract type List{T}
end
struct Nil{T} <: List{T}
end
struct Cons{T} <: List{T}
    car::T
    cdr::LT where LT <: List{T}
end

function nil(t::T) where T
    Nil{T}()
end
function cons(car::T, cdr) where T
    Cons{T}(car, cdr)
end

function iterate(::Cons{T}, cdr::Nil{T}) where T
    nothing
end
function iterate(list::Nil{T}) where T
    nothing
end
function iterate(::Cons{T}, cdr::Cons{T}) where T
    cdr.car, cdr.cdr
end
function iterate(list::Cons{T}) where T
    list.car, list.cdr
end

mutable struct Allocator{T, I}
    store::Vector{T}
    current::I
    function Allocator{T, I}(n) where {T, I}
        new{T, I}(Vector{T}(undef, n), 0)
    end
end

function allocate!(allocator::Allocator{T, I}, t::T) where {T, I}
    allocator.current += 1
    allocator.store[allocator.current] = t
    allocator.current
end

function getindex(allocator::Allocator{T, I}, i::I) where {T, I}
    allocator.store[i]
end

function empty!(allocator::Allocator{T, I}) where {T, I}
    allocator.current = 0
end

function nil(t::T, allocator::Allocator{Tuple{T, I}, I}) where {T, I}
    allocate!(allocator, (t, 0))
end
function cons(car::T, cdr, allocator::Allocator{Tuple{T, I}, I}) where {T, I}
    allocate!(allocator, (car, cdr))
end

struct ListIterator{T, I}
    i::I
    allocator::Allocator{Tuple{T, I}, I}
    function ListIterator{T, I}(i::I, allocator::Allocator{Tuple{T, I}, I}) where {T, I}
        new{T, I}(i, allocator)
    end
end

function iterate(list::ListIterator{T, I}, i) where {T, I}
    if i == 0
        nothing
    else
        list.allocator[i]
    end
end
function iterate(list::ListIterator{T, I}) where {T, I}
    if list.i == 0
        nothing
    else
        list.allocator[list.i]
    end
end

function test1()
    l = nil(0)
    for i in 1:1000
        for j in 1:1000
            l = cons(j, l)
        end
        s = 0
        for t in l
            s += t
        end
        @assert s == 1000 * 1001 / 2
        l = nil(0)
    end
    l
end

function test2()
    allocator = Allocator{Tuple{Int, Int}, Int}(1001)
    l = nil(0, allocator)
    for i in 1:1000
        for i in 1:1000
            l = cons(i, l, allocator)
        end
        s = 0
        for t in ListIterator{Int, Int}(l, allocator)
            s += t
        end
        @assert s == 1000 * 1001 / 2
        empty!(allocator)
        l = nil(0, allocator)
    end
    l
end

@btime test1()
@btime test2()
println()

with the result

  83.025 ms (3977000 allocations: 106.43 MiB)
  2.246 ms (1 allocation: 15.81 KiB)

Do you see any chance to generalize this kind of optimization or is this a case specific one? I would be interested in allocating Nil{T} and Cons{T} directly in the backing store but my attempts failed.

Not answering your question, but isn’t cdr here abstract, thus complicating everything? Wouldn’t it be better if you parameterized LT as well?

Thanks Leandro for your observation,

replacing Nil{T} and Cons{T} with the original definition from DataStructures.jl

mutable struct Nil{T} <: List{T}
end
mutable struct Cons{T} <: List{T}
    car::T
    cdr::List{T}
end

yields the result

  65.883 ms (2979001 allocations: 75.97 MiB)
  2.421 ms (1 allocation: 15.81 KiB)

Interestingly defining them immutable is a bit slower.

That is still abstract (I think). You should probably use something like:

Thanks for the clarification, replacing the simply linked list code with

abstract type List{T}
end
mutable struct Nil{T} <: List{T}
end
mutable struct Cons{T, LT} <: List{T}
    car::T
    cdr::LT
end

function nil(t::T) where T
    Nil{T}()
end
function cons(car::T, cdr::LT) where {T, LT}
    Cons{T, LT}(car, cdr)
end

function iterate(::Cons{T, LT}, cdr::Nil{T}) where {T, LT}
    nothing
end
function iterate(list::Nil{T}) where T
    nothing
end
function iterate(::Cons{T, LT1}, cdr::Cons{T, LT2}) where {T, LT1, LT2}
    cdr.car, cdr.cdr
end
function iterate(list::Cons{T, LT})  where {T, LT}
    list.car, list.cdr
end

yields

  622.273 ms (2979001 allocations: 75.97 MiB)
  2.423 ms (1 allocation: 15.81 KiB)
1 Like

I’m sorry if I’m just creating noise here. The only reason I see for test1 to allocate is a type instability associated with l changing from being of type Nil to type Cons there.

I’m not completely sure because I’m not with the computer here, though.

Note that this will essentially end up being like a Tuple but maybe slower, because you’ll end up encoding the length of the linked list recursively in the type signature.

Thanks Leandro,

profiling the code gives no indication of excessive dynamic dispatch. I believe the difference in timings is more akin to the difference between StaticArrays and Arrays. If I print the type of l I get

Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, 
Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, 
Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, 
Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, 
Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, 
Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, 
Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, 
Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, 
Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, 
Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, 
Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Cons{Int64, Nil{Int64}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
1 Like

I fixed some bugs, found a slight stylistic improvement and added tests for strings

import Base.iterate
import Base.getindex
import Base.empty!
using BenchmarkTools

mutable struct Allocator{T, I}
    store::Vector{T}
    current::I
    function Allocator{T, I}(n) where {T, I}
        new{T, I}(Vector{T}(undef, n), 0)
    end
end

function allocate!(allocator::Allocator{T, I}, t::T) where {T, I}
    allocator.current += 1
    allocator.store[allocator.current] = t
    allocator.current
end

function getindex(allocator::Allocator{T, I}, i::I) where {T, I}
    allocator.store[i]
end

function empty!(allocator::Allocator{T, I}) where {T, I}
    allocator.current = 0
end

abstract type List{T}
end
mutable struct Nil{T} <: List{T}
    function Nil{T}(t::T, allocator) where T
        allocate!(allocator, (t, 0))
    end
end
mutable struct Cons{T} <: List{T}
    car::T
    cdr::List{T}
    function Cons{T}(car::T, cdr) where T
        new{T}(car, cdr)
    end
    function Cons{T}(car::T, cdr, allocator) where T
        allocate!(allocator, (car, cdr))
    end
end

const Node{T, I} = Tuple{T, I}

function nil(t::T) where T
    Nil{T}()
end
function cons(car::T, cdr) where T
    Cons{T}(car, cdr)
end

function iterate(::Cons{T}, cdr::Nil{T}) where T
    nothing
end
function iterate(list::Nil{T}) where T
    nothing
end
function iterate(::Cons{T}, cdr::Cons{T}) where T
    cdr.car, cdr.cdr
end
function iterate(list::Cons{T}) where T
    list.car, list.cdr
end

function nil(t::T, allocator::Allocator{Node{T, I}, I}) where {T, I}
    Nil{T}(t::T, allocator)
end
function cons(car::T, cdr, allocator::Allocator{Node{T, I}, I}) where {T, I}
    Cons{T}(car, cdr, allocator)
end

struct ListIterator{T, I}
    i::I
    allocator::Allocator{Node{T, I}, I}
    function ListIterator{T, I}(i::I, allocator::Allocator{Node{T, I}, I}) where {T, I}
        new{T, I}(i, allocator)
    end
end

function iterate(list::ListIterator{T, I}, i) where {T, I}
    if list.allocator[i][2] == 0
        nothing
    else
        list.allocator[i]
    end
end
function iterate(list::ListIterator{T, I}) where {T, I}
    if list.allocator[list.i][2] == 0
        nothing
    else
        list.allocator[list.i]
    end
end

function test1()
    l = nil(0)
    for i in 1:1000
        for j in 1:1000
            l = cons(j, l)
        end
        s = 0
        for t in l
            s += t
        end
        @assert s == 1000 * 1001 / 2
        l = nil(0)
    end
    l
end

function test2()
    allocator = Allocator{Node{Int, Int}, Int}(1001)
    l = nil(0, allocator)
    for i in 1:1000
        for j in 1:1000
            l = cons(j, l, allocator)
        end
        s = 0
        for t in ListIterator{Int, Int}(l, allocator)
            s += t
        end
        @assert s == 1000 * 1001 / 2
        empty!(allocator)
        l = nil(0, allocator)
    end
    l
end

function test3()
    l = nil("")
    for i in 1:1000
        for j in 1:1000
            l = cons(string(j), l)
        end
        s = 0
        for t in l
            s += parse(Int, t)
        end
        @assert s == 1000 * 1001 / 2
        l = nil("")
    end
    l
end

function test4()
    allocator = Allocator{Node{String, Int}, Int}(1001)
    l = nil("", allocator)
    for i in 1:1000
        for j in 1:1000
            l = cons(string(j), l, allocator)
        end
        s = 0
        for t in ListIterator{String, Int}(l, allocator)
            s += parse(Int, t)
        end
        @assert s == 1000 * 1001 / 2
        empty!(allocator)
        l = nil("", allocator)
    end
    l
end

@btime test1()
@btime test2()
@btime test3()
@btime test4()
println()

resulting in

  67.187 ms (2979001 allocations: 75.97 MiB)
  2.093 ms (1 allocation: 15.81 KiB)
  116.661 ms (4001001 allocations: 152.60 MiB)
  65.889 ms (3000002 allocations: 122.09 MiB)

Now I’d only like to improve the iteration.

Fixing a regression and improving iterators results in the code

import Base.iterate
import Base.getindex
import Base.empty!
using BenchmarkTools

mutable struct Allocator{T, I}
    store::Vector{T}
    current::I
    function Allocator{T, I}(n) where {T, I}
        new{T, I}(Vector{T}(undef, n), 0)
    end
end

function allocate!(allocator::Allocator{T, I}, t::T) where {T, I}
    allocator.current += 1
    allocator.store[allocator.current] = t
    allocator.current, allocator
end

function getindex(allocator::Allocator{T, I}, i::I) where {T, I}
    allocator.store[i]
end

function empty!(allocator::Allocator{T, I}) where {T, I}
    allocator.current = 0
end

const Node{T, I} = Tuple{T, I}

struct NodeIterator{T, I}
    i::I
    allocator::Allocator{Node{T, I}, I}
    function NodeIterator{T, I}(i::I, allocator::Allocator{Node{T, I}, I}) where {T, I}
        new{T, I}(i, allocator)
    end
end

abstract type List{T}
end
mutable struct Nil{T} <: List{T}
    function Nil{T}() where T
        new{T}()
    end
end
function Nil{T}(t::T, allocator::Allocator{Node{T, I}, I}) where {T, I}
    NodeIterator{T, I}(allocate!(allocator, (t, 0))...)
end
mutable struct Cons{T} <: List{T}
    car::T
    cdr::List{T}
    function Cons{T}(car::T, cdr) where T
        new{T}(car, cdr)
    end
end
function Cons{T}(car::T, cdr, allocator::Allocator{Node{T, I}, I}) where {T, I}
    NodeIterator{T, I}(allocate!(allocator, (car, cdr.i))...)
end

function nil(t::T) where T
    Nil{T}()
end
function cons(car::T, cdr) where T
    Cons{T}(car, cdr)
end

function iterate(::Cons{T}, cdr::Nil{T}) where T
    nothing
end
function iterate(list::Nil{T}) where T
    nothing
end
function iterate(::Cons{T}, cdr::Cons{T}) where T
    cdr.car, cdr.cdr
end
function iterate(list::Cons{T}) where T
    list.car, list.cdr
end

function nil(t::T, allocator::Allocator{Node{T, I}, I}) where {T, I}
    Nil{T}(t::T, allocator)
end
function cons(car::T, cdr, allocator::Allocator{Node{T, I}, I}) where {T, I}
    Cons{T}(car, cdr, allocator)
end

function iterate(node::NodeIterator{T, I}, i) where {T, I}
    if node.allocator[i][2] == 0
        nothing
    else
        node.allocator[i]
    end
end
function iterate(node::NodeIterator{T, I}) where {T, I}
    if node.allocator[node.i][2] == 0
        nothing
    else
        node.allocator[node.i]
    end
end

function test1()
    l = nil(0)
    for i in 1:1000
        for j in 1:1000
            l = cons(j, l)
        end
        s = 0
        for t in l
            s += t
        end
        @assert s == 1000 * 1001 / 2
        l = nil(0)
    end
    l
end

function test2()
    allocator = Allocator{Node{Int, Int}, Int}(1001)
    l = nil(0, allocator)
    for i in 1:1000
        for j in 1:1000
            l = cons(j, l, allocator)
        end
        s = 0
        for t in l
            s += t
        end
        @assert s == 1000 * 1001 / 2
        empty!(allocator)
        l = nil(0, allocator)
    end
    l
end

function test3()
    l = nil("")
    for i in 1:1000
        for j in 1:1000
            l = cons(string(j), l)
        end
        s = 0
        for t in l
            s += parse(Int, t)
        end
        @assert s == 1000 * 1001 / 2
        l = nil("")
    end
    l
end

function test4()
    allocator = Allocator{Node{String, Int}, Int}(1001)
    l = nil("", allocator)
    for i in 1:1000
        for j in 1:1000
            l = cons(string(j), l, allocator)
        end
        s = 0
        for t in l
            s += parse(Int, t)
        end
        @assert s == 1000 * 1001 / 2
        empty!(allocator)
        l = nil("", allocator)
    end
    l
end

@btime test1()
@btime test2()
@btime test3()
@btime test4()
println()

with similar runtime performance. This looks reasonable to me.

I may be missing something but isn’t a simple definition already allocation-free on iteration?

struct NIL end

mutable struct Cons{T}
    car::T
    cdr::Union{NIL,Cons{T}}
end

const nil = NIL()
const List{T} = Union{NIL, Cons{T}}

car(x::Cons) = x.car
cdr(x::Cons) = x.cdr
cons(car::T, cdr::NIL) where {T} = Cons{T}(car, cdr)
cons(car, cdr::Cons{T}) where {T} = Cons{T}(car, cdr)

Base.iterate(list::List, rest=list) = rest === nil ? nothing : (car(rest), cdr(rest))

# benchmarking constructor
@benchmark foldl((cdr, car) -> cons(car, cdr), 1:1000; init=nil)
# 1000 allocs

# benchmarking iteration
list = foldl((cdr, car) -> cons(car, cdr), 1:1000; init=nil)
@benchmark sum($list)
# 0 allocs
1 Like

Hi Vasily,

thanks for checking this out. The last step was getting rid of the iterator wrapper in user code

using BenchmarkTools

mutable struct Allocator{T, I}
    store::Vector{T}
    current::I
    function Allocator{T, I}(n) where {T, I}
        new{T, I}(Vector{T}(undef, n), 0)
    end
end

function allocate!(allocator::Allocator{T, I}, t::T) where {T, I}
    allocator.current += 1
    allocator.store[allocator.current] = t
    allocator.current, allocator
end

function Base.getindex(allocator::Allocator{T, I}, i::I) where {T, I}
    allocator.store[i]
end

function Base.empty!(allocator::Allocator{T, I}) where {T, I}
    allocator.current = 0
end

const Node{T, I} = Tuple{T, I}

struct NodeIterator{T, I}
    i::I
    allocator::Allocator{Node{T, I}, I}
    function NodeIterator{T, I}(i::I, allocator::Allocator{Node{T, I}, I}) where {T, I}
        new{T, I}(i, allocator)
    end
end

mutable struct Nil{T}
    function Nil{T}() where T
        new{T}()
    end
end
mutable struct Cons{T}
    car::T
    cdr::Union{Nil{T},Cons{T}}
end

const List{T} = Union{Nil{T}, Cons{T}}

nil(t::T) where {T} = Nil{T}()
car(x::Cons) = x.car
cdr(x::Cons) = x.cdr
cons(car::T, cdr::Nil{T}) where {T} = Cons{T}(car, cdr)
cons(car, cdr::Cons{T}) where {T} = Cons{T}(car, cdr)

function nil(t::T, allocator::Allocator{Node{T, I}, I}) where {T, I}
    NodeIterator{T, I}(allocate!(allocator, (t, 0))...)
end
function cons(car::T, cdr, allocator::Allocator{Node{T, I}, I}) where {T, I}
    NodeIterator{T, I}(allocate!(allocator, (car, cdr.i))...)
end

Base.iterate(list::List, rest=list) = rest === nil ? nothing : (car(rest), cdr(rest))

function Base.iterate(node::NodeIterator{T, I}, i) where {T, I}
    if node.allocator[i][2] == 0
        nothing
    else
        node.allocator[i]
    end
end
function Base.iterate(node::NodeIterator{T, I}) where {T, I}
    if node.allocator[node.i][2] == 0
        nothing
    else
        node.allocator[node.i]
    end
end

# benchmarking constructor
@btime (allocator = Allocator{Node{Int, Int}, Int}(1001);
            foldl((cdr, car) -> cons(car, cdr, allocator), 1:1000; init=nil(0, allocator)))
# 2 allocs

# benchmarking iteration
allocator = Allocator{Node{Int, Int}, Int}(1001)
list = foldl((cdr, car) -> cons(car, cdr, allocator), 1:1000; init=nil(0, allocator))
@btime sum($list)
# 0 allocs

with the result

  1.320 μs (2 allocations: 15.84 KiB)
  1.720 μs (0 allocations: 0 bytes)

Have you looked at the deque implementation in DataStructures.jl?
I think it uses the same idea.

Yes, I looked into DataStructures.jl and saw this idea used in DataStructures.jl/balanced_tree.jl at master · JuliaCollections/DataStructures.jl · GitHub for example. But there are other node based containers in DataStructures.jl which do not use bulk allocations AFAIU and the allocation mechanism is not customizable (using free lists or supporting multithreading …).

1 Like

Hi Vasily,

it looks to me like your definition of a list

mutable struct Cons{T}
    car::T
    cdr::Union{Nothing, Cons{T}}
end
const List{T} = Union{Nothing, Cons{T}}

allows for a more performant implementation than DataStructures.jl’s

abstract type LinkedList{T} end

mutable struct Nil{T} <: LinkedList{T}
end

mutable struct Cons{T} <: LinkedList{T}
    head::T
    tail::LinkedList{T}
end

This reduces the advantage of using allocators to around a factor of two in my micro benchmarks.

Nice to know that :slight_smile:
I think it’s because I manually define List as a small union and not an abstract type.

The linked list implementation in DataStructures.jl seems very basic, without much care about optimization. I guess it’s not widely used, otherwise there’d be something more clever.

How do you plan using allocators with dynamically changed nodes? E.g. when a node is deleted, or a new one added inbetween the existing etc.

Honestly linked lists are almost always a really inefficient datastructure. You waste a bunch of memory storing pointers, and you are making unpredictable memory accesses since you have to jump from node to node. Also, you lose the ability to do vectorized comparisons which will slow you down a lot. If you actually wanted a good datastructure with similar performance characteristics to a linked list, you would store multiple entries per node to reduce the cost of the pointer chasing.

1 Like

Hi Vasily,

you asked:

How do you plan using allocators with dynamically changed nodes? E.g. when a node is deleted, or a new one added inbetween the existing etc.

that would be an allocator keeping an internal free list. Didn’t bother for now because there is lots of existing art out there, I believe.

Edit: just committed an example for an allocator managing a free list.

I think, I have some numbers regarding the original question.

My rather artificial tests:

function build(n)
    tree = nil()
    for i in 1:n
        tree = insert(tree, i)
    end
    tree
end

function find(n, tree)
    for i in 1:n
        @assert haskey(tree, i)
    end
end

function build(n, alloc)
    tree = nil(alloc)
    for i in 1:n
        tree = insert(tree, i, alloc)
    end
    tree
end

function find(n, tree, alloc)
    for i in 1:n
        @assert haskey(tree, i, alloc)
    end
end

const test_M = 1000000
const test_N = 1

function test10()
    tree = Set{Int}()
    for i in 1:test_M
        for j in 1:test_N
            push!(tree, j)
        end
        for j in 1:test_N
            @assert j in tree
        end
        empty!(tree)
        # tree = Set{Int}()
    end
end

function test11()
    tree = DataStructures.SortedSet{Int}()
    for i in 1:test_M
        for j in 1:test_N
            push!(tree, j)
        end
        for j in 1:test_N
            @assert haskey(tree, j)
        end
        empty!(tree)
        # tree = DataStructures.SortedSet{Int}()
    end
end

function test12()
    for i in 1:test_M
        tree = build(test_N)
        find(test_N, tree)
    end
end

function test13()
    alloc = StaticAllocator{TreeNode{Int, Int}, Int}(test_N)
    for i in 1:test_M
        tree = build(test_N, alloc)
        find(test_N, tree, alloc)
        emptyend!(alloc)
    end
end

function test14()
    alloc = VariableAllocator{TreeNode{Int, Int}, Int}()
    for i in 1:test_M
        tree = build(test_N, alloc)
        find(test_N, tree, alloc)
        emptyend!(alloc)
    end
end

GC.gc()
@btime test10()
GC.gc()
@btime test11()
GC.gc()
@btime test12()
GC.gc()
@btime test13()
GC.gc()
@btime test14()
GC.gc()

produce the numbers

  28.198 ms (4 allocations: 480 bytes)
  99.674 ms (2000013 allocations: 152.59 MiB)
  10.848 ms (1000000 allocations: 45.78 MiB)
  4.905 ms (2 allocations: 672 bytes)
  9.220 ms (2 allocations: 672 bytes)

The (unverified) numbers show to me that I can stop micro-optimizing for now and that allocators could be generally useful (and BTW for realistic workloads the numbers do not look that good;)

Edit: for the sake of completeness a more normal workload

const test_M = 1000
const test_N = 1000

with results

 10.738 ms (17 allocations: 49.56 KiB)
  74.881 ms (2032 allocations: 286.03 KiB)
  229.256 ms (1000000 allocations: 45.78 MiB)
  72.362 ms (3 allocations: 31.38 KiB)
  74.386 ms (8 allocations: 64.03 KiB)

If someone wants to tinker with the code: I’ve uploaded it to GitHub - goerch/Allocators.jl: Examples for the use of allocators in Julia

1 Like