Fast large binary heap for SNIC super-pixel

Hello there,
I’m looking for help in order to optimize an implementation of the SNIC algorithm (Achanta et al 2017) for pre-segmenting image in super-pixels/voxels. The algorithm use a binary heap to get quick access to the pixel/voxel that have the lowest distance to the centroid of the super-pixel/voxel. I first implement it using DataStructures but I noticed that the code was horribly slow due to a lot of allocation (around 1GB for a test on the Cameraman 512x512 image !). I also tried to implement a preallocated (as I know in advance that the heap will never contain more element than pixels/voxels in the image) heap structure using StaticArrays but even so I indeed have no allocation for small heap I still get a lot of allocation when the heap start to be large…

Here is the code for the preallocated binary heap and the MWE showing the problem of allocation when the heap get large:

using StaticArrays

# Preallocated Binary Heap and some utility function for it
mutable struct PreallocatedBinaryHeap{N, T, O <: Base.Ordering}
    ordering::O
    tree::MVector{N, T}
    n::Integer # index of the next entry
    buf::T # buffer to avoid allocation

    function PreallocatedBinaryHeap{N, T}(o::Base.Ordering) where {N, T}
        @assert N > 0 "Preallocated length should be > 0"
        new{N, T, typeof(o)}(o, MVector{N, T}(undef), 1, Vector{T}(undef, 1)[1])
    end

    function PreallocatedBinaryHeap{N, T}(o::Base.Ordering, v::AbstractVector) where {N, T}
        n_el::Integer = length(v)
        @assert N >= n_el "Preallocated length should be greater than vector length"
        tree = MVector{N, T}(undef)
        tree[1:n_el] .= heapify(v, o)
        new{N, T, typeof(o)}(o, tree, n_el + 1, Vector{T}(undef, 1)[1])
    end
end

PreallocatedBinaryHeap(o::Base.Ordering, v::AbstractVector{T}, N::Integer) where T = PreallocatedBinaryHeap{N, T}(o, v)

Base.length(h::PreallocatedBinaryHeap) = h.n - 1
Base.isempty(h::PreallocatedBinaryHeap) = h.n == 1
function isfull(h::PreallocatedBinaryHeap{N, T, O}) where {N, T, O}
    return h.n > N
end

Base.getindex(h::PreallocatedBinaryHeap, i) = begin
    @assert (i <= length(h)) && (i > 0) "Index out of bounds"
    getindex(h.tree, i)
end
Base.iterate(h::PreallocatedBinaryHeap) = (h[1], 2)
Base.iterate(h::PreallocatedBinaryHeap, state) = (state <= length(h) ? (h[state], state + 1) : nothing)
Base.lastindex(h::PreallocatedBinaryHeap) = length(h)

Base.push!(h::PreallocatedBinaryHeap{N, T, O}, v::T) where {N, T, O} = begin
    @assert !isfull(h) "The heap is full"
    h.tree[h.n] = v
    h.n += 1
    percUp!(h, length(h))
end
Base.pop!(h::PreallocatedBinaryHeap) = begin
    @assert !isempty(h) "The heap is empty"
    h.buf = h.tree[1]
    h.tree[1] = h.tree[length(h)]
    h.n -= 1
    percDown!(h, 1)
    return h.buf
end

function percUp!(h::PreallocatedBinaryHeap, i::Integer)
    parent = i >>> 1
    @inbounds while parent > 0
        # Compare to the parent and swap if needed
        if Base.lt(h.ordering, h.tree[i], h.tree[parent])
            h.buf = h.tree[parent]
            #tmp = h.tree[parent]
            h.tree[parent] = h.tree[i]
            h.tree[i] = h.buf #tmp
        end
        i = parent
        parent >>>= 1
    end
end

function percUp!(v::AbstractVector, i::Integer, o::Base.Ordering)
    parent = i >>> 1
    @inbounds while parent > 0
        # Compare to the parent and swap if needed
        if Base.lt(o, v[i], v[parent])
            tmp = v[parent]
            v[parent] = v[i]
            v[i] = tmp
        end
        i = parent
        parent >>>= 1
    end
end

function percDown!(h::PreallocatedBinaryHeap, i::Integer)
    heapsize = length(h)
    @inbounds while (2 * i) <= heapsize
        # find the smallest child according to the ordering
        if (2 * i + 1) > heapsize
            sc = 2 * i
        elseif Base.lt(h.ordering, h.tree[2 * i], h.tree[2 * i + 1])
            sc = 2 * i
        else
            sc = 2 * i + 1
        end

        # swap if needed
        if Base.lt(h.ordering, h.tree[sc], h.tree[i])
            h.buf = h.tree[i]
            #tmp = h.tree[i]
            h.tree[i] = h.tree[sc]
            h.tree[sc] = h.buf #tmp
        end
        i = sc
    end
end

function percDown!(v::AbstractVector, i::Integer, o::Base.Ordering)
    vectsize = length(v)
    @inbounds while (2 * i) <= vectsize
        # find the smallest child according to the ordering
        if (2 * i + 1) > vectsize
            sc = 2 * i
        elseif Base.lt(o, v[2 * i], v[2 * i + 1])
            sc = 2 * i
        else
            sc = 2 * i + 1
        end

        # swap if needed
        if Base.lt(o, v[sc], v[i])
            tmp = v[i]
            v[i] = v[sc]
            v[sc] = tmp
        end
        i = sc
    end
end

function heapify!(v::AbstractVector, o::Base.Ordering)
    for i in div(length(v), 2):-1:1
        percDown!(v, i, o)
    end
end

heapify(v::AbstractVector, o::Base.Ordering) = begin
    v_copy = copyto!(similar(v), v)
    heapify!(v_copy, o)
    return v_copy
end

And the MWE:

@time begin
    # with a small heap
    h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 200)
    while !isfull(h)
        push!(h, rand(UInt8))
    end
    while !isempty(h)
        pop!(h)
    end
end
# after 1 execution to avoid compilation time
# 0.000333 seconds (12 allocations: 912 bytes)
# which correspond to the allocation needed to define the heap structure
@time h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 200)
# 0.000026 seconds (12 allocations: 912 bytes)

@time begin
    # with a large heap
    h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 1_000_000)
    while !isfull(h)
        push!(h, rand(UInt8))
    end
    while !isempty(h)
        pop!(h)
    end
end
# 2.434170 seconds (26.93 M allocations: 411.947 MiB, 2.10% gc time)
# larger than the required allocation for the heap
@time h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 1_000_000)
# 0.000039 seconds (13 allocations: 977.328 KiB)

My guess would be that it have something to do with memory alignment but may be I’m wrong?! I wanted to know if there is a possibility to have this heap structure to be much faster with size that are in the order of the million of element (which a common size for images)?

Actualy I was able to find the answer to my own question: it appears that I just need to tell Julia that the heap size will not change by defining it as a struct and not a mutable struct ! Doing so the gain of speed is consequent. I post here the code:
Few changes on the heap type definition:

struct PreallocatedBinaryHeap{N, T, O <: Base.Ordering}
    ordering::O
    tree::MVector{N, T}
    n::MVector{1, Int}
    buf::MVector{1, T}
    #n::Integer # index of the next entry
    #buf::T # buffer to avoid allocation

    function PreallocatedBinaryHeap{N, T}(o::Base.Ordering) where {N, T}
        @assert N > 0 "Preallocated length should be > 0"
        new{N, T, typeof(o)}(o, MVector{N, T}(undef), ones(MVector{1, Int}), MVector{1, T}(undef))
        #new{N, T, typeof(o)}(o, MVector{N, T}(undef), 1, Vector{T}(undef, 1)[1])
    end

    function PreallocatedBinaryHeap{N, T}(o::Base.Ordering, v::AbstractVector) where {N, T}
        n_el::Integer = length(v)
        @assert N >= n_el "Preallocated length should be greater than vector length"
        tree = MVector{N, T}(undef)
        tree[1:n_el] .= heapify(v, o)
        new{N, T, typeof(o)}(o, tree, MVector{1, Int}(n_el + 1), MVector{1, T}(undef))
        #new{N, T, typeof(o)}(o, tree, n_el + 1, Vector{T}(undef, 1)[1])
    end
end

PreallocatedBinaryHeap(o::Base.Ordering, v::AbstractVector{T}, N::Integer) where T = PreallocatedBinaryHeap{N, T}(o, v)

Base.length(h::PreallocatedBinaryHeap) = h.n[1] - 1 #h.n - 1
Base.isempty(h::PreallocatedBinaryHeap) = h.n[1] == 1 #h.n == 1
function isfull(h::PreallocatedBinaryHeap{N, T, O}) where {N, T, O}
    return h.n[1] > N #h.n > N
end

Base.getindex(h::PreallocatedBinaryHeap, i) = begin
    @assert (i <= length(h)) && (i > 0) "Index out of bounds"
    getindex(h.tree, i)
end
Base.iterate(h::PreallocatedBinaryHeap) = (h[1], 2)
Base.iterate(h::PreallocatedBinaryHeap, state) = (state <= length(h) ? (h[state], state + 1) : nothing)
Base.lastindex(h::PreallocatedBinaryHeap) = length(h)

Base.push!(h::PreallocatedBinaryHeap{N, T, O}, v::T) where {N, T, O} = begin
    @assert !isfull(h) "The heap is full"
    h.tree[h.n[1]] = v #h.tree[h.n] = v
    h.n[1] += 1 #h.n += 1
    percUp!(h, length(h))
end
Base.pop!(h::PreallocatedBinaryHeap) = begin
    @assert !isempty(h) "The heap is empty"
    h.buf[1] = h.tree[1] #h.buf = h.tree[1]
    h.tree[1] = h.tree[length(h)]
    h.n[1] -= 1 #h.n -= 1
    percDown!(h, 1)
    return h.buf[1] #h.buf
end

function percUp!(h::PreallocatedBinaryHeap, i::Integer)
    parent = i >>> 1
    @inbounds while parent > 0
        # Compare to the parent and swap if needed
        if Base.lt(h.ordering, h.tree[i], h.tree[parent])
            h.buf[1] = h.tree[parent] #h.buf = h.tree[parent]
            #tmp = h.tree[parent]
            h.tree[parent] = h.tree[i]
            h.tree[i] = h.buf[1] #h.buf #tmp
        end
        i = parent
        parent >>>= 1
    end
end

function percUp!(v::AbstractVector, i::Integer, o::Base.Ordering)
    parent = i >>> 1
    @inbounds while parent > 0
        # Compare to the parent and swap if needed
        if Base.lt(o, v[i], v[parent])
            tmp = v[parent]
            v[parent] = v[i]
            v[i] = tmp
        end
        i = parent
        parent >>>= 1
    end
end

function percDown!(h::PreallocatedBinaryHeap, i::Integer)
    heapsize = length(h)
    @inbounds while (2 * i) <= heapsize
        # find the smallest child according to the ordering
        if (2 * i + 1) > heapsize
            sc = 2 * i
        elseif Base.lt(h.ordering, h.tree[2 * i], h.tree[2 * i + 1])
            sc = 2 * i
        else
            sc = 2 * i + 1
        end

        # swap if needed
        if Base.lt(h.ordering, h.tree[sc], h.tree[i])
            h.buf[1] = h.tree[i] #h.buf = h.tree[i]
            #tmp = h.tree[i]
            h.tree[i] = h.tree[sc]
            h.tree[sc] = h.buf[1] #h.buf #tmp
        end
        i = sc
    end
end

function percDown!(v::AbstractVector, i::Integer, o::Base.Ordering)
    vectsize = length(v)
    @inbounds while (2 * i) <= vectsize
        # find the smallest child according to the ordering
        if (2 * i + 1) > vectsize
            sc = 2 * i
        elseif Base.lt(o, v[2 * i], v[2 * i + 1])
            sc = 2 * i
        else
            sc = 2 * i + 1
        end

        # swap if needed
        if Base.lt(o, v[sc], v[i])
            tmp = v[i]
            v[i] = v[sc]
            v[sc] = tmp
        end
        i = sc
    end
end

function heapify!(v::AbstractVector, o::Base.Ordering)
    for i in div(length(v), 2):-1:1
        percDown!(v, i, o)
    end
end

heapify(v::AbstractVector, o::Base.Ordering) = begin
    v_copy = copyto!(similar(v), v)
    heapify!(v_copy, o)
    return v_copy
end

And the benchmark:

@time begin
    h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 200)
    while !isfull(h)
        push!(h, rand(UInt8))
    end
    while !isempty(h)
        pop!(h)
    end
end
# 0.000147 seconds (13 allocations: 848 bytes)

@time begin
    h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 1_000_000)
    while !isfull(h)
        push!(h, rand(UInt8))
    end
    while !isempty(h)
        pop!(h)
    end
end
# 0.222954 seconds (14 allocations: 977.266 KiB)

If you put your code inside a function it might be even faster.