# Fast large binary heap for SNIC super-pixel

Hello there,
I’m looking for help in order to optimize an implementation of the SNIC algorithm (Achanta et al 2017) for pre-segmenting image in super-pixels/voxels. The algorithm use a binary heap to get quick access to the pixel/voxel that have the lowest distance to the centroid of the super-pixel/voxel. I first implement it using DataStructures but I noticed that the code was horribly slow due to a lot of allocation (around 1GB for a test on the Cameraman 512x512 image !). I also tried to implement a preallocated (as I know in advance that the heap will never contain more element than pixels/voxels in the image) heap structure using StaticArrays but even so I indeed have no allocation for small heap I still get a lot of allocation when the heap start to be large…

Here is the code for the preallocated binary heap and the MWE showing the problem of allocation when the heap get large:

``````using StaticArrays

# Preallocated Binary Heap and some utility function for it
mutable struct PreallocatedBinaryHeap{N, T, O <: Base.Ordering}
ordering::O
tree::MVector{N, T}
n::Integer # index of the next entry
buf::T # buffer to avoid allocation

function PreallocatedBinaryHeap{N, T}(o::Base.Ordering) where {N, T}
@assert N > 0 "Preallocated length should be > 0"
new{N, T, typeof(o)}(o, MVector{N, T}(undef), 1, Vector{T}(undef, 1)[1])
end

function PreallocatedBinaryHeap{N, T}(o::Base.Ordering, v::AbstractVector) where {N, T}
n_el::Integer = length(v)
@assert N >= n_el "Preallocated length should be greater than vector length"
tree = MVector{N, T}(undef)
tree[1:n_el] .= heapify(v, o)
new{N, T, typeof(o)}(o, tree, n_el + 1, Vector{T}(undef, 1)[1])
end
end

PreallocatedBinaryHeap(o::Base.Ordering, v::AbstractVector{T}, N::Integer) where T = PreallocatedBinaryHeap{N, T}(o, v)

Base.length(h::PreallocatedBinaryHeap) = h.n - 1
Base.isempty(h::PreallocatedBinaryHeap) = h.n == 1
function isfull(h::PreallocatedBinaryHeap{N, T, O}) where {N, T, O}
return h.n > N
end

Base.getindex(h::PreallocatedBinaryHeap, i) = begin
@assert (i <= length(h)) && (i > 0) "Index out of bounds"
getindex(h.tree, i)
end
Base.iterate(h::PreallocatedBinaryHeap) = (h[1], 2)
Base.iterate(h::PreallocatedBinaryHeap, state) = (state <= length(h) ? (h[state], state + 1) : nothing)
Base.lastindex(h::PreallocatedBinaryHeap) = length(h)

Base.push!(h::PreallocatedBinaryHeap{N, T, O}, v::T) where {N, T, O} = begin
@assert !isfull(h) "The heap is full"
h.tree[h.n] = v
h.n += 1
percUp!(h, length(h))
end
Base.pop!(h::PreallocatedBinaryHeap) = begin
@assert !isempty(h) "The heap is empty"
h.buf = h.tree[1]
h.tree[1] = h.tree[length(h)]
h.n -= 1
percDown!(h, 1)
return h.buf
end

function percUp!(h::PreallocatedBinaryHeap, i::Integer)
parent = i >>> 1
@inbounds while parent > 0
# Compare to the parent and swap if needed
if Base.lt(h.ordering, h.tree[i], h.tree[parent])
h.buf = h.tree[parent]
#tmp = h.tree[parent]
h.tree[parent] = h.tree[i]
h.tree[i] = h.buf #tmp
end
i = parent
parent >>>= 1
end
end

function percUp!(v::AbstractVector, i::Integer, o::Base.Ordering)
parent = i >>> 1
@inbounds while parent > 0
# Compare to the parent and swap if needed
if Base.lt(o, v[i], v[parent])
tmp = v[parent]
v[parent] = v[i]
v[i] = tmp
end
i = parent
parent >>>= 1
end
end

function percDown!(h::PreallocatedBinaryHeap, i::Integer)
heapsize = length(h)
@inbounds while (2 * i) <= heapsize
# find the smallest child according to the ordering
if (2 * i + 1) > heapsize
sc = 2 * i
elseif Base.lt(h.ordering, h.tree[2 * i], h.tree[2 * i + 1])
sc = 2 * i
else
sc = 2 * i + 1
end

# swap if needed
if Base.lt(h.ordering, h.tree[sc], h.tree[i])
h.buf = h.tree[i]
#tmp = h.tree[i]
h.tree[i] = h.tree[sc]
h.tree[sc] = h.buf #tmp
end
i = sc
end
end

function percDown!(v::AbstractVector, i::Integer, o::Base.Ordering)
vectsize = length(v)
@inbounds while (2 * i) <= vectsize
# find the smallest child according to the ordering
if (2 * i + 1) > vectsize
sc = 2 * i
elseif Base.lt(o, v[2 * i], v[2 * i + 1])
sc = 2 * i
else
sc = 2 * i + 1
end

# swap if needed
if Base.lt(o, v[sc], v[i])
tmp = v[i]
v[i] = v[sc]
v[sc] = tmp
end
i = sc
end
end

function heapify!(v::AbstractVector, o::Base.Ordering)
for i in div(length(v), 2):-1:1
percDown!(v, i, o)
end
end

heapify(v::AbstractVector, o::Base.Ordering) = begin
v_copy = copyto!(similar(v), v)
heapify!(v_copy, o)
return v_copy
end
``````

And the MWE:

``````@time begin
# with a small heap
h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 200)
while !isfull(h)
push!(h, rand(UInt8))
end
while !isempty(h)
pop!(h)
end
end
# after 1 execution to avoid compilation time
# 0.000333 seconds (12 allocations: 912 bytes)
# which correspond to the allocation needed to define the heap structure
@time h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 200)
# 0.000026 seconds (12 allocations: 912 bytes)

@time begin
# with a large heap
h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 1_000_000)
while !isfull(h)
push!(h, rand(UInt8))
end
while !isempty(h)
pop!(h)
end
end
# 2.434170 seconds (26.93 M allocations: 411.947 MiB, 2.10% gc time)
# larger than the required allocation for the heap
@time h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 1_000_000)
# 0.000039 seconds (13 allocations: 977.328 KiB)
``````

My guess would be that it have something to do with memory alignment but may be I’m wrong?! I wanted to know if there is a possibility to have this heap structure to be much faster with size that are in the order of the million of element (which a common size for images)?

Actualy I was able to find the answer to my own question: it appears that I just need to tell Julia that the heap size will not change by defining it as a `struct` and not a `mutable struct` ! Doing so the gain of speed is consequent. I post here the code:
Few changes on the heap type definition:

``````struct PreallocatedBinaryHeap{N, T, O <: Base.Ordering}
ordering::O
tree::MVector{N, T}
n::MVector{1, Int}
buf::MVector{1, T}
#n::Integer # index of the next entry
#buf::T # buffer to avoid allocation

function PreallocatedBinaryHeap{N, T}(o::Base.Ordering) where {N, T}
@assert N > 0 "Preallocated length should be > 0"
new{N, T, typeof(o)}(o, MVector{N, T}(undef), ones(MVector{1, Int}), MVector{1, T}(undef))
#new{N, T, typeof(o)}(o, MVector{N, T}(undef), 1, Vector{T}(undef, 1)[1])
end

function PreallocatedBinaryHeap{N, T}(o::Base.Ordering, v::AbstractVector) where {N, T}
n_el::Integer = length(v)
@assert N >= n_el "Preallocated length should be greater than vector length"
tree = MVector{N, T}(undef)
tree[1:n_el] .= heapify(v, o)
new{N, T, typeof(o)}(o, tree, MVector{1, Int}(n_el + 1), MVector{1, T}(undef))
#new{N, T, typeof(o)}(o, tree, n_el + 1, Vector{T}(undef, 1)[1])
end
end

PreallocatedBinaryHeap(o::Base.Ordering, v::AbstractVector{T}, N::Integer) where T = PreallocatedBinaryHeap{N, T}(o, v)

Base.length(h::PreallocatedBinaryHeap) = h.n[1] - 1 #h.n - 1
Base.isempty(h::PreallocatedBinaryHeap) = h.n[1] == 1 #h.n == 1
function isfull(h::PreallocatedBinaryHeap{N, T, O}) where {N, T, O}
return h.n[1] > N #h.n > N
end

Base.getindex(h::PreallocatedBinaryHeap, i) = begin
@assert (i <= length(h)) && (i > 0) "Index out of bounds"
getindex(h.tree, i)
end
Base.iterate(h::PreallocatedBinaryHeap) = (h[1], 2)
Base.iterate(h::PreallocatedBinaryHeap, state) = (state <= length(h) ? (h[state], state + 1) : nothing)
Base.lastindex(h::PreallocatedBinaryHeap) = length(h)

Base.push!(h::PreallocatedBinaryHeap{N, T, O}, v::T) where {N, T, O} = begin
@assert !isfull(h) "The heap is full"
h.tree[h.n[1]] = v #h.tree[h.n] = v
h.n[1] += 1 #h.n += 1
percUp!(h, length(h))
end
Base.pop!(h::PreallocatedBinaryHeap) = begin
@assert !isempty(h) "The heap is empty"
h.buf[1] = h.tree[1] #h.buf = h.tree[1]
h.tree[1] = h.tree[length(h)]
h.n[1] -= 1 #h.n -= 1
percDown!(h, 1)
return h.buf[1] #h.buf
end

function percUp!(h::PreallocatedBinaryHeap, i::Integer)
parent = i >>> 1
@inbounds while parent > 0
# Compare to the parent and swap if needed
if Base.lt(h.ordering, h.tree[i], h.tree[parent])
h.buf[1] = h.tree[parent] #h.buf = h.tree[parent]
#tmp = h.tree[parent]
h.tree[parent] = h.tree[i]
h.tree[i] = h.buf[1] #h.buf #tmp
end
i = parent
parent >>>= 1
end
end

function percUp!(v::AbstractVector, i::Integer, o::Base.Ordering)
parent = i >>> 1
@inbounds while parent > 0
# Compare to the parent and swap if needed
if Base.lt(o, v[i], v[parent])
tmp = v[parent]
v[parent] = v[i]
v[i] = tmp
end
i = parent
parent >>>= 1
end
end

function percDown!(h::PreallocatedBinaryHeap, i::Integer)
heapsize = length(h)
@inbounds while (2 * i) <= heapsize
# find the smallest child according to the ordering
if (2 * i + 1) > heapsize
sc = 2 * i
elseif Base.lt(h.ordering, h.tree[2 * i], h.tree[2 * i + 1])
sc = 2 * i
else
sc = 2 * i + 1
end

# swap if needed
if Base.lt(h.ordering, h.tree[sc], h.tree[i])
h.buf[1] = h.tree[i] #h.buf = h.tree[i]
#tmp = h.tree[i]
h.tree[i] = h.tree[sc]
h.tree[sc] = h.buf[1] #h.buf #tmp
end
i = sc
end
end

function percDown!(v::AbstractVector, i::Integer, o::Base.Ordering)
vectsize = length(v)
@inbounds while (2 * i) <= vectsize
# find the smallest child according to the ordering
if (2 * i + 1) > vectsize
sc = 2 * i
elseif Base.lt(o, v[2 * i], v[2 * i + 1])
sc = 2 * i
else
sc = 2 * i + 1
end

# swap if needed
if Base.lt(o, v[sc], v[i])
tmp = v[i]
v[i] = v[sc]
v[sc] = tmp
end
i = sc
end
end

function heapify!(v::AbstractVector, o::Base.Ordering)
for i in div(length(v), 2):-1:1
percDown!(v, i, o)
end
end

heapify(v::AbstractVector, o::Base.Ordering) = begin
v_copy = copyto!(similar(v), v)
heapify!(v_copy, o)
return v_copy
end
``````

And the benchmark:

``````@time begin
h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 200)
while !isfull(h)
push!(h, rand(UInt8))
end
while !isempty(h)
pop!(h)
end
end
# 0.000147 seconds (13 allocations: 848 bytes)

@time begin
h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 1_000_000)
while !isfull(h)
push!(h, rand(UInt8))
end
while !isempty(h)
pop!(h)
end
end
# 0.222954 seconds (14 allocations: 977.266 KiB)
``````

If you put your code inside a function it might be even faster.