Hello there,
I’m looking for help in order to optimize an implementation of the SNIC algorithm (Achanta et al 2017) for pre-segmenting image in super-pixels/voxels. The algorithm use a binary heap to get quick access to the pixel/voxel that have the lowest distance to the centroid of the super-pixel/voxel. I first implement it using DataStructures but I noticed that the code was horribly slow due to a lot of allocation (around 1GB for a test on the Cameraman 512x512 image !). I also tried to implement a preallocated (as I know in advance that the heap will never contain more element than pixels/voxels in the image) heap structure using StaticArrays but even so I indeed have no allocation for small heap I still get a lot of allocation when the heap start to be large…
Here is the code for the preallocated binary heap and the MWE showing the problem of allocation when the heap get large:
using StaticArrays
# Preallocated Binary Heap and some utility function for it
mutable struct PreallocatedBinaryHeap{N, T, O <: Base.Ordering}
ordering::O
tree::MVector{N, T}
n::Integer # index of the next entry
buf::T # buffer to avoid allocation
function PreallocatedBinaryHeap{N, T}(o::Base.Ordering) where {N, T}
@assert N > 0 "Preallocated length should be > 0"
new{N, T, typeof(o)}(o, MVector{N, T}(undef), 1, Vector{T}(undef, 1)[1])
end
function PreallocatedBinaryHeap{N, T}(o::Base.Ordering, v::AbstractVector) where {N, T}
n_el::Integer = length(v)
@assert N >= n_el "Preallocated length should be greater than vector length"
tree = MVector{N, T}(undef)
tree[1:n_el] .= heapify(v, o)
new{N, T, typeof(o)}(o, tree, n_el + 1, Vector{T}(undef, 1)[1])
end
end
PreallocatedBinaryHeap(o::Base.Ordering, v::AbstractVector{T}, N::Integer) where T = PreallocatedBinaryHeap{N, T}(o, v)
Base.length(h::PreallocatedBinaryHeap) = h.n - 1
Base.isempty(h::PreallocatedBinaryHeap) = h.n == 1
function isfull(h::PreallocatedBinaryHeap{N, T, O}) where {N, T, O}
return h.n > N
end
Base.getindex(h::PreallocatedBinaryHeap, i) = begin
@assert (i <= length(h)) && (i > 0) "Index out of bounds"
getindex(h.tree, i)
end
Base.iterate(h::PreallocatedBinaryHeap) = (h[1], 2)
Base.iterate(h::PreallocatedBinaryHeap, state) = (state <= length(h) ? (h[state], state + 1) : nothing)
Base.lastindex(h::PreallocatedBinaryHeap) = length(h)
Base.push!(h::PreallocatedBinaryHeap{N, T, O}, v::T) where {N, T, O} = begin
@assert !isfull(h) "The heap is full"
h.tree[h.n] = v
h.n += 1
percUp!(h, length(h))
end
Base.pop!(h::PreallocatedBinaryHeap) = begin
@assert !isempty(h) "The heap is empty"
h.buf = h.tree[1]
h.tree[1] = h.tree[length(h)]
h.n -= 1
percDown!(h, 1)
return h.buf
end
function percUp!(h::PreallocatedBinaryHeap, i::Integer)
parent = i >>> 1
@inbounds while parent > 0
# Compare to the parent and swap if needed
if Base.lt(h.ordering, h.tree[i], h.tree[parent])
h.buf = h.tree[parent]
#tmp = h.tree[parent]
h.tree[parent] = h.tree[i]
h.tree[i] = h.buf #tmp
end
i = parent
parent >>>= 1
end
end
function percUp!(v::AbstractVector, i::Integer, o::Base.Ordering)
parent = i >>> 1
@inbounds while parent > 0
# Compare to the parent and swap if needed
if Base.lt(o, v[i], v[parent])
tmp = v[parent]
v[parent] = v[i]
v[i] = tmp
end
i = parent
parent >>>= 1
end
end
function percDown!(h::PreallocatedBinaryHeap, i::Integer)
heapsize = length(h)
@inbounds while (2 * i) <= heapsize
# find the smallest child according to the ordering
if (2 * i + 1) > heapsize
sc = 2 * i
elseif Base.lt(h.ordering, h.tree[2 * i], h.tree[2 * i + 1])
sc = 2 * i
else
sc = 2 * i + 1
end
# swap if needed
if Base.lt(h.ordering, h.tree[sc], h.tree[i])
h.buf = h.tree[i]
#tmp = h.tree[i]
h.tree[i] = h.tree[sc]
h.tree[sc] = h.buf #tmp
end
i = sc
end
end
function percDown!(v::AbstractVector, i::Integer, o::Base.Ordering)
vectsize = length(v)
@inbounds while (2 * i) <= vectsize
# find the smallest child according to the ordering
if (2 * i + 1) > vectsize
sc = 2 * i
elseif Base.lt(o, v[2 * i], v[2 * i + 1])
sc = 2 * i
else
sc = 2 * i + 1
end
# swap if needed
if Base.lt(o, v[sc], v[i])
tmp = v[i]
v[i] = v[sc]
v[sc] = tmp
end
i = sc
end
end
function heapify!(v::AbstractVector, o::Base.Ordering)
for i in div(length(v), 2):-1:1
percDown!(v, i, o)
end
end
heapify(v::AbstractVector, o::Base.Ordering) = begin
v_copy = copyto!(similar(v), v)
heapify!(v_copy, o)
return v_copy
end
And the MWE:
@time begin
# with a small heap
h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 200)
while !isfull(h)
push!(h, rand(UInt8))
end
while !isempty(h)
pop!(h)
end
end
# after 1 execution to avoid compilation time
# 0.000333 seconds (12 allocations: 912 bytes)
# which correspond to the allocation needed to define the heap structure
@time h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 200)
# 0.000026 seconds (12 allocations: 912 bytes)
@time begin
# with a large heap
h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 1_000_000)
while !isfull(h)
push!(h, rand(UInt8))
end
while !isempty(h)
pop!(h)
end
end
# 2.434170 seconds (26.93 M allocations: 411.947 MiB, 2.10% gc time)
# larger than the required allocation for the heap
@time h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 1_000_000)
# 0.000039 seconds (13 allocations: 977.328 KiB)
My guess would be that it have something to do with memory alignment but may be I’m wrong?! I wanted to know if there is a possibility to have this heap structure to be much faster with size that are in the order of the million of element (which a common size for images)?