Fast large binary heap for SNIC super-pixel

Actualy I was able to find the answer to my own question: it appears that I just need to tell Julia that the heap size will not change by defining it as a struct and not a mutable struct ! Doing so the gain of speed is consequent. I post here the code:
Few changes on the heap type definition:

struct PreallocatedBinaryHeap{N, T, O <: Base.Ordering}
    ordering::O
    tree::MVector{N, T}
    n::MVector{1, Int}
    buf::MVector{1, T}
    #n::Integer # index of the next entry
    #buf::T # buffer to avoid allocation

    function PreallocatedBinaryHeap{N, T}(o::Base.Ordering) where {N, T}
        @assert N > 0 "Preallocated length should be > 0"
        new{N, T, typeof(o)}(o, MVector{N, T}(undef), ones(MVector{1, Int}), MVector{1, T}(undef))
        #new{N, T, typeof(o)}(o, MVector{N, T}(undef), 1, Vector{T}(undef, 1)[1])
    end

    function PreallocatedBinaryHeap{N, T}(o::Base.Ordering, v::AbstractVector) where {N, T}
        n_el::Integer = length(v)
        @assert N >= n_el "Preallocated length should be greater than vector length"
        tree = MVector{N, T}(undef)
        tree[1:n_el] .= heapify(v, o)
        new{N, T, typeof(o)}(o, tree, MVector{1, Int}(n_el + 1), MVector{1, T}(undef))
        #new{N, T, typeof(o)}(o, tree, n_el + 1, Vector{T}(undef, 1)[1])
    end
end

PreallocatedBinaryHeap(o::Base.Ordering, v::AbstractVector{T}, N::Integer) where T = PreallocatedBinaryHeap{N, T}(o, v)

Base.length(h::PreallocatedBinaryHeap) = h.n[1] - 1 #h.n - 1
Base.isempty(h::PreallocatedBinaryHeap) = h.n[1] == 1 #h.n == 1
function isfull(h::PreallocatedBinaryHeap{N, T, O}) where {N, T, O}
    return h.n[1] > N #h.n > N
end

Base.getindex(h::PreallocatedBinaryHeap, i) = begin
    @assert (i <= length(h)) && (i > 0) "Index out of bounds"
    getindex(h.tree, i)
end
Base.iterate(h::PreallocatedBinaryHeap) = (h[1], 2)
Base.iterate(h::PreallocatedBinaryHeap, state) = (state <= length(h) ? (h[state], state + 1) : nothing)
Base.lastindex(h::PreallocatedBinaryHeap) = length(h)

Base.push!(h::PreallocatedBinaryHeap{N, T, O}, v::T) where {N, T, O} = begin
    @assert !isfull(h) "The heap is full"
    h.tree[h.n[1]] = v #h.tree[h.n] = v
    h.n[1] += 1 #h.n += 1
    percUp!(h, length(h))
end
Base.pop!(h::PreallocatedBinaryHeap) = begin
    @assert !isempty(h) "The heap is empty"
    h.buf[1] = h.tree[1] #h.buf = h.tree[1]
    h.tree[1] = h.tree[length(h)]
    h.n[1] -= 1 #h.n -= 1
    percDown!(h, 1)
    return h.buf[1] #h.buf
end

function percUp!(h::PreallocatedBinaryHeap, i::Integer)
    parent = i >>> 1
    @inbounds while parent > 0
        # Compare to the parent and swap if needed
        if Base.lt(h.ordering, h.tree[i], h.tree[parent])
            h.buf[1] = h.tree[parent] #h.buf = h.tree[parent]
            #tmp = h.tree[parent]
            h.tree[parent] = h.tree[i]
            h.tree[i] = h.buf[1] #h.buf #tmp
        end
        i = parent
        parent >>>= 1
    end
end

function percUp!(v::AbstractVector, i::Integer, o::Base.Ordering)
    parent = i >>> 1
    @inbounds while parent > 0
        # Compare to the parent and swap if needed
        if Base.lt(o, v[i], v[parent])
            tmp = v[parent]
            v[parent] = v[i]
            v[i] = tmp
        end
        i = parent
        parent >>>= 1
    end
end

function percDown!(h::PreallocatedBinaryHeap, i::Integer)
    heapsize = length(h)
    @inbounds while (2 * i) <= heapsize
        # find the smallest child according to the ordering
        if (2 * i + 1) > heapsize
            sc = 2 * i
        elseif Base.lt(h.ordering, h.tree[2 * i], h.tree[2 * i + 1])
            sc = 2 * i
        else
            sc = 2 * i + 1
        end

        # swap if needed
        if Base.lt(h.ordering, h.tree[sc], h.tree[i])
            h.buf[1] = h.tree[i] #h.buf = h.tree[i]
            #tmp = h.tree[i]
            h.tree[i] = h.tree[sc]
            h.tree[sc] = h.buf[1] #h.buf #tmp
        end
        i = sc
    end
end

function percDown!(v::AbstractVector, i::Integer, o::Base.Ordering)
    vectsize = length(v)
    @inbounds while (2 * i) <= vectsize
        # find the smallest child according to the ordering
        if (2 * i + 1) > vectsize
            sc = 2 * i
        elseif Base.lt(o, v[2 * i], v[2 * i + 1])
            sc = 2 * i
        else
            sc = 2 * i + 1
        end

        # swap if needed
        if Base.lt(o, v[sc], v[i])
            tmp = v[i]
            v[i] = v[sc]
            v[sc] = tmp
        end
        i = sc
    end
end

function heapify!(v::AbstractVector, o::Base.Ordering)
    for i in div(length(v), 2):-1:1
        percDown!(v, i, o)
    end
end

heapify(v::AbstractVector, o::Base.Ordering) = begin
    v_copy = copyto!(similar(v), v)
    heapify!(v_copy, o)
    return v_copy
end

And the benchmark:

@time begin
    h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 200)
    while !isfull(h)
        push!(h, rand(UInt8))
    end
    while !isempty(h)
        pop!(h)
    end
end
# 0.000147 seconds (13 allocations: 848 bytes)

@time begin
    h = PreallocatedBinaryHeap(Base.Forward, rand(UInt8, 10), 1_000_000)
    while !isfull(h)
        push!(h, rand(UInt8))
    end
    while !isempty(h)
        pop!(h)
    end
end
# 0.222954 seconds (14 allocations: 977.266 KiB)