The quasi-best MaxN function

This implementation, using a heap, seems to be much faster than the other ones posted here for both larger and smaller n. (Has a bit of extra complexity to handle type-unstable iterators, and to provide an in-place variant…I wanted to make it a good starting point for use in a future library.)

using DataStructures

"""
    heapmaxn!(itr, biggest::AbstractVector, n=length(biggest))

Modify `biggest` in-place to an array of the `n` biggest elements of
the union of `itr` and `biggest`, returning the new `biggest` array.
"""
function heapmaxn!(itr, biggest::AbstractVector{T}, n::Integer=length(biggest),::Val{realloc}=Val{false}()) where {T, realloc}
    n < 0 && throw(ArgumentError("$n should be nonnegative"))
    n == 0 && return empty!(biggest)
    sizehint!(biggest, n)
    heapify!(biggest)
    while length(biggest) > n
        heappop!(biggest)
    end
    i = iterate(itr)
    @inbounds while i !== nothing
        v = i[1]
        if length(biggest) < n
            heappush!(biggest, v)
        elseif isless(biggest[1], v)
            heappop!(biggest)
            if realloc && !(v isa T) # handle type-unstable iterator
                newbiggest = copyto!(similar(biggest, typejoin(typeof(v),T)), biggest)
                return heapmaxn!(Iterators.rest(itr, i[2]), heappush!(newbiggest, v), n)
            end
            heappush!(biggest, v)
        end
        i = iterate(itr, i[2])
    end
    return biggest
end

"""
    heapmaxn(itr, n)

Return an array of the `n` biggest elements of `itr`.
"""
function heapmaxn(itr, n::Integer)
    if Base.IteratorEltype(itr) isa Base.HasEltype
        return heapmaxn!(itr, empty!(Vector{eltype(itr)}(undef, n)), n)
    else
        @show i = iterate(itr)
        i === nothing && throw(ArgumentError("unknown eltype for empty collection"))
        n == 0 && return typeof(i[1])[]
        return heapmaxn!(Iterators.rest(itr, i[2]), [i[1]], n, Val{true}())
    end
end

giving (for re6=rand(1:10^8, 10^6) as above):

julia> @btime partialsort($re6, 1:5, rev=true);
       @btime MaxN4($re6,5);
       @btime heapmaxn($re6,5);
  11.785 ms (4 allocations: 7.63 MiB)
  4.660 ms (5 allocations: 7.63 MiB)
  1.504 ms (1 allocation: 128 bytes)

julia> @btime partialsort($re6, 1:15, rev=true);
       @btime MaxN4($re6,15);
       @btime heapmaxn($re6,15);
  11.334 ms (4 allocations: 7.63 MiB)
  15.821 ms (6 allocations: 7.63 MiB)
  1.512 ms (1 allocation: 208 bytes)

julia> @btime partialsort($re6, 1:500, rev=true);
       @btime MaxN4($re6,500);
       @btime heapmaxn($re6,500);
  11.486 ms (4 allocations: 7.63 MiB)
  556.260 ms (11 allocations: 7.64 MiB)
  1.775 ms (1 allocation: 4.06 KiB)

(Though heapmaxn is slower in the worst case where the inputs are sorted in ascending order. Of course, if you know that your inputs are mostly sorted, you could run heapmaxn(Iterators.reverse(itr), n) instead.)

(It’s even faster in Julia 1.6, apparently thanks to some compiler and/or inference improvements.)

5 Likes