Yes, looking at previous discussions it seems like Matlab’s sort functions are multi-threaded and heavily optimized, so they will be tough to beat. I did a naive threaded implementation here, which starts to have some gains over single-threaded for large a
:
julia> using Base.Threads
julia> nthreads()
2
julia> function maxk_threaded(a, k)
ix = Vector{Int}(undef, k*nthreads())
block_size = ceil(Int, length(a)/nthreads())
@threads for thread_id in 1:nthreads()
ix_start = (thread_id-1)*block_size
ix_end = min(length(a), thread_id*block_size)
ix[((thread_id-1)*k+1):(thread_id*k)] = ix_start .+ partialsortperm(@view(a[(1+ix_start):ix_end]), 1:k, rev=true)
end
partialsortperm!(ix, a, 1:k, rev=true, initialized=true)
@views collect(zip(ix[1:k], a[ix[1:k]]))
end
maxk_threaded (generic function with 1 method)
julia> a = randn(10000);
julia> @btime maxk($a, 10)
70.201 μs (9 allocations: 78.73 KiB)
10-element Array{Tuple{Int64,Float64},1}:
(3840, 3.7524106800393)
(1359, 3.667162944745407)
(4738, 3.46128657912246)
(8532, 3.349067643815953)
(8314, 3.3363898988561234)
(3542, 3.3297030468239965)
(1159, 3.2795246783768923)
(9436, 3.259918244413647)
(9418, 3.254388944717796)
(2198, 3.155524296051535)
julia> @btime maxk_threaded($a, 10)
53.894 μs (23 allocations: 1.58 KiB)
10-element Array{Tuple{Int64,Float64},1}:
(3840, 3.7524106800393)
(1359, 3.667162944745407)
(4738, 3.46128657912246)
(8532, 3.349067643815953)
(8314, 3.3363898988561234)
(3542, 3.3297030468239965)
(1159, 3.2795246783768923)
(9436, 3.259918244413647)
(9418, 3.254388944717796)
(2198, 3.155524296051535)