Idiomatic way to find extreme and the next (less) extreme unique values

pauljurczak · July 1, 2020, 2:10am

I have:

julia> a = [3, 1, 4, 1, Inf, Inf]
julia> extrema(a)
(1.0, Inf)

I would like to find the next pair of extreme values of array a , which are different from the original ones, i.e. laying inside (1.0, Inf) interval. In the case above it will be the (3, 4) tuple. What is the idiomatic way of expressing it? Sorting is fine, I don’t care about performance here.

pdeffebach · July 1, 2020, 3:12am

I think this should do it? Maybe not exactly.

julia> function narrow(x, n)
       xs = sort!(unique(x))
       return (xs[n], xs[length(xs) - n + 1])
       end

jling · July 1, 2020, 3:18am

or your can

e = extrema(a)
filter(!in(e), a) |> extrema

pauljurczak · July 1, 2020, 3:42am

Very nice, thank you. The specific use case I was looking for, is a variant of your solution:

filter(!in(Inf), a) |> extrema

or

filter(!in((-Inf, Inf)), a) |> extrema

baggepinnen · July 1, 2020, 5:38am

You also have partialsort but I’m not sure how efficient it is

julia> a = randn(10)
10-element Array{Float64,1}:
 -0.3445076168022343
  1.0346173940108812
 -1.0549856989794997
  0.42557277939095345
  0.41959768717730606
  0.33370566251639655
 -0.6417930608565745
 -0.2651253038034898
  1.738265093629218
  1.6703669085548287

julia> partialsort(a, 1:2)
2-element view(::Array{Float64,1}, 1:2) with eltype Float64:
 -1.0549856989794997
 -0.6417930608565745

julia> partialsort(a, 9:10)
2-element view(::Array{Float64,1}, 9:10) with eltype Float64:
 1.6703669085548287
 1.738265093629218

pauljurczak · July 1, 2020, 5:54am

Good to know, but this approach is a bit problematic due to uniqueness requirement. It would have to go like this:

b = unique(a)
partialsort(b, 1:2)
partialsort(b, length(b)-1:length(b))

assuming that length(b) > 1

GunnarFarneback · July 1, 2020, 5:58am

If performance were on the table, it would be hard to beat looping over the elements and keeping track of the two smallest and two largest values. For conciseness the filtering solution looks perfectly fine.

klaff · July 1, 2020, 6:09am

Haha, I was just trying this and the logic was surprisingly confusing to me, but the advantage is rather large:


function twoextremas(a)
    e = extrema(a)
    e2 = filter(!in(e), a) |> extrema
    return e,e2
end

function twoextremas_loop(a)
    mn,min2 = Inf,Inf
    mx,max2 = -Inf,-Inf
    for item ∈ a
        if item < mn
            min2 = mn
            mn = item
        elseif item < min2
            min2 = item
        end
        if item > mx
            max2 = mx
            mx = item
        elseif item > max2
            max2 = item
        end
    end
    return (mn,mx),(min2,max2)
end

a = randn(1_000_000)

@assert twoextremas(a) == twoextremas_loop(a)

@btime twoextremas($a)
@btime twoextremas_loop($a)

which gave an order of magnitude in performance:

15.287 ms (2 allocations: 7.63 MiB)
1.428 ms (0 allocations: 0 bytes)

[EDIT - as pointed out below, this function does not meet the desired specification]

pauljurczak · July 1, 2020, 6:21am

Timings on my PC are so close to 10x, they almost look fake

  13.558 ms (2 allocations: 7.63 MiB)
  1.358 ms (0 allocations: 0 bytes)

baggepinnen · July 1, 2020, 6:54am

Can make it quite a bit faster by moving the second if statement into the first

function twoextremas_loop2(a)
    mn,min2 = Inf,Inf
    mx,max2 = -Inf,-Inf
    for item ∈ a
        if item < mn
            min2 = mn
            mn = item
        elseif item < min2
            min2 = item
        else
            if item > mx
                max2 = mx
                mx = item
            elseif item > max2
                max2 = item
            end
        end
    end
    return (mn,mx),(min2,max2)
end


julia> @btime twoextremas($a)
  12.151 ms (2 allocations: 7.63 MiB)
((-4.722967184880966, 5.913947095464424), (-4.448884781343994, 4.541658686526455))

julia> @btime twoextremas_loop($a)
  1.469 ms (0 allocations: 0 bytes)
((-4.722967184880966, 5.913947095464424), (-4.448884781343994, 4.541658686526455))

julia> @btime twoextremas_loop2($a)
  992.839 μs (0 allocations: 0 bytes)
((-4.722967184880966, 5.913947095464424), (-4.448884781343994, 4.541658686526455))

pauljurczak · July 1, 2020, 10:21am

Small correction is needed for it to work correctly on arrays with repetitions, e.g. [3, 1, 4, 1, Inf, Inf]

function twoextremas_loop3(a)
  mn,min2 = Inf,Inf
  mx,max2 = -Inf,-Inf

  for item ∈ a
      if item < mn
          min2 = mn
          mn = item
      elseif mn < item < min2
          min2 = item
      elseif item > mx
          max2 = mx
          mx = item
      elseif max2 < item < mx
          max2 = item
      end
  end
  
  return (mn,mx),(min2,max2)
end

klaff · July 1, 2020, 1:26pm

That’s what I get for trying to do logic at 1 am. What I really missed was the part of the original post which states “which are different from the original ones”. Good catch.

pauljurczak · July 1, 2020, 9:36pm

A bit faster version by reordering if statements with most often taken ones on the top:

function twoextremas_loop4(a)
  mn,min2 = Inf,Inf
  mx,max2 = -Inf,-Inf

  for item ∈ a
    if mn < item < min2
      min2 = item
    elseif max2 < item < mx
      max2 = item
    elseif item < mn
      min2 = mn
      mn = item
    elseif item > mx
      max2 = mx
      mx = item
    end
  end
  
  return (mn,mx),(min2,max2)
end

I’m tempted to write a SIMD version as an educational exercise, but this may be a bridge too far…

klaff · July 2, 2020, 12:03am

Well, here’s a threaded version:


function twoextremas(a)
    e = extrema(a)
    e2 = filter(!in(e), a) |> extrema
    return e,e2
end

function twoextremas_loop3(a)
  mn,min2 = Inf,Inf
  mx,max2 = -Inf,-Inf
  for item ∈ a
      if item < mn
          min2 = mn
          mn = item
      elseif mn < item < min2
          min2 = item
      elseif item > mx
          max2 = mx
          mx = item
      elseif max2 < item < mx
          max2 = item
      end
  end
  return (mn,mx),(min2,max2)
end

function thread_twoextremas(a)
    N = Threads.nthreads()
    results = Array{Float64}(undef, 4*N)
    chunk = length(a)÷N
    Threads.@threads for i ∈ 1:N
        lowindex = length(a)÷N*(i-1)+1
        highindex = i==N ? length(a) : length(a)÷N*i
        x,y = twoextremas_loop3(view(a,lowindex:highindex))
        results[4*Threads.threadid()] = x[1] 
        results[4*Threads.threadid()-1] = x[2] 
        results[4*Threads.threadid()-2] = y[1] 
        results[4*Threads.threadid()-3] = y[2]
    end
    return twoextremas_loop3(results)
end 

a = randn(1_000_000)

@assert twoextremas(a) == twoextremas_loop(a)
@assert twoextremas(a) == thread_twoextremas(a)

@btime twoextremas($a)
@btime twoextremas_loop3($a)
@btime thread_twoextremas($a)

which yields

14.983 ms (2 allocations: 7.63 MiB)
1.642 ms (0 allocations: 0 bytes)
  382.900 μs (32 allocations: 5.38 KiB)

This is really crude but I’m getting 4.3x speedup from 6 cores, which doesn’t seem too bad.

Topic		Replies	Views
Perf improvement suggestion for simple extrema Performance proposal	4	644	February 18, 2020
Find n smallest values in an array New to Julia question	5	2931	June 27, 2020
How to find the index of the two largest values in a 1D array New to Julia question	8	460	May 25, 2023
Feature request: generalization of findmax/findmin to n values Internals	5	2601	September 17, 2022
What is Julia's maxk(Matlab) that returns the indice of top K largest values? General Usage sort , sortperm	20	11493	May 9, 2023

Idiomatic way to find extreme and the next (less) extreme unique values

Related topics