Idiomatic way to find extreme and the next (less) extreme unique values

I have:

julia> a = [3, 1, 4, 1, Inf, Inf]
julia> extrema(a)
(1.0, Inf)

I would like to find the next pair of extreme values of array a , which are different from the original ones, i.e. laying inside (1.0, Inf) interval. In the case above it will be the (3, 4) tuple. What is the idiomatic way of expressing it? Sorting is fine, I don’t care about performance here.

I think this should do it? Maybe not exactly.

julia> function narrow(x, n)
       xs = sort!(unique(x))
       return (xs[n], xs[length(xs) - n + 1])
       end
1 Like

or your can

e = extrema(a)
filter(!in(e), a) |> extrema
1 Like

Very nice, thank you. The specific use case I was looking for, is a variant of your solution:

filter(!in(Inf), a) |> extrema

or

filter(!in((-Inf, Inf)), a) |> extrema

You also have partialsort but I’m not sure how efficient it is

julia> a = randn(10)
10-element Array{Float64,1}:
 -0.3445076168022343
  1.0346173940108812
 -1.0549856989794997
  0.42557277939095345
  0.41959768717730606
  0.33370566251639655
 -0.6417930608565745
 -0.2651253038034898
  1.738265093629218
  1.6703669085548287

julia> partialsort(a, 1:2)
2-element view(::Array{Float64,1}, 1:2) with eltype Float64:
 -1.0549856989794997
 -0.6417930608565745

julia> partialsort(a, 9:10)
2-element view(::Array{Float64,1}, 9:10) with eltype Float64:
 1.6703669085548287
 1.738265093629218
1 Like

Good to know, but this approach is a bit problematic due to uniqueness requirement. It would have to go like this:

b = unique(a)
partialsort(b, 1:2)
partialsort(b, length(b)-1:length(b))

assuming that length(b) > 1

If performance were on the table, it would be hard to beat looping over the elements and keeping track of the two smallest and two largest values. For conciseness the filtering solution looks perfectly fine.

2 Likes

Haha, I was just trying this and the logic was surprisingly confusing to me, but the advantage is rather large:


function twoextremas(a)
    e = extrema(a)
    e2 = filter(!in(e), a) |> extrema
    return e,e2
end

function twoextremas_loop(a)
    mn,min2 = Inf,Inf
    mx,max2 = -Inf,-Inf
    for item ∈ a
        if item < mn
            min2 = mn
            mn = item
        elseif item < min2
            min2 = item
        end
        if item > mx
            max2 = mx
            mx = item
        elseif item > max2
            max2 = item
        end
    end
    return (mn,mx),(min2,max2)
end

a = randn(1_000_000)

@assert twoextremas(a) == twoextremas_loop(a)

@btime twoextremas($a)
@btime twoextremas_loop($a)

which gave an order of magnitude in performance:

15.287 ms (2 allocations: 7.63 MiB)
1.428 ms (0 allocations: 0 bytes)

[EDIT - as pointed out below, this function does not meet the desired specification]
2 Likes

Timings on my PC are so close to 10x, they almost look fake :wink:

  13.558 ms (2 allocations: 7.63 MiB)
  1.358 ms (0 allocations: 0 bytes)

Can make it quite a bit faster by moving the second if statement into the first

function twoextremas_loop2(a)
    mn,min2 = Inf,Inf
    mx,max2 = -Inf,-Inf
    for item ∈ a
        if item < mn
            min2 = mn
            mn = item
        elseif item < min2
            min2 = item
        else
            if item > mx
                max2 = mx
                mx = item
            elseif item > max2
                max2 = item
            end
        end
    end
    return (mn,mx),(min2,max2)
end


julia> @btime twoextremas($a)
  12.151 ms (2 allocations: 7.63 MiB)
((-4.722967184880966, 5.913947095464424), (-4.448884781343994, 4.541658686526455))

julia> @btime twoextremas_loop($a)
  1.469 ms (0 allocations: 0 bytes)
((-4.722967184880966, 5.913947095464424), (-4.448884781343994, 4.541658686526455))

julia> @btime twoextremas_loop2($a)
  992.839 μs (0 allocations: 0 bytes)
((-4.722967184880966, 5.913947095464424), (-4.448884781343994, 4.541658686526455))
3 Likes

Small correction is needed for it to work correctly on arrays with repetitions, e.g. [3, 1, 4, 1, Inf, Inf]

function twoextremas_loop3(a)
  mn,min2 = Inf,Inf
  mx,max2 = -Inf,-Inf

  for item ∈ a
      if item < mn
          min2 = mn
          mn = item
      elseif mn < item < min2
          min2 = item
      elseif item > mx
          max2 = mx
          mx = item
      elseif max2 < item < mx
          max2 = item
      end
  end
  
  return (mn,mx),(min2,max2)
end
1 Like

That’s what I get for trying to do logic at 1 am. What I really missed was the part of the original post which states “which are different from the original ones”. Good catch.

A bit faster version by reordering if statements with most often taken ones on the top:

function twoextremas_loop4(a)
  mn,min2 = Inf,Inf
  mx,max2 = -Inf,-Inf

  for item ∈ a
    if mn < item < min2
      min2 = item
    elseif max2 < item < mx
      max2 = item
    elseif item < mn
      min2 = mn
      mn = item
    elseif item > mx
      max2 = mx
      mx = item
    end
  end
  
  return (mn,mx),(min2,max2)
end

I’m tempted to write a SIMD version as an educational exercise, but this may be a bridge too far…

Well, here’s a threaded version:


function twoextremas(a)
    e = extrema(a)
    e2 = filter(!in(e), a) |> extrema
    return e,e2
end

function twoextremas_loop3(a)
  mn,min2 = Inf,Inf
  mx,max2 = -Inf,-Inf
  for item ∈ a
      if item < mn
          min2 = mn
          mn = item
      elseif mn < item < min2
          min2 = item
      elseif item > mx
          max2 = mx
          mx = item
      elseif max2 < item < mx
          max2 = item
      end
  end
  return (mn,mx),(min2,max2)
end

function thread_twoextremas(a)
    N = Threads.nthreads()
    results = Array{Float64}(undef, 4*N)
    chunk = length(a)÷N
    Threads.@threads for i ∈ 1:N
        lowindex = length(a)÷N*(i-1)+1
        highindex = i==N ? length(a) : length(a)÷N*i
        x,y = twoextremas_loop3(view(a,lowindex:highindex))
        results[4*Threads.threadid()] = x[1] 
        results[4*Threads.threadid()-1] = x[2] 
        results[4*Threads.threadid()-2] = y[1] 
        results[4*Threads.threadid()-3] = y[2]
    end
    return twoextremas_loop3(results)
end 

a = randn(1_000_000)

@assert twoextremas(a) == twoextremas_loop(a)
@assert twoextremas(a) == thread_twoextremas(a)

@btime twoextremas($a)
@btime twoextremas_loop3($a)
@btime thread_twoextremas($a)

which yields

14.983 ms (2 allocations: 7.63 MiB)
1.642 ms (0 allocations: 0 bytes)
  382.900 μs (32 allocations: 5.38 KiB)

This is really crude but I’m getting 4.3x speedup from 6 cores, which doesn’t seem too bad.

3 Likes