I have:
julia> a = [3, 1, 4, 1, Inf, Inf]
julia> extrema(a)
(1.0, Inf)
I would like to find the next pair of extreme values of array a
, which are different from the original ones, i.e. laying inside (1.0, Inf)
interval. In the case above it will be the (3, 4)
tuple. What is the idiomatic way of expressing it? Sorting is fine, I don’t care about performance here.
I think this should do it? Maybe not exactly.
julia> function narrow(x, n)
xs = sort!(unique(x))
return (xs[n], xs[length(xs) - n + 1])
end
1 Like
jling
July 1, 2020, 3:18am
3
or your can
e = extrema(a)
filter(!in(e), a) |> extrema
1 Like
Very nice, thank you. The specific use case I was looking for, is a variant of your solution:
filter(!in(Inf), a) |> extrema
or
filter(!in((-Inf, Inf)), a) |> extrema
You also have partialsort
but I’m not sure how efficient it is
julia> a = randn(10)
10-element Array{Float64,1}:
-0.3445076168022343
1.0346173940108812
-1.0549856989794997
0.42557277939095345
0.41959768717730606
0.33370566251639655
-0.6417930608565745
-0.2651253038034898
1.738265093629218
1.6703669085548287
julia> partialsort(a, 1:2)
2-element view(::Array{Float64,1}, 1:2) with eltype Float64:
-1.0549856989794997
-0.6417930608565745
julia> partialsort(a, 9:10)
2-element view(::Array{Float64,1}, 9:10) with eltype Float64:
1.6703669085548287
1.738265093629218
1 Like
Good to know, but this approach is a bit problematic due to uniqueness requirement. It would have to go like this:
b = unique(a)
partialsort(b, 1:2)
partialsort(b, length(b)-1:length(b))
assuming that length(b) > 1
If performance were on the table, it would be hard to beat looping over the elements and keeping track of the two smallest and two largest values. For conciseness the filtering solution looks perfectly fine.
2 Likes
klaff
July 1, 2020, 6:09am
8
Haha, I was just trying this and the logic was surprisingly confusing to me, but the advantage is rather large:
function twoextremas(a)
e = extrema(a)
e2 = filter(!in(e), a) |> extrema
return e,e2
end
function twoextremas_loop(a)
mn,min2 = Inf,Inf
mx,max2 = -Inf,-Inf
for item ∈ a
if item < mn
min2 = mn
mn = item
elseif item < min2
min2 = item
end
if item > mx
max2 = mx
mx = item
elseif item > max2
max2 = item
end
end
return (mn,mx),(min2,max2)
end
a = randn(1_000_000)
@assert twoextremas(a) == twoextremas_loop(a)
@btime twoextremas($a)
@btime twoextremas_loop($a)
which gave an order of magnitude in performance:
15.287 ms (2 allocations: 7.63 MiB)
1.428 ms (0 allocations: 0 bytes)
[EDIT - as pointed out below, this function does not meet the desired specification]
2 Likes
Timings on my PC are so close to 10x, they almost look fake
13.558 ms (2 allocations: 7.63 MiB)
1.358 ms (0 allocations: 0 bytes)
Can make it quite a bit faster by moving the second if statement into the first
function twoextremas_loop2(a)
mn,min2 = Inf,Inf
mx,max2 = -Inf,-Inf
for item ∈ a
if item < mn
min2 = mn
mn = item
elseif item < min2
min2 = item
else
if item > mx
max2 = mx
mx = item
elseif item > max2
max2 = item
end
end
end
return (mn,mx),(min2,max2)
end
julia> @btime twoextremas($a)
12.151 ms (2 allocations: 7.63 MiB)
((-4.722967184880966, 5.913947095464424), (-4.448884781343994, 4.541658686526455))
julia> @btime twoextremas_loop($a)
1.469 ms (0 allocations: 0 bytes)
((-4.722967184880966, 5.913947095464424), (-4.448884781343994, 4.541658686526455))
julia> @btime twoextremas_loop2($a)
992.839 μs (0 allocations: 0 bytes)
((-4.722967184880966, 5.913947095464424), (-4.448884781343994, 4.541658686526455))
3 Likes
Small correction is needed for it to work correctly on arrays with repetitions, e.g. [3, 1, 4, 1, Inf, Inf]
function twoextremas_loop3(a)
mn,min2 = Inf,Inf
mx,max2 = -Inf,-Inf
for item ∈ a
if item < mn
min2 = mn
mn = item
elseif mn < item < min2
min2 = item
elseif item > mx
max2 = mx
mx = item
elseif max2 < item < mx
max2 = item
end
end
return (mn,mx),(min2,max2)
end
1 Like
klaff
July 1, 2020, 1:26pm
12
That’s what I get for trying to do logic at 1 am. What I really missed was the part of the original post which states “which are different from the original ones”. Good catch.
A bit faster version by reordering if statements with most often taken ones on the top:
function twoextremas_loop4(a)
mn,min2 = Inf,Inf
mx,max2 = -Inf,-Inf
for item ∈ a
if mn < item < min2
min2 = item
elseif max2 < item < mx
max2 = item
elseif item < mn
min2 = mn
mn = item
elseif item > mx
max2 = mx
mx = item
end
end
return (mn,mx),(min2,max2)
end
I’m tempted to write a SIMD version as an educational exercise, but this may be a bridge too far…
klaff
July 2, 2020, 12:03am
14
Well, here’s a threaded version:
function twoextremas(a)
e = extrema(a)
e2 = filter(!in(e), a) |> extrema
return e,e2
end
function twoextremas_loop3(a)
mn,min2 = Inf,Inf
mx,max2 = -Inf,-Inf
for item ∈ a
if item < mn
min2 = mn
mn = item
elseif mn < item < min2
min2 = item
elseif item > mx
max2 = mx
mx = item
elseif max2 < item < mx
max2 = item
end
end
return (mn,mx),(min2,max2)
end
function thread_twoextremas(a)
N = Threads.nthreads()
results = Array{Float64}(undef, 4*N)
chunk = length(a)÷N
Threads.@threads for i ∈ 1:N
lowindex = length(a)÷N*(i-1)+1
highindex = i==N ? length(a) : length(a)÷N*i
x,y = twoextremas_loop3(view(a,lowindex:highindex))
results[4*Threads.threadid()] = x[1]
results[4*Threads.threadid()-1] = x[2]
results[4*Threads.threadid()-2] = y[1]
results[4*Threads.threadid()-3] = y[2]
end
return twoextremas_loop3(results)
end
a = randn(1_000_000)
@assert twoextremas(a) == twoextremas_loop(a)
@assert twoextremas(a) == thread_twoextremas(a)
@btime twoextremas($a)
@btime twoextremas_loop3($a)
@btime thread_twoextremas($a)
which yields
14.983 ms (2 allocations: 7.63 MiB)
1.642 ms (0 allocations: 0 bytes)
382.900 μs (32 allocations: 5.38 KiB)
This is really crude but I’m getting 4.3x speedup from 6 cores, which doesn’t seem too bad.
3 Likes