What's the best way to separate elements of a vector based on its values?

Hi all,

What’s the best way to separate elements of a vector based on the value of the last member?

All = [ (15,12,875), (25,132, 333),  (7,9, 201), (147,92, 201), (77,59, 333), (15,11,875), (10,12,875), (4,12,875), (25,132,875), (25,132, 31),  (7,9, 333), (154,92, 201), (36,55, 31), (125,18,875), (10,12,33)]

What I need is a systematic way to first find all elements with the same last index and then put them into a collection of vectors. For the above example this would be:

Separated = [  
[(15,12,875), (15,11,875), (10,12,875), (4,12,875), (25,132,875), (125,18,875) ],
[ (25,132, 333), (77,59, 333),  (7,9, 333)  ].
[ (7,9, 201),  (147,92, 201), (154,92, 201) ],
[ (25,132, 31),  (36,55, 31)],
[(10,12,33)]


   ] 

Here’s my try: first find all last elemnts then?

All = [ (15,12,875), (25,132, 333),  (7,9, 201), (147,92, 201), (77,59, 333), (15,11,875), (10,12,875), (4,12,875), (25,132,875), (25,132, 31),  (7,9, 333), (154,92, 201), (36,55, 31), (125,18,875), (10,12,33)]

u1 = zeros(length(All))
for i in eachindex(All)
    u1[i] = All[i][3]
end 
u2 =unique(u1)

for i in u2
    #find the elements and put them into an array

This would be my attempt:

julia> All = [ (15,12,875), (25,132, 333),  (7,9, 201), (147,92, 201), (77,59, 333), (15,11,875), (10,12,875), (4,12,875), (25,132,875), (25,132, 31),  (7,9, 333), (154,92, 201), (36,55, 31), (125,18,875), (10,12,33)];

julia> separated = [ All[findall(x -> x[3] == li, All)] for li in unique(getindex.(All,3)) ]
5-element Vector{Vector{Tuple{Int64, Int64, Int64}}}:
 [(15, 12, 875), (15, 11, 875), (10, 12, 875), (4, 12, 875), (25, 132, 875), (125, 18, 875)]
 [(25, 132, 333), (77, 59, 333), (7, 9, 333)]
 [(7, 9, 201), (147, 92, 201), (154, 92, 201)]
 [(25, 132, 31), (36, 55, 31)]
 [(10, 12, 33)]

2 Likes

Depending upon your needs consider

u3 = sortperm(u1; rev=true)
1 Like

SplitApplyCombine.jl is good for this

julia> using SplitApplyCombine

julia> All = [ (15,12,875), (25,132, 333),  (7,9, 201), (147,92, 201), (77,59, 333), (15,11,875), (10,12,875), (4,12,875), (25,132,875), (25,132, 31),  (7,9, 333), (154,92, 201), (36,55, 31), (125,18,875), (10,12,33)];

julia> group(last, All)5-element Dictionaries.Dictionary{Int64, Vector{Tuple{Int64, Int64, Int64}}}
 875 │ [(15, 12, 875), (15, 11, 875), (10, 12, 875), (4, 12, 875), (25, 13…
 333 │ [(25, 132, 333), (77, 59, 333), (7, 9, 333)]
 201 │ [(7, 9, 201), (147, 92, 201), (154, 92, 201)]
  31 │ [(25, 132, 31), (36, 55, 31)]
  33 │ [(10, 12, 33)]
4 Likes

I also believe that SAC’s group function is the best way to solve this problem.
But, if you don’t want to depend on any package, you might want to consider such a solution below

d=Dict{Int, Vector{NTuple{3, Int}}}()

foreach(e->push!(get!(()->[], d,last(e)),e), All)

values(d)
2 Likes

If the function by which to group is not expensive to evaluate, the following is a simple implementation:

function group(itr; by = identity)
    return [[item for item in itr if by(item) == val] for val in unique(by.(itr))]
end

which gives what you want:

julia> @time Separated = group(All, by=last)
  0.000009 seconds (23 allocations: 2.438 KiB)
5-element Vector{Vector{Tuple{Int64, Int64, Int64}}}:
 [(15, 12, 875), (15, 11, 875), (10, 12, 875), (4, 12, 875), (25, 132, 875), (125, 18, 875)]
 [(25, 132, 333), (77, 59, 333), (7, 9, 333)]
 [(7, 9, 201), (147, 92, 201), (154, 92, 201)]
 [(25, 132, 31), (36, 55, 31)]
 [(10, 12, 33)]

You can also make a lazy version

function lazygroup(itr; by = identity)
    return ((item for item in itr if by(item) == val) for val in unique(by.(itr)))
end

Then this will only allocate to identify the unique values of applying by. You can then broadcast collect to get back everything:

julia> @time groups = lazygroup(l, by = last)
  0.000007 seconds (8 allocations: 752 bytes)
Base.Generator{Vector{Int64}, var"#198#200"{typeof(last), Vector{Tuple{Int64, Int64, Int64}}}}(var"#198#200"{typeof(last), Vector{Tuple{Int64, Int64, Int64}}}(last, [(15, 12, 875), (25, 132, 333), (7, 9, 201), (147, 92, 201), (77, 59, 333), (15, 11, 875), (10, 12, 875), (4, 12, 875), (25, 132, 875), (25, 132, 31), (7, 9, 333), (154, 92, 201), (36, 55, 31), (125, 18, 875), (10, 12, 33)]), [875, 333, 201, 31, 33])

julia> collect.(groups)
5-element Vector{Vector{Tuple{Int64, Int64, Int64}}}:
 [(15, 12, 875), (15, 11, 875), (10, 12, 875), (4, 12, 875), (25, 132, 875), (125, 18, 875)]
 [(25, 132, 333), (77, 59, 333), (7, 9, 333)]
 [(7, 9, 201), (147, 92, 201), (154, 92, 201)]
 [(25, 132, 31), (36, 55, 31)]
 [(10, 12, 33)]

but you can also write two nested for loops in order to access each element by group and then by element:

for g in groups
    for el in g
        # do something
    end
end
2 Likes

why seperate1 is faster than separate0?

julia>
       function separate0(A; by=identity)
           d=Dict{Int, Vector{NTuple{3, Int}}}()
           foreach(e->push!(get!(()->Vector{NTuple{3, Int}}(), d,by(e)),e), A)
           return values(d)
       end



       function separate1(A; by=identity)
           d=Dict{Int, Vector{Any}}()
           foreach(e->push!(get!(()->[], d,by(e)),e), A)
           return values(d)
       end


       function separate2(A; by=identity)
           T=eltype(A)
           d=Dict{Int, Vector{T}}()
           foreach(e->push!(get!(()->Vector{T}(), d,by(e)),e), A)       
           return values(d)
       end
separate2 (generic function with 1 method)

julia>
       function lazygroup(itr; by = identity)
           return ((item for item in itr if by(item) == val) for val in unique(by.(itr)))
       end
lazygroup (generic function with 1 method)

julia> function groupss(itr; by = identity)
           return [[item for item in itr if by(item) == val] for val in unique(by.(itr))]
       end
groupss (generic function with 1 method)

julia>  
       using BenchmarkTools
       
       All = [ (15,12,875), (25,132, 333),  (7,9, 201), (147,92, 201), (77,59, 333), (15,11,875), (10,12,875), (4,12,875), (25,132,875), (25,132, 31),  (7,9, 333), (154,92, 201), (36,55, 31), (125,18,875), (10,12,33)]

       bigAll=repeat(All,10^5)
1500000-element Vector{Tuple{Int64, Int64, Int64}}:

julia> @btime separate0(bigAll;by=last);
  38.402 ms (68 allocations: 67.94 MiB)

julia> @btime separate1(bigAll;by=last);
  31.492 ms (1500068 allocations: 70.80 MiB)

julia> @btime separate2(bigAll;by=last);
  70.811 ms (1500068 allocations: 113.71 MiB)

julia> @btime collect.(lazygroup(bigAll, by = last));
  42.371 ms (82 allocations: 79.38 MiB)

julia> @btime groupss(bigAll, by = last);
  42.667 ms (77 allocations: 79.38 MiB)

julia> @btime [ bigAll[findall(x -> x[3] == li, bigAll)] for li in unique(getindex.(bigAll,3)) ];
  36.519 ms (69 allocations: 58.14 MiB)

julia> using SplitApplyCombine

julia> @btime group(last, bigAll);
  36.968 ms (82 allocations: 67.94 MiB)