Hi there performance fanatics!

@amontoison and I are working on our matrix coloring code, and in one of our algorithms we need to split a vector into groups, one for each color. Semantically, this is what we want to achieve

```
function split_into_color_groups(colors::Vector{Int})
cmax = maximum(colors)
groups = [Int[] for c in 1:cmax]
for (i, c) in enumerate(colors)
push!(groups[c], i)
end
return groups
end
```

And here’s an example:

```
julia> colors = rand(1:3, 10):
julia> colors'
1×10 adjoint(::Vector{Int64}) with eltype Int64:
1 3 3 1 2 2 2 2 2 1
julia> split_into_color_groups(colors)
3-element Vector{Vector{Int64}}:
[1, 4, 10]
[5, 6, 7, 8, 9]
[2, 3]
```

Our goal is to save on the costs of memory allocations, both in terms of storage space and execution time. Alexis already made a first improvement which precomputes the correct size for each group to avoid `push!`

, and now we’re left wondering whether to use

- Option A: a vector of vectors, as above
- Option B: a vector of views into the same underlying flat vector

Do you have any insights? Specifically, **even though both options allocate the same number of bytes, is there a point in trying to reduce the number of allocations itself?**

If you want to play around with benchmarks, here are the two optimized implementations.

```
function compute_group_sizes(colors::Vector{Int})
cmax = maximum(colors)
group_sizes = zeros(Int, cmax)
for c in colors
group_sizes[c] += 1
end
return group_sizes
end
function split_vecvec(colors::Vector{Int})
group_sizes = compute_group_sizes(colors)
groups = [Vector{Int}(undef, group_sizes[c]) for c in eachindex(group_sizes)]
fill!(group_sizes, 0)
for (k, c) in enumerate(colors)
group_sizes[c] += 1
pos = group_sizes[c]
groups[c][pos] = k
end
return groups
end
function split_vecview(colors::Vector{Int})
group_sizes = compute_group_sizes(colors)
group_offsets = cumsum(group_sizes)
groups_flat = similar(colors)
for (k, c) in enumerate(colors)
i = group_offsets[c] - group_sizes[c] + 1
groups_flat[i] = k
group_sizes[c] -= 1
end
TV = typeof(view(groups_flat, 1:1))
groups = Vector{TV}(undef, length(group_sizes)) # allocation 4, size cmax
for c in eachindex(group_sizes)
i = 1 + (c == 1 ? 0 : group_offsets[c - 1])
j = group_offsets[c]
groups[c] = view(groups_flat, i:j)
end
return groups
end
```

And here is a benchmark loop:

```
using BenchmarkTools
for n in 10 .^ (2, 3, 4, 5), cmax in (3, 10, 30, 100)
yield()
bench_vecvec = @benchmark split_vecvec(_colors) setup = (_colors = rand(1:($cmax), $n))
bench_vecview = @benchmark split_vecview(_colors) setup = (
_colors = rand(1:($cmax), $n)
)
ratios = (
time=minimum(bench_vecvec).time / minimum(bench_vecview).time,
memory=minimum(bench_vecvec).memory / minimum(bench_vecview).memory,
allocs=minimum(bench_vecvec).allocs / minimum(bench_vecview).allocs,
)
@info "Vecvec / vecview ratios - n=$n, cmax=$cmax" ratios.time ratios.memory ratios.allocs
end
```

Apparently, with a very low number of colors, option A is better, but then option B quickly takes over.