I am de-interleaving data that has been stored as a vector, and I would like to store the data as a matrix. I have tried and performance-tested 3 methods, and am looking for any comments on how to improve them.

An example is:

```
julia> a = collect(1:20.0);
julia> deinterleavef(a, 4, 5)
5×4 Matrix{Float64}:
1.0 2.0 3.0 4.0
5.0 6.0 7.0 8.0
9.0 10.0 11.0 12.0
13.0 14.0 15.0 16.0
17.0 18.0 19.0 20.0
```

I wrote 3 functions to determine the relative performance. They can be classified as reshape/permutedims, slice, and for loop, and are shown below:

```
"""
    deinterleaver(data, num_channels, num_samples_per_channel)

Convert an interleaved vector to a `num_samples_per_channel × num_channels` matrix using `reshape` and `permutedims` (probably an expensive way to do it).
"""
function deinterleaver(data::Vector{T}, num_channels::Integer, num_samples_per_channel::Integer) where T <: AbstractFloat
    # Guard: the vector must hold exactly one value per (channel, sample) pair.
    expected_length = num_channels * num_samples_per_channel
    length(data) == expected_length || error("Dimension mismatch")

    # Consecutive entries of `data` belong to consecutive channels, so a
    # column-major reshape yields a channels × samples layout ...
    channels_by_samples = reshape(data, num_channels, num_samples_per_channel)

    # ... which permutedims copies into the samples × channels result.
    return permutedims(channels_by_samples)
end
"""
    deinterleaves(data, num_channels, num_samples_per_channel)

Build the `num_samples_per_channel × num_channels` matrix one channel (column) at a time by taking a strided slice of `data` (not sure of the cost of the slicing).
"""
function deinterleaves(data::Vector{T}, num_channels::Integer, num_samples_per_channel::Integer) where T <: AbstractFloat
    # De-interleave `data` into a num_samples_per_channel × num_channels
    # matrix, one channel (column) at a time. Raises an error with the
    # message "Dimension mismatch" when length(data) differs from
    # num_channels * num_samples_per_channel.
    num_samples = length(data)
    if num_samples != num_channels * num_samples_per_channel
        error("Dimension mismatch")
    end

    datamatrix = similar(data, num_samples_per_channel, num_channels)
    for c in 1:num_channels
        # Channel c occupies positions c, c + num_channels, ... of `data`.
        # Indexing with `data[range]` would copy those values into a
        # temporary vector first; reading through a view and broadcasting
        # into the column writes them straight into the output instead.
        datamatrix[:, c] .= view(data, c:num_channels:num_samples)
    end
    return datamatrix
end
"""
    deinterleavef(data, num_channels, num_samples_per_channel)

Fill the `num_samples_per_channel × num_channels` matrix element by element using nested for loops.
"""
function deinterleavef(data::Vector{T}, num_channels::Integer, num_samples_per_channel::Integer) where T <: AbstractFloat
    # Guard: reject input whose length does not match the requested shape.
    total = length(data)
    if total != num_channels * num_samples_per_channel
        error("Dimension mismatch")
    end

    out = similar(data, num_samples_per_channel, num_channels)
    for channel in 1:num_channels
        # The length check above keeps every index below within bounds.
        @inbounds for sample in 1:num_samples_per_channel
            # Sample `sample` of channel `channel` is stored after
            # (sample - 1) complete groups of num_channels values.
            src = channel + (sample - 1) * num_channels
            out[sample, channel] = data[src]
        end
    end
    return out
end
```

Running this on an 8 GB Raspberry Pi 4, I obtain the following performance metrics:

```
julia> a = collect(1:2_000_000.0);
julia> @btime deinterleaver(a, 4, Integer(round(length(a)/4)));
56.920 ms (8 allocations: 15.26 MiB)
julia> @btime deinterleaves(a, 4, Integer(round(length(a)/4)));
105.602 ms (14 allocations: 30.52 MiB)
julia> @btime deinterleavef(a, 4, Integer(round(length(a)/4)));
57.043 ms (6 allocations: 15.26 MiB)
and, after waiting a while (the difference probably shows thermal effects on the CPU):
julia> @btime deinterleavef(a, 4, Integer(round(length(a)/4)));
56.732 ms (6 allocations: 15.26 MiB)
```

It seems like a toss-up between the reshape/permutedims and the for-loop implementations, both of which are about twice as fast as the slicing implementation and use half the memory. The for loop also offers an opportunity for parallelization.

What optimizations are still possible, or is there another method that I am missing?