`X` is a matrix and `sub` is a subset of its columns. I'm trying to find the most efficient way to compute `X'X` for the submatrix. (In the code below I store the transpose, `Xt = X'`, so the selected columns of `X` are the rows `sub` of `Xt`.) What I find is that first allocating `Xt[sub, :]` is significantly faster than using `view(Xt, sub, :)`, but the extra memory allocation is wasteful. Is there a better way to do this? I can preallocate a buffer array, but it doesn't seem like that should be necessary.

```
using BenchmarkTools
using LinearAlgebra
using StatsBase: sample
"""
    XXtview(Xt, sub)

Return `Xt[sub, :] * Xt[sub, :]'`, selecting the rows `sub` of `Xt` through a view
so that the selected rows are not copied before the multiplication.
"""
function XXtview(Xt::Matrix{Float64}, sub::Vector{Int})
    rows = @view Xt[sub, :]
    return rows * adjoint(rows)
end
"""
    XXtcopy(Xt, sub)

Return `Xt[sub, :] * Xt[sub, :]'`, first materializing the rows `sub` of `Xt` as a
newly allocated matrix and then multiplying that copy by its adjoint.
"""
function XXtcopy(Xt::Matrix{Float64}, sub::Vector{Int})
    rows = getindex(Xt, sub, :)
    return rows * adjoint(rows)
end
"""
    XXtbuffer(Xt, sub, Xt_sub)

Return `Xt[sub, :] * Xt[sub, :]'`, staging the selected rows in the caller-supplied
buffer `Xt_sub` instead of allocating a fresh copy of them.

# Arguments
- `Xt`: source matrix.
- `sub`: indices of the rows of `Xt` to select.
- `Xt_sub`: scratch matrix of size `(length(sub), size(Xt, 2))`; it is overwritten.

# Returns
A newly allocated `length(sub) × length(sub)` matrix.

# Throws
- `DimensionMismatch` if `Xt_sub` does not have the size stated above.
"""
function XXtbuffer(Xt::Matrix{Float64}, sub::Vector{Int}, Xt_sub::Matrix{Float64})
    expected = (length(sub), size(Xt, 2))
    # `copyto!` only needs the destination to be long enough and fills it in linear
    # order, so a wrongly shaped buffer would silently yield a wrong product.
    if size(Xt_sub) != expected
        throw(DimensionMismatch(
            "buffer Xt_sub has size $(size(Xt_sub)), expected $expected"))
    end
    copyto!(Xt_sub, view(Xt, sub, :))
    return Xt_sub * Xt_sub'
end
# Problem size: Xt is p×n, and nsub of its p rows are selected.
n, p = 1000, 500
nsub = 10
Xt = randn(p, n)
# Draw without replacement so that `sub` is a genuine subset; StatsBase's `sample`
# draws with replacement by default and could otherwise repeat an index.
sub = sample(1:p, nsub; replace=false)
# Scratch buffer for XXtbuffer; its initial contents are overwritten before use.
Xt_sub = randn(nsub, n)
# Sanity check: all three variants must agree before they are timed.
XXt_ref = XXtcopy(Xt, sub)
@assert all(XXt_ref .≈ XXtview(Xt, sub))
@assert all(XXt_ref .≈ XXtbuffer(Xt, sub, Xt_sub))
# `$` interpolates the globals so that they are benchmarked as local values.
@benchmark XXtview($Xt, $sub)
```

```
BenchmarkTools.Trial:
memory estimate: 1.31 KiB
allocs estimate: 10
--------------
minimum time: 99.061 μs (0.00% GC)
median time: 99.882 μs (0.00% GC)
mean time: 101.643 μs (0.00% GC)
maximum time: 210.991 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
```

```
# Time the variant that materializes Xt[sub, :] as a new array before multiplying.
@benchmark XXtcopy($Xt, $sub)
```

```
BenchmarkTools.Trial:
memory estimate: 79.14 KiB
allocs estimate: 5
--------------
minimum time: 40.986 μs (0.00% GC)
median time: 47.674 μs (0.00% GC)
mean time: 61.212 μs (22.19% GC)
maximum time: 54.852 ms (99.72% GC)
--------------
samples: 10000
evals/sample: 1
```

```
# Time the variant that copies the selected rows into the preallocated Xt_sub buffer.
@benchmark XXtbuffer($Xt, $sub, $Xt_sub)
```

```
BenchmarkTools.Trial:
memory estimate: 1008 bytes
allocs estimate: 4
--------------
minimum time: 44.912 μs (0.00% GC)
median time: 45.849 μs (0.00% GC)
mean time: 49.861 μs (0.00% GC)
maximum time: 284.885 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
```