X is a matrix and sub is a subset of its columns. I'm trying to find the most efficient way to compute X'X for the submatrix X[:, sub]. In the code below I store the transpose Xt, so the selected columns of X are the rows Xt[sub, :] and the product is Xt[sub, :] * Xt[sub, :]'. What I find is that first allocating the copy Xt[sub, :] is significantly faster than using view(Xt, sub, :), but the extra memory allocation is wasteful. Is there a better way to do this? I can preallocate a buffer array, but it doesn't seem like that should be necessary.
using BenchmarkTools
using LinearAlgebra
using StatsBase: sample
"""
    XXtview(Xt, sub)

Return the product of the rows of `Xt` selected by `sub` with their own adjoint,
taking the rows through a non-copying view rather than materializing them.

# Arguments
- `Xt`: source matrix.
- `sub`: row indices of `Xt` to select.
"""
function XXtview(Xt::Matrix{Float64}, sub::Vector{Int})
    # The view references the selected rows of `Xt` without copying them.
    rows = @view Xt[sub, :]
    return rows * adjoint(rows)
end
"""
    XXtcopy(Xt, sub)

Return the product of the rows of `Xt` selected by `sub` with their own adjoint,
first copying the selected rows into a freshly allocated matrix.

# Arguments
- `Xt`: source matrix.
- `sub`: row indices of `Xt` to select.
"""
function XXtcopy(Xt::Matrix{Float64}, sub::Vector{Int})
    # Indexing with a vector of rows allocates a new dense matrix.
    rows = getindex(Xt, sub, :)
    return rows * adjoint(rows)
end
"""
    XXtbuffer(Xt, sub, Xt_sub)

Copy the rows of `Xt` selected by `sub` into the caller-supplied buffer `Xt_sub`
and return the product of that buffer with its own adjoint.

# Arguments
- `Xt`: source matrix.
- `sub`: row indices of `Xt` to select.
- `Xt_sub`: scratch buffer, overwritten on every call. It must have size
  `(length(sub), size(Xt, 2))`.

# Throws
- `DimensionMismatch` if `Xt_sub` does not have the required size.
"""
function XXtbuffer(Xt::Matrix{Float64}, sub::Vector{Int}, Xt_sub::Matrix{Float64})
    expected = (length(sub), size(Xt, 2))
    # `copyto!` copies elements in linear order and only needs the destination to be
    # long enough, so a buffer of the wrong shape would be accepted silently and
    # yield a wrong (and wrongly sized) product. Reject it up front instead.
    if size(Xt_sub) != expected
        throw(DimensionMismatch(
            "buffer Xt_sub has size $(size(Xt_sub)), expected $(expected)",
        ))
    end
    copyto!(Xt_sub, view(Xt, sub, :))
    return Xt_sub * Xt_sub'
end
# Problem sizes: Xt is p×n, and nsub of its p rows are selected.
n, p = 1000, 500
nsub = 10
Xt = randn(p, n)
# `sample` draws with replacement by default, which can repeat indices; draw
# without replacement so that `sub` is a genuine subset of distinct rows.
sub = sample(1:p, nsub; replace=false)
# Scratch buffer for XXtbuffer; its contents are overwritten before being read.
Xt_sub = randn(nsub, n)
# Sanity checks: the three implementations must agree elementwise.
@assert all(XXtcopy(Xt, sub) .≈ XXtview(Xt, sub))
@assert all(XXtcopy(Xt, sub) .≈ XXtbuffer(Xt, sub, Xt_sub))
@benchmark XXtview($Xt, $sub)
BenchmarkTools.Trial: 
  memory estimate:  1.31 KiB
  allocs estimate:  10
  --------------
  minimum time:     99.061 μs (0.00% GC)
  median time:      99.882 μs (0.00% GC)
  mean time:        101.643 μs (0.00% GC)
  maximum time:     210.991 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1
@benchmark XXtcopy($Xt, $sub)
BenchmarkTools.Trial: 
  memory estimate:  79.14 KiB
  allocs estimate:  5
  --------------
  minimum time:     40.986 μs (0.00% GC)
  median time:      47.674 μs (0.00% GC)
  mean time:        61.212 μs (22.19% GC)
  maximum time:     54.852 ms (99.72% GC)
  --------------
  samples:          10000
  evals/sample:     1
@benchmark XXtbuffer($Xt, $sub, $Xt_sub)
BenchmarkTools.Trial: 
  memory estimate:  1008 bytes
  allocs estimate:  4
  --------------
  minimum time:     44.912 μs (0.00% GC)
  median time:      45.849 μs (0.00% GC)
  mean time:        49.861 μs (0.00% GC)
  maximum time:     284.885 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1