The following is the simplified demo code that illustrates the problem

```
using OffsetArrays
using LinearAlgebra
using SparseArrays
using LoopVectorization
const L = 24
const mass = 1.0
const basisN = 4341
kList = [0, 12]
psi = zeros(ComplexF64, basisN)
#=============================================================================#
# constructHam
#=============================================================================#
function constructHam(k::Int64, mass::Float64)
Ham = spzeros(ComplexF64, basisN, basisN)
for _ in 1: L*basisN
i = trunc(Int, rand()*basisN) + 1
j = trunc(Int, rand()*basisN) + 1
Ham[i, j] += mass*rand()
Ham[j, i] = Ham[i, j]
end
return Ham
end
#=============================================================================#
# get Hamiltonian and eigensystem in all momentum sectors
#=============================================================================#
function getHam_k(mass::Float64)
Ham_k = SparseMatrixCSC{ComplexF64,Int64}[]
for k in 0:L-1
if k in kList
Ham = constructHam(k, mass)
else
Ham = spzeros(ComplexF64, 0, 0)
end
push!(Ham_k, Ham)
end
return OffsetVector(Ham_k, 0:L-1)
end
const Ham_k = getHam_k(mass)
#------------------------------------------------------------------------------
# get eigvecs and eigvals
#------------------------------------------------------------------------------
function getEigenSystem()
eigvals_k = Vector{Float64}[]
eigvecs_k = Matrix{ComplexF64}[]
for k in 0:L-1
if k in kList
println("Diagonalizing momentum k = $k")
# Hermitian necessary to make sure eigvecs all orthonormal
eigvals, eigvecs = eigen(Hermitian(Matrix(Ham_k[k])))
else
eigvals = Vector{Float64}(undef, 0)
eigvecs = Matrix{ComplexF64}(undef, 0, 0)
end
push!(eigvals_k, eigvals)
push!(eigvecs_k, eigvecs)
end
return OffsetVector(eigvals_k, 0:L-1), OffsetVector(eigvecs_k, 0:L-1)
end
const eigvals_k, eigvecs_k = getEigenSystem()
function get_coeffs_k()
coeffs_k = Vector{ComplexF64}[]
for k in 0:L-1
if k in kList
# coeffs = rand(ComplexF64, basisN)
eigvecs = eigvecs_k[k]
coeffs = [conj(eigvecs[trunc(Int, rand()*basisN)+1, i]) for i in 1:basisN]
else
coeffs = ComplexF64[]
end
push!(coeffs_k, coeffs)
end
return OffsetVector(coeffs_k, 0:L-1)
end
const coeffs_k = get_coeffs_k()
#------------------------------------------------------------------------------
#
#------------------------------------------------------------------------------
const T = ComplexF64
function constructPsi_0!(coeffs_k::OffsetArray{Vector{T},1, Vector{Vector{T}}}, eigvecs_k:: OffsetArray{Matrix{T},1, Vector{Matrix{T}}}, ψ::Vector{T})::Nothing where T
fill!(ψ, 0.0)
phi = Vector{ComplexF64}(undef, basisN)
for k in kList
coeffs = coeffs_k[k]
eigvecs= eigvecs_k[k]
vec = zeros(ComplexF64, basisN)
for n in 1:basisN
@. vec += coeffs_k[k][n] * (@view eigvecs[:, n])
end
@. ψ += vec
end
ψ[:] /= norm(ψ)
return nothing
end
function constructPsi_1!(coeffs_k::OffsetArray{Vector{T},1, Vector{Vector{T}}}, eigvecs_k:: OffsetArray{Matrix{T},1, Vector{Matrix{T}}}, ψ::Vector{T})::Nothing where T
fill!(ψ, zero(T))
for k in kList
coeffs = coeffs_k[k]
eigvecs = eigvecs_k[k]
vec = zeros(T, basisN)
for (i, c) in pairs(coeffs)
@avx for j in eachindex(vec)
vec[j] += c * eigvecs[j,i]
end
end
@. ψ += vec
end
ψ[:] /= norm(ψ)
return nothing
end
```

The number of the computation in `constructPsi!`

is `~4341*4341*2`

, with GHz computer, the optimal running time should `~ 4341*4341*2/1E9 ~ 40ms`

. However,

```
julia> @time constructPsi_1!(coeffs_k, eigvecs_k, psi)
3.660657 seconds (179.70 M allocations: 4.365 GiB, 18.60% gc time)
```