After some tests, we can write code with just loops only
function ComputeBasis_No_Vec!(MX::AbstractMatrix{Float64}, X::AbstractMatrix{Float64}, op::HermiteOperation)
tmp = X[:, op.init_domain]
@views for (i, x) in enumerate(eachcol(MX))
for j in eachindex(x)
if i == 1
x[j] = tmp[j]
continue
end
if i == 2
x[j] = tmp[j] * tmp[j] - 1
continue
end
x[j] = tmp[j] * MX[j, op.range[i-1]] - (i-1) * MX[j, op.range[i - 2]]
end
end
MX
end
which is nearly 2X with my benchmarks .
julia> @btime RunTest()
2.699 μs (17 allocations: 1.33 KiB)
julia> @btime RunTest_No_Vec()
1.287 μs (10 allocations: 800 bytes)