These help a little, but not much:
julia> function vector_of_vector_sum_v2(x, μ, σ, n)
ℓ = -0.5n * log(2π * σ^2)
s = zero(eltype(eltype(x)))
fi = firstindex(x)
li = lastindex(x)
@assert (firstindex(μ) == fi) & (lastindex(μ) == li)
len = li - (fi-1)
for i = 0:(len>>2)-1
_x0 = x[4i+fi]
_μ0 = μ[4i+fi]
_x1 = x[4i+fi+1]
_μ1 = μ[4i+fi+1]
_x2 = x[4i+fi+2]
_μ2 = μ[4i+fi+2]
_x3 = x[4i+fi+3]
_μ3 = μ[4i+fi+3]
@turbo for j ∈ eachindex(_x0)
s += (_x0[j] - _μ0[j])^2 + (_x1[j] - _μ1[j])^2 + (_x2[j] - _μ2[j])^2 + (_x3[j] - _μ3[j])^2
end
end
for i = fi+(len&-4):li
_x, _μ = x[i], μ[i]
@turbo for j ∈ eachindex(_x, _μ)
s += (_x[j] - _μ[j])^2
end
end
ℓ -= 0.5s / σ^2
end
vector_of_vector_sum_v2 (generic function with 1 method)
julia> function vector_of_vector_sum_v3(x, μ, σ, n)
ℓ = -0.5n * log(2π * σ^2)
T = eltype(eltype(x))
s = vzero(pick_vector_width(T), T)
fi = firstindex(x)
li = lastindex(x)
@assert (firstindex(μ) == fi) & (lastindex(μ) == li)
len = li - (fi-1)
for i = 0:(len>>2)-1
_x0 = x[4i+fi]
_μ0 = μ[4i+fi]
_x1 = x[4i+fi+1]
_μ1 = μ[4i+fi+1]
_x2 = x[4i+fi+2]
_μ2 = μ[4i+fi+2]
_x3 = x[4i+fi+3]
_μ3 = μ[4i+fi+3]
@turbo for j ∈ eachindex(_x0)
s += (_x0[j] - _μ0[j])^2 + (_x1[j] - _μ1[j])^2 + (_x2[j] - _μ2[j])^2 + (_x3[j] - _μ3[j])^2
end
end
for i = fi+(len&-4):li
_x, _μ = x[i], μ[i]
@turbo for j ∈ eachindex(_x, _μ)
s += (_x[j] - _μ[j])^2
end
end
ℓ -= 0.5vsum(s) / σ^2
end
vector_of_vector_sum_v3 (generic function with 1 method)
julia> @btime vector_of_vector_sum($x, $μ, $σ, $n)
164.905 ns (0 allocations: 0 bytes)
-8672.369366928524
julia> @btime vector_of_vector_sum_v2($x, $μ, $σ, $n)
147.849 ns (0 allocations: 0 bytes)
-8672.369366928524
julia> @btime vector_of_vector_sum_v3($x, $μ, $σ, $n)
138.980 ns (0 allocations: 0 bytes)
-8672.36936692852
julia> @btime vector_sum($flat_x, $flat_μ, $σ, $n)
83.857 ns (0 allocations: 0 bytes)
-8672.369366928524
vector_sum requires many fewer synchronization steps.