Improving the performance of a sum over a vector of vectors

These help a little, but not much:

julia> function vector_of_vector_sum_v2(x, μ, σ, n)
           ℓ = -0.5n * log(2π * σ^2)
           s = zero(eltype(eltype(x)))
           fi = firstindex(x)
           li = lastindex(x)
           @assert (firstindex(μ) == fi) & (lastindex(μ) == li)
           len = li - (fi-1)
           for i = 0:(len>>2)-1
               _x0 = x[4i+fi]
               _μ0 = μ[4i+fi]
               _x1 = x[4i+fi+1]
               _μ1 = μ[4i+fi+1]
               _x2 = x[4i+fi+2]
               _μ2 = μ[4i+fi+2]
               _x3 = x[4i+fi+3]
               _μ3 = μ[4i+fi+3]
               @turbo for j ∈ eachindex(_x0)
                   s += (_x0[j] - _μ0[j])^2 + (_x1[j] - _μ1[j])^2 + (_x2[j] - _μ2[j])^2 + (_x3[j] - _μ3[j])^2
               end
           end
           for i = fi+(len&-4):li
               _x, _μ = x[i], μ[i]
               @turbo for j ∈ eachindex(_x, _μ)
                   s += (_x[j] - _μ[j])^2
               end
           end
           ℓ -= 0.5s / σ^2
       end
vector_of_vector_sum_v2 (generic function with 1 method)

julia> function vector_of_vector_sum_v3(x, μ, σ, n)
           ℓ = -0.5n * log(2π * σ^2)
           T = eltype(eltype(x))
           s = vzero(pick_vector_width(T), T)
           fi = firstindex(x)
           li = lastindex(x)
           @assert (firstindex(μ) == fi) & (lastindex(μ) == li)
           len = li - (fi-1)
           for i = 0:(len>>2)-1
               _x0 = x[4i+fi]
               _μ0 = μ[4i+fi]
               _x1 = x[4i+fi+1]
               _μ1 = μ[4i+fi+1]
               _x2 = x[4i+fi+2]
               _μ2 = μ[4i+fi+2]
               _x3 = x[4i+fi+3]
               _μ3 = μ[4i+fi+3]
               @turbo for j ∈ eachindex(_x0)
                   s += (_x0[j] - _μ0[j])^2 + (_x1[j] - _μ1[j])^2 + (_x2[j] - _μ2[j])^2 + (_x3[j] - _μ3[j])^2
               end
           end
           for i = fi+(len&-4):li
               _x, _μ = x[i], μ[i]
               @turbo for j ∈ eachindex(_x, _μ)
                   s += (_x[j] - _μ[j])^2
               end
           end
           ℓ -= 0.5vsum(s) / σ^2
       end
vector_of_vector_sum_v3 (generic function with 1 method)

julia> @btime vector_of_vector_sum($x, $μ, $σ, $n)
  164.905 ns (0 allocations: 0 bytes)
-8672.369366928524

julia> @btime vector_of_vector_sum_v2($x, $μ, $σ, $n)
  147.849 ns (0 allocations: 0 bytes)
-8672.369366928524

julia> @btime vector_of_vector_sum_v3($x, $μ, $σ, $n)
  138.980 ns (0 allocations: 0 bytes)
-8672.36936692852

julia> @btime vector_sum($flat_x, $flat_μ, $σ, $n)
  83.857 ns (0 allocations: 0 bytes)
-8672.369366928524

vector_sum requires many fewer synchronization steps.

4 Likes