LoopVectorization.jl: adding `@avx` makes code slower

I am given a multi-dimensional `Array` and have to multiply it by an `SMatrix` along multiple dimensions. Using a straightforward approach in 2 dimensions, adding `@avx` from the great package LoopVectorization.jl increases performance as desired. However, doing the same in 3 dimensions, `@avx` makes the code slower.
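
In index notation, the operation is `data_out[v, i, j] = Σ_{ii, jj} vandermonde[i, ii] * vandermonde[j, jj] * data_in[v, ii, jj]` in 2D, with an additional factor `vandermonde[k, kk]` and a sum over `kk` in 3D.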

2D case
julia> using BenchmarkTools, LoopVectorization, StaticArrays

julia> function multiply_coordinatewise_squeezed!(
           data_out::AbstractArray{T, 3}, data_in::AbstractArray{T, 3}, vandermonde, n_variables) where T
         n_nodes_out = size(vandermonde, 1)
         n_nodes_in  = size(vandermonde, 2)

         @boundscheck begin
           inbounds = (size(data_out, 1) >= n_variables) &&
                      (size(data_out, 2) == size(data_out, 3) == n_nodes_out) &&
                      (size(data_in, 1) >= n_variables) &&
                      (size(data_in, 2) == size(data_in, 3) == n_nodes_in)
           inbounds || throw(BoundsError())
         end

         @inbounds for j in 1:n_nodes_out, i in 1:n_nodes_out
           for v in 1:n_variables
             acc = zero(eltype(data_out))
             for jj in 1:n_nodes_in, ii in 1:n_nodes_in
             acc += vandermonde[i, ii] * vandermonde[j, jj] * data_in[v, ii, jj]
             end
             data_out[v, i, j] = acc
           end
         end

         return data_out
       end

julia> function multiply_coordinatewise_squeezed_avx!(
           data_out::AbstractArray{T, 3}, data_in::AbstractArray{T, 3}, vandermonde, n_variables) where T
         n_nodes_out = size(vandermonde, 1)
         n_nodes_in  = size(vandermonde, 2)

         @boundscheck begin
           inbounds = (size(data_out, 1) >= n_variables) &&
                      (size(data_out, 2) == size(data_out, 3) == n_nodes_out) &&
                      (size(data_in, 1) >= n_variables) &&
                      (size(data_in, 2) == size(data_in, 3) == n_nodes_in)
           inbounds || throw(BoundsError())
         end

         @avx for j in 1:n_nodes_out, i in 1:n_nodes_out
           for v in 1:n_variables
             acc = zero(eltype(data_out))
             for jj in 1:n_nodes_in, ii in 1:n_nodes_in
               acc += vandermonde[i, ii] * vandermonde[j, jj] * data_in[v, ii, jj]
             end
             data_out[v, i, j] = acc
           end
         end

         return data_out
       end

julia> function run_benchmarks_2d(;n_variables=4, n_nodes_in=4, n_nodes_out=n_nodes_in)
         data_in  = randn(n_variables, n_nodes_in,  n_nodes_in)
         data_out = randn(n_variables, n_nodes_out, n_nodes_out)
         vandermonde_dynamic = randn(n_nodes_out, n_nodes_in)
         vandermonde_static  = SMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)

         println("multiply_coordinatewise_squeezed!, vandermonde_static")
         display(@benchmark multiply_coordinatewise_squeezed!($(data_out), $(data_in), $(vandermonde_static), $(n_variables)))
         println()
         
         println("multiply_coordinatewise_squeezed!, vandermonde_dynamic")
         display(@benchmark multiply_coordinatewise_squeezed!($(data_out), $(data_in), $(vandermonde_dynamic), $(n_variables)))
         println()

         println("multiply_coordinatewise_squeezed_avx!, vandermonde_static")
         display(@benchmark multiply_coordinatewise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_static), $(n_variables)))
         println()

         println("multiply_coordinatewise_squeezed_avx!, vandermonde_dynamic")
         display(@benchmark multiply_coordinatewise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_dynamic), $(n_variables)))
         println()

         nothing
       end

julia> run_benchmarks_2d()
multiply_coordinatewise_squeezed!, vandermonde_static
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     421.402 ns (0.00% GC)
  median time:      422.090 ns (0.00% GC)
  mean time:        425.764 ns (0.00% GC)
  maximum time:     956.447 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     199

multiply_coordinatewise_squeezed!, vandermonde_dynamic
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     738.446 ns (0.00% GC)
  median time:      807.826 ns (0.00% GC)
  mean time:        815.222 ns (0.00% GC)
  maximum time:     1.771 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     92

multiply_coordinatewise_squeezed_avx!, vandermonde_static
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     331.277 ns (0.00% GC)
  median time:      331.781 ns (0.00% GC)
  mean time:        335.048 ns (0.00% GC)
  maximum time:     805.723 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     224

multiply_coordinatewise_squeezed_avx!, vandermonde_dynamic
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     98.490 ns (0.00% GC)
  median time:      100.468 ns (0.00% GC)
  mean time:        101.441 ns (0.00% GC)
  maximum time:     213.163 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     946

3D case
julia> using BenchmarkTools, LoopVectorization, StaticArrays

julia> function multiply_coordinatewise_squeezed!(
           data_out::AbstractArray{T, 4}, data_in::AbstractArray{T, 4}, vandermonde, n_variables) where T
         n_nodes_out = size(vandermonde, 1)
         n_nodes_in  = size(vandermonde, 2)

         @boundscheck begin
           inbounds = (size(data_out, 1) >= n_variables) &&
                       (size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&
                      (size(data_in, 1) >= n_variables) &&
                      (size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)
           inbounds || throw(BoundsError())
         end

         @inbounds for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out
           for v in 1:n_variables
             acc = zero(eltype(data_out))
             for kk in 1:n_nodes_in, jj in 1:n_nodes_in, ii in 1:n_nodes_in
               acc += vandermonde[i, ii] * vandermonde[j, jj] * vandermonde[k, kk] * data_in[v, ii, jj, kk]
             end
             data_out[v, i, j, k] = acc
           end
         end

         return data_out
       end

julia> function multiply_coordinatewise_squeezed_avx!(
           data_out::AbstractArray{T, 4}, data_in::AbstractArray{T, 4}, vandermonde, n_variables) where T
         n_nodes_out = size(vandermonde, 1)
         n_nodes_in  = size(vandermonde, 2)

         @boundscheck begin
           inbounds = (size(data_out, 1) >= n_variables) &&
                       (size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&
                      (size(data_in, 1) >= n_variables) &&
                      (size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)
           inbounds || throw(BoundsError())
         end

         @avx for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out
           for v in 1:n_variables
             acc = zero(eltype(data_out))
             for kk in 1:n_nodes_in, jj in 1:n_nodes_in, ii in 1:n_nodes_in
               acc += vandermonde[i, ii] * vandermonde[j, jj] * vandermonde[k, kk] * data_in[v, ii, jj, kk]
             end
             data_out[v, i, j, k] = acc
           end
         end

         return data_out
       end

julia> function run_benchmarks_3d(;n_variables=4, n_nodes_in=4, n_nodes_out=n_nodes_in)
         data_in  = randn(n_variables, n_nodes_in,  n_nodes_in,  n_nodes_in)
         data_out = randn(n_variables, n_nodes_out, n_nodes_out, n_nodes_out)
         vandermonde_dynamic = randn(n_nodes_out, n_nodes_in)
         vandermonde_static  = SMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)

         println("multiply_coordinatewise_squeezed!, vandermonde_static")
         display(@benchmark multiply_coordinatewise_squeezed!($(data_out), $(data_in), $(vandermonde_static), $(n_variables)))
         println()
         
         println("multiply_coordinatewise_squeezed!, vandermonde_dynamic")
         display(@benchmark multiply_coordinatewise_squeezed!($(data_out), $(data_in), $(vandermonde_dynamic), $(n_variables)))
         println()

         println("multiply_coordinatewise_squeezed_avx!, vandermonde_static")
         display(@benchmark multiply_coordinatewise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_static), $(n_variables)))
         println()

         println("multiply_coordinatewise_squeezed_avx!, vandermonde_dynamic")
         display(@benchmark multiply_coordinatewise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_dynamic), $(n_variables)))
         println()

         nothing
       end

julia> run_benchmarks_3d()
multiply_coordinatewise_squeezed!, vandermonde_static
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     10.969 μs (0.00% GC)
  median time:      10.972 μs (0.00% GC)
  mean time:        11.084 μs (0.00% GC)
  maximum time:     28.741 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

multiply_coordinatewise_squeezed!, vandermonde_dynamic
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     12.767 μs (0.00% GC)
  median time:      12.794 μs (0.00% GC)
  mean time:        13.513 μs (0.00% GC)
  maximum time:     61.101 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

multiply_coordinatewise_squeezed_avx!, vandermonde_static
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     15.073 μs (0.00% GC)
  median time:      15.763 μs (0.00% GC)
  mean time:        16.819 μs (0.00% GC)
  maximum time:     68.479 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

multiply_coordinatewise_squeezed_avx!, vandermonde_dynamic
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     955.130 ns (0.00% GC)
  median time:      957.609 ns (0.00% GC)
  mean time:        969.877 ns (0.00% GC)
  maximum time:     2.853 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     23

1. Did I do anything wrong in the 3D case with `vandermonde::SMatrix`?
2. Is there an explanation why `@avx` is significantly faster for `vandermonde::Matrix` than for `vandermonde::SMatrix`?

Edit: Compared `vandermonde::Matrix` and `vandermonde::SMatrix` after reading @mcabbott’s comment below

I don’t see obvious mistakes, although I’m not sure StaticArrays helps here. But there’s an algorithmic opportunity: it’s much more efficient to do repeated matrix multiplication in series than to write one big set of loops, since one fused set of loops costs O(n^(2d)) multiply-adds per variable in d dimensions, while d successive contractions cost only O(d·n^(d+1)). Here’s my quick comparison:

using LoopVectorization, Tullio, BenchmarkTools, Test

# Functions as in the question, more compactly:
loops!(out, vand, data::Array{<:Any,3}) = @tullio out[v, i, j] = vand[i, ii] * vand[j, jj] * data[v, ii, jj] threads=false
loops!(out, vand, data::Array{<:Any,4}) = @tullio out[v, i, j, k] = vand[i, ii] * vand[j, jj] * vand[k, kk] * data[v, ii, jj, kk] threads=false

N = 4; vand = rand(N,N); data3 = randn(N,N,N); data4 = randn(N,N,N,N);
out3 = similar(data3); out4 = similar(data4);

@btime loops!($out3, $vand, $data3); #   136.288 ns [ 1.192 μs without @avx]
@btime loops!($out4, $vand, $data4); # 1.281 μs     [19.139 μs without]

# Matrix multiplication in series instead:
function series!(out, vand, data::Array{<:Any,3}, cache=similar(out))
    @tullio cache[v, ii, j] = vand[j, jj] * data[v, ii, jj] threads=false
    @tullio out[v, i, j] = vand[i, ii] * cache[v, ii, j] threads=false
end
function series!(out, vand, data::Array{<:Any,4}, cache=similar(out))
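    # `out` doubles as scratch for the first contraction, so a single extra
    # buffer (`cache`) suffices for all three contractions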
    @tullio out[v, ii, jj, k] = vand[k, kk] * data[v, ii, jj, kk] threads=false
    @tullio cache[v, ii, j, k] = vand[j, jj] * out[v, ii, jj, k] threads=false
    @tullio out[v, i, j, k] = vand[i, ii] * cache[v, ii, j, k] threads=false
end

@test loops!(similar(out3), vand, data3) ≈ series!(similar(out3), vand, data3)
@test loops!(similar(out4), vand, data4) ≈ series!(similar(out4), vand, data4)

@btime series!($out3, $vand, $data3); # 128.236 ns (1 allocation: 624 bytes)
@btime series!($out4, $vand, $data4); # 593.971 ns (2 allocations: 2.17 KiB)

# and if you provide the buffer:
@btime series!($out3, $vand, $data3, $(similar(out3))); #  79.198 ns
@btime series!($out4, $vand, $data4, $(similar(out4))); # 316.962 ns (0 allocations) [3.506 μs without @avx]

This issue discusses a similar problem with 4 multiplications, in which there was quite a lot of room to do even better. (Edit: reverted to a version with an intermediate cache.)

Thanks a lot, @mcabbott! I know about the possibility of rewriting the code as a sequence of matrix multiplications but didn’t want to clutter this question with another question on optimal implementations of that approach - but you’ve already seen that and provided quite a nice one, thanks! I will look through that issue and try to implement a version using Base.Cartesian.@nexprs. The only difficulty seems to be that my bounds are encoded as type parameters of the given StaticArray, but I hope to circumvent that by using @generated functions. Or is there a better approach?
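
For illustration, splicing a static size into `Base.Cartesian.@nexprs` inside an `@generated` function could look roughly like this (a minimal sketch with a hypothetical `unrolled_sum`, not the actual code):

julia> using StaticArrays

julia> @generated function unrolled_sum(v::SVector{N}) where N
         quote
           acc = zero(eltype(v))
           # N is an ordinary integer here, taken from the type parameter,
           # so @nexprs can unroll the loop at compile time
           Base.Cartesian.@nexprs $N i -> (acc += v[i])
           acc
         end
       end
unrolled_sum (generic function with 1 method)

julia> unrolled_sum(SVector(1.0, 2.0, 3.0))
6.0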

OK, I should have just pasted the link I guess!

Having sizes in the type parameters is surely ideal for @nexprs. But whether this is faster, you will have to try I suppose. On my computer, today:

julia> @btime rotation!($Crot, $Q, $C, $mid)
  455.543 ns (0 allocations: 0 bytes)

julia> @btime rotation_avx!($Crot, $Q, $C, $mid);
  136.706 ns (0 allocations: 0 bytes)

julia> @btime rotation_nexpr!($Crot, $Q, $C);
  180.331 ns (0 allocations: 0 bytes)

julia> QS = @SMatrix rand(3,3);

julia> @btime rotation!($Crot, $QS, $C, $mid);
  377.343 ns (0 allocations: 0 bytes)

julia> @btime rotation_avx!($Crot, $QS, $C, $mid); # LoopVectorization.check_args(QS) == false
  377.142 ns (0 allocations: 0 bytes)

julia> @btime rotation_nexpr!($Crot, $QS, $C);
  167.569 ns (0 allocations: 0 bytes)

Thanks again, @mcabbott! I’ve added a comparison of `Matrix` and `SMatrix` to my first post. Do you have an idea why `@avx` is significantly faster for `Matrix` than for `SMatrix`?

I don’t think @avx does anything for StaticArrays: check_args is how it decides whether to run a fallback version (which is just your loops) instead of its own vectorised code. On your tests this fallback seems to be slower than the plain loops, which is odd.
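
For example:

julia> using LoopVectorization, StaticArrays

julia> A = rand(4, 4);

julia> LoopVectorization.check_args(A)
true

julia> LoopVectorization.check_args(SMatrix{4,4}(A))
false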

Okay, thanks!

I've also tested an implementation of the 3D case based on `Base.Cartesian.@nexprs` - it can be significantly faster, even faster than the sequential implementation with `@avx`.
julia> using BenchmarkTools, LoopVectorization, StaticArrays

julia> function multiply_coordinatewise_squeezed!(
           data_out::AbstractArray{T, 4}, data_in::AbstractArray{T, 4}, vandermonde, n_variables) where T
         n_nodes_out = size(vandermonde, 1)
         n_nodes_in  = size(vandermonde, 2)

         @boundscheck begin
           inbounds = (size(data_out, 1) >= n_variables) &&
                       (size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&
                      (size(data_in, 1) >= n_variables) &&
                      (size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)
           inbounds || throw(BoundsError())
         end

         @inbounds for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out
           for v in 1:n_variables
             acc = zero(eltype(data_out))
             for kk in 1:n_nodes_in, jj in 1:n_nodes_in, ii in 1:n_nodes_in
               acc += vandermonde[i, ii] * vandermonde[j, jj] * vandermonde[k, kk] * data_in[v, ii, jj, kk]
             end
             data_out[v, i, j, k] = acc
           end
         end

         return data_out
       end

julia> function multiply_coordinatewise_squeezed_avx!(
           data_out::AbstractArray{T, 4}, data_in::AbstractArray{T, 4}, vandermonde, n_variables) where T
         n_nodes_out = size(vandermonde, 1)
         n_nodes_in  = size(vandermonde, 2)

         @boundscheck begin
           inbounds = (size(data_out, 1) >= n_variables) &&
                       (size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&
                      (size(data_in, 1) >= n_variables) &&
                      (size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)
           inbounds || throw(BoundsError())
         end

         @avx for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out
           for v in 1:n_variables
             acc = zero(eltype(data_out))
             for kk in 1:n_nodes_in, jj in 1:n_nodes_in, ii in 1:n_nodes_in
               acc += vandermonde[i, ii] * vandermonde[j, jj] * vandermonde[k, kk] * data_in[v, ii, jj, kk]
             end
             data_out[v, i, j, k] = acc
           end
         end

         return data_out
       end

julia> function multiply_coordinatewise_sequential_avx!(
           data_out::AbstractArray{T, 4}, data_in::AbstractArray{T, 4}, vandermonde, n_variables;
           tmp1=zeros(eltype(data_out), n_variables, size(vandermonde, 1), size(vandermonde, 2), size(vandermonde, 2)),
           tmp2=zeros(eltype(data_out), n_variables, size(vandermonde, 1), size(vandermonde, 1), size(vandermonde, 2))) where T
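         # NOTE: the default tmp1/tmp2 buffers above allocate on every call; pass
         # preallocated (zeroed) buffers via the keyword arguments to avoid the
         # allocations seen in the benchmarks below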
         n_nodes_out = size(vandermonde, 1)
         n_nodes_in  = size(vandermonde, 2)
         data_out .= zero(eltype(data_out))

         @boundscheck begin
           inbounds = (size(data_out, 1) >= n_variables) &&
                       (size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&
                      (size(data_in, 1)  >= n_variables) &&
                      (size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)
           inbounds || throw(BoundsError())
         end

         # Interpolate in x-direction
         # tmp1 = zeros(eltype(data_out), n_variables, n_nodes_out, n_nodes_in, n_nodes_in)
         @avx for k in 1:n_nodes_in, j in 1:n_nodes_in, i in 1:n_nodes_out
           for ii in 1:n_nodes_in
             for v in 1:n_variables
               tmp1[v, i, j, k] += vandermonde[i, ii] * data_in[v, ii, j, k]
             end
           end
         end

         # Interpolate in y-direction
         # tmp2 = zeros(eltype(data_out), n_variables, n_nodes_out, n_nodes_out, n_nodes_in)
         @avx for k in 1:n_nodes_in, j in 1:n_nodes_out, i in 1:n_nodes_out
           for jj in 1:n_nodes_in
             for v in 1:n_variables
               tmp2[v, i, j, k] += vandermonde[j, jj] * tmp1[v, i, jj, k]
             end
           end
         end

         # Interpolate in z-direction
         @avx for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out
           for kk in 1:n_nodes_in
             for v in 1:n_variables
               data_out[v, i, j, k] += vandermonde[k, kk] * tmp2[v, i, j, kk]
             end
           end
         end

         return data_out
       end

julia> @generated function multiply_coordinatewise_sequential_nexpr!(
           data_out::AbstractArray{T, 4}, data_in::AbstractArray{T, 4}, vandermonde::SMatrix{n_nodes_out,n_nodes_in}, ::Val{n_variables}) where {T,n_nodes_out,n_nodes_in,n_variables}
         quote
           @boundscheck begin
             inbounds = (size(data_out, 1) >= $n_variables) &&
                       (size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == $n_nodes_out) &&
                       (size(data_in, 1) >= $n_variables) &&
                       (size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == $n_nodes_in)
             inbounds || throw(BoundsError())
           end

           # Interpolate in x-direction
           # tmp1 = zeros(eltype(data_out), n_variables, $n_nodes_out, $n_nodes_in, $n_nodes_in)
           @inbounds Base.Cartesian.@nexprs $n_nodes_in k -> begin
             Base.Cartesian.@nexprs $n_nodes_in j -> begin
               Base.Cartesian.@nexprs $n_nodes_out i -> begin
                 Base.Cartesian.@nexprs $n_variables v -> begin
                   tmp1_v_i_j_k = zero(eltype(data_out))
                   Base.Cartesian.@nexprs $n_nodes_in ii -> begin
                     tmp1_v_i_j_k += vandermonde[i, ii] * data_in[v, ii, j, k]
                   end
                 end
               end
             end
           end

           # Interpolate in y-direction
           # tmp2 = zeros(eltype(data_out), n_variables, $n_nodes_out, $n_nodes_out, $n_nodes_in)
           @inbounds Base.Cartesian.@nexprs $n_nodes_in k -> begin
             Base.Cartesian.@nexprs $n_nodes_out j -> begin
               Base.Cartesian.@nexprs $n_nodes_out i -> begin
                 Base.Cartesian.@nexprs $n_variables v -> begin
                   tmp2_v_i_j_k = zero(eltype(data_out))
                   Base.Cartesian.@nexprs $n_nodes_in jj -> begin
                     tmp2_v_i_j_k += vandermonde[j, jj] * tmp1_v_i_jj_k
                   end
                 end
               end
             end
           end

           # Interpolate in z-direction
           @inbounds Base.Cartesian.@nexprs $n_nodes_out k -> begin
             Base.Cartesian.@nexprs $n_nodes_out j -> begin
               Base.Cartesian.@nexprs $n_nodes_out i -> begin
                 Base.Cartesian.@nexprs $n_variables v -> begin
                   tmp3_v_i_j_k = zero(eltype(data_out))
                   Base.Cartesian.@nexprs $n_nodes_in kk -> begin
                     tmp3_v_i_j_k += vandermonde[k, kk] * tmp2_v_i_j_kk
                   end
                   data_out[v, i, j, k] = tmp3_v_i_j_k
                 end
               end
             end
           end

           return data_out
         end
       end

julia> function run_benchmarks_3d(;n_variables=4, n_nodes_in=4, n_nodes_out=n_nodes_in)
         data_in  = randn(n_variables, n_nodes_in,  n_nodes_in,  n_nodes_in)
         data_out = randn(n_variables, n_nodes_out, n_nodes_out, n_nodes_out)
         vandermonde_dynamic = randn(n_nodes_out, n_nodes_in)
         vandermonde_static  = SMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)

         println("multiply_coordinatewise_squeezed!, vandermonde_static")
         display(@benchmark multiply_coordinatewise_squeezed!($(data_out), $(data_in), $(vandermonde_static), $(n_variables)))
         println()

         println("multiply_coordinatewise_squeezed!, vandermonde_dynamic")
         display(@benchmark multiply_coordinatewise_squeezed!($(data_out), $(data_in), $(vandermonde_dynamic), $(n_variables)))
         println()

         println("multiply_coordinatewise_squeezed_avx!, vandermonde_static")
         display(@benchmark multiply_coordinatewise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_static), $(n_variables)))
         println()

         println("multiply_coordinatewise_squeezed_avx!, vandermonde_dynamic")
         display(@benchmark multiply_coordinatewise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_dynamic), $(n_variables)))
         println()

         println("multiply_coordinatewise_sequential_avx!, vandermonde_static")
         display(@benchmark multiply_coordinatewise_sequential_avx!($(data_out), $(data_in), $(vandermonde_static), $(n_variables)))
         println()

         println("multiply_coordinatewise_sequential_avx!, vandermonde_dynamic")
         display(@benchmark multiply_coordinatewise_sequential_avx!($(data_out), $(data_in), $(vandermonde_dynamic), $(n_variables)))
         println()

         println("multiply_coordinatewise_sequential_nexpr!, vandermonde_static")
         display(@benchmark multiply_coordinatewise_sequential_nexpr!($(data_out), $(data_in), $(vandermonde_static), $(Val(n_variables))))
         println()

         nothing
       end

julia> run_benchmarks_3d()
multiply_coordinatewise_squeezed!, vandermonde_static
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     10.938 μs (0.00% GC)
  median time:      11.176 μs (0.00% GC)
  mean time:        11.980 μs (0.00% GC)
  maximum time:     179.530 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

multiply_coordinatewise_squeezed!, vandermonde_dynamic
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     12.776 μs (0.00% GC)
  median time:      12.793 μs (0.00% GC)
  mean time:        12.960 μs (0.00% GC)
  maximum time:     43.239 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

multiply_coordinatewise_squeezed_avx!, vandermonde_static
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     15.056 μs (0.00% GC)
  median time:      15.419 μs (0.00% GC)
  mean time:        16.094 μs (0.00% GC)
  maximum time:     51.966 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

multiply_coordinatewise_squeezed_avx!, vandermonde_dynamic
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     952.609 ns (0.00% GC)
  median time:      995.391 ns (0.00% GC)
  mean time:        1.057 μs (0.00% GC)
  maximum time:     3.710 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     23

multiply_coordinatewise_sequential_avx!, vandermonde_static
BenchmarkTools.Trial:
  memory estimate:  4.34 KiB
  allocs estimate:  4
  --------------
  minimum time:     3.023 μs (0.00% GC)
  median time:      3.180 μs (0.00% GC)
  mean time:        3.447 μs (4.28% GC)
  maximum time:     303.146 μs (86.07% GC)
  --------------
  samples:          10000
  evals/sample:     8

multiply_coordinatewise_sequential_avx!, vandermonde_dynamic
BenchmarkTools.Trial:
  memory estimate:  4.34 KiB
  allocs estimate:  4
  --------------
  minimum time:     746.430 ns (0.00% GC)
  median time:      823.140 ns (0.00% GC)
  mean time:        1.124 μs (17.40% GC)
  maximum time:     41.617 μs (83.04% GC)
  --------------
  samples:          10000
  evals/sample:     107

multiply_coordinatewise_sequential_nexpr!, vandermonde_static
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     363.212 ns (0.00% GC)
  median time:      364.120 ns (0.00% GC)
  mean time:        368.505 ns (0.00% GC)
  maximum time:     772.255 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     208

This is still slower for me, curiously:

vandS = SMatrix{N,N}(vand);
multiply_coordinatewise_sequential_nexpr!(out4, data4, vandS, Val(4));
@test out4 ≈ series!(similar(out4), vand, data4)

@btime multiply_coordinatewise_sequential_nexpr!($out4, $data4, $vandS, Val(4)); # 489.523 ns (0 allocations)

Compared to 316.962 ns above (or 366.126 ns with your multiply_coordinatewise_sequential_avx!). But it does avoid needing a buffer. And it surely depends on the details of your computer, and of course on the size of the matrices.

What happens if you interpolate the Val(4)? Interpolating it should hand BenchmarkTools a pre-constructed constant rather than re-building the Val inside the timed expression. I.e. if you use

@btime multiply_coordinatewise_sequential_nexpr!($out4, $data4, $vandS, $(Val(4)));

instead of

@btime multiply_coordinatewise_sequential_nexpr!($out4, $data4, $vandS, Val(4));