# LoopVectorization.jl: adding `@avx` makes code slower

I am given a multi-dimensional `Array` and have to multiply it by an `SMatrix` along multiple dimensions. Using a straightforward approach in 2 dimensions, adding `@avx` from the great package LoopVectorization.jl increases the performance as desired. However, doing the same in 3 dimensions, `@avx` makes the code slower.

2D case
``````julia> using BenchmarkTools, LoopVectorization, StaticArrays

julia> function multiply_coordinatewise_squeezed!(
data_out::AbstractArray{T, 3}, data_in::AbstractArray{T, 3}, vandermonde, n_variables) where T
n_nodes_out = size(vandermonde, 1)
n_nodes_in  = size(vandermonde, 2)

@boundscheck begin
inbounds = (size(data_out, 1) >= n_variables) &&
(size(data_out, 2) == size(data_out, 3) == n_nodes_out) &&
(size(data_in, 1) >= n_variables) &&
(size(data_in, 2) == size(data_in, 3) == n_nodes_in)
inbounds || throw(BoundsError())
end

@inbounds for j in 1:n_nodes_out, i in 1:n_nodes_out
for v in 1:n_variables
acc = zero(eltype(data_out))
for jj in 1:n_nodes_in, ii in 1:n_nodes_in
acc += vandermonde[i, ii] * vandermonde[j, jj] * data_in[v, ii, jj]
end
data_out[v, i, j] = acc
end
end

return data_out
end

julia> function multiply_coordinatewise_squeezed_avx!(
data_out::AbstractArray{T, 3}, data_in::AbstractArray{T, 3}, vandermonde, n_variables) where T
n_nodes_out = size(vandermonde, 1)
n_nodes_in  = size(vandermonde, 2)

@boundscheck begin
inbounds = (size(data_out, 1) >= n_variables) &&
(size(data_out, 2) == size(data_out, 3) == n_nodes_out) &&
(size(data_in, 1) >= n_variables) &&
(size(data_in, 2) == size(data_in, 3) == n_nodes_in)
inbounds || throw(BoundsError())
end

@avx for j in 1:n_nodes_out, i in 1:n_nodes_out
for v in 1:n_variables
acc = zero(eltype(data_out))
for jj in 1:n_nodes_in, ii in 1:n_nodes_in
acc += vandermonde[i, ii] * vandermonde[j, jj] * data_in[v, ii, jj]
end
data_out[v, i, j] = acc
end
end

return data_out
end

julia> function run_benchmarks_2d(;n_variables=4, n_nodes_in=4, n_nodes_out=n_nodes_in)
data_in  = randn(n_variables, n_nodes_in,  n_nodes_in)
data_out = randn(n_variables, n_nodes_out, n_nodes_out)
vandermonde_dynamic = randn(n_nodes_out, n_nodes_in)
vandermonde_static  = SMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)

println("multiply_coordinatewise_squeezed!, vandermonde_static")
display(@benchmark multiply_coordinatewise_squeezed!(\$(data_out), \$(data_in), \$(vandermonde_static), \$(n_variables)))
println()

println("multiply_coordinatewise_squeezed!, vandermonde_dynamic")
display(@benchmark multiply_coordinatewise_squeezed!(\$(data_out), \$(data_in), \$(vandermonde_dynamic), \$(n_variables)))
println()

println("multiply_coordinatewise_squeezed_avx!, vandermonde_static")
display(@benchmark multiply_coordinatewise_squeezed_avx!(\$(data_out), \$(data_in), \$(vandermonde_static), \$(n_variables)))
println()

println("multiply_coordinatewise_squeezed_avx!, vandermonde_dynamic")
display(@benchmark multiply_coordinatewise_squeezed_avx!(\$(data_out), \$(data_in), \$(vandermonde_dynamic), \$(n_variables)))
println()

nothing
end

julia> run_benchmarks_2d()
multiply_coordinatewise_squeezed!, vandermonde_static
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     421.402 ns (0.00% GC)
median time:      422.090 ns (0.00% GC)
mean time:        425.764 ns (0.00% GC)
maximum time:     956.447 ns (0.00% GC)
--------------
samples:          10000
evals/sample:     199

multiply_coordinatewise_squeezed!, vandermonde_dynamic
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     738.446 ns (0.00% GC)
median time:      807.826 ns (0.00% GC)
mean time:        815.222 ns (0.00% GC)
maximum time:     1.771 μs (0.00% GC)
--------------
samples:          10000
evals/sample:     92

multiply_coordinatewise_squeezed_avx!, vandermonde_static
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     331.277 ns (0.00% GC)
median time:      331.781 ns (0.00% GC)
mean time:        335.048 ns (0.00% GC)
maximum time:     805.723 ns (0.00% GC)
--------------
samples:          10000
evals/sample:     224

multiply_coordinatewise_squeezed_avx!, vandermonde_dynamic
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     98.490 ns (0.00% GC)
median time:      100.468 ns (0.00% GC)
mean time:        101.441 ns (0.00% GC)
maximum time:     213.163 ns (0.00% GC)
--------------
samples:          10000
evals/sample:     946
``````
3D case
``````julia> using BenchmarkTools, LoopVectorization, StaticArrays

julia> function multiply_coordinatewise_squeezed!(
data_out::AbstractArray{T, 4}, data_in::AbstractArray{T, 4}, vandermonde, n_variables) where T
n_nodes_out = size(vandermonde, 1)
n_nodes_in  = size(vandermonde, 2)

@boundscheck begin
inbounds = (size(data_out, 1) >= n_variables) &&
(size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&
(size(data_in, 1) >= n_variables) &&
(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)
inbounds || throw(BoundsError())
end

@inbounds for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out
for v in 1:n_variables
acc = zero(eltype(data_out))
for kk in 1:n_nodes_in, jj in 1:n_nodes_in, ii in 1:n_nodes_in
acc += vandermonde[i, ii] * vandermonde[j, jj] * vandermonde[k, kk] * data_in[v, ii, jj, kk]
end
data_out[v, i, j, k] = acc
end
end

return data_out
end

julia> function multiply_coordinatewise_squeezed_avx!(
data_out::AbstractArray{T, 4}, data_in::AbstractArray{T, 4}, vandermonde, n_variables) where T
n_nodes_out = size(vandermonde, 1)
n_nodes_in  = size(vandermonde, 2)

@boundscheck begin
inbounds = (size(data_out, 1) >= n_variables) &&
(size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&
(size(data_in, 1) >= n_variables) &&
(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)
inbounds || throw(BoundsError())
end

@avx for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out
for v in 1:n_variables
acc = zero(eltype(data_out))
for kk in 1:n_nodes_in, jj in 1:n_nodes_in, ii in 1:n_nodes_in
acc += vandermonde[i, ii] * vandermonde[j, jj] * vandermonde[k, kk] * data_in[v, ii, jj, kk]
end
data_out[v, i, j, k] = acc
end
end

return data_out
end

julia> function run_benchmarks_3d(;n_variables=4, n_nodes_in=4, n_nodes_out=n_nodes_in)
data_in  = randn(n_variables, n_nodes_in,  n_nodes_in,  n_nodes_in)
data_out = randn(n_variables, n_nodes_out, n_nodes_out, n_nodes_out)
vandermonde_dynamic = randn(n_nodes_out, n_nodes_in)
vandermonde_static  = SMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)

println("multiply_coordinatewise_squeezed!, vandermonde_static")
display(@benchmark multiply_coordinatewise_squeezed!(\$(data_out), \$(data_in), \$(vandermonde_static), \$(n_variables)))
println()

println("multiply_coordinatewise_squeezed!, vandermonde_dynamic")
display(@benchmark multiply_coordinatewise_squeezed!(\$(data_out), \$(data_in), \$(vandermonde_dynamic), \$(n_variables)))
println()

println("multiply_coordinatewise_squeezed_avx!, vandermonde_static")
display(@benchmark multiply_coordinatewise_squeezed_avx!(\$(data_out), \$(data_in), \$(vandermonde_static), \$(n_variables)))
println()

println("multiply_coordinatewise_squeezed_avx!, vandermonde_dynamic")
display(@benchmark multiply_coordinatewise_squeezed_avx!(\$(data_out), \$(data_in), \$(vandermonde_dynamic), \$(n_variables)))
println()

nothing
end

julia> run_benchmarks_3d()
multiply_coordinatewise_squeezed!, vandermonde_static
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     10.969 μs (0.00% GC)
median time:      10.972 μs (0.00% GC)
mean time:        11.084 μs (0.00% GC)
maximum time:     28.741 μs (0.00% GC)
--------------
samples:          10000
evals/sample:     1

multiply_coordinatewise_squeezed!, vandermonde_dynamic
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     12.767 μs (0.00% GC)
median time:      12.794 μs (0.00% GC)
mean time:        13.513 μs (0.00% GC)
maximum time:     61.101 μs (0.00% GC)
--------------
samples:          10000
evals/sample:     1

multiply_coordinatewise_squeezed_avx!, vandermonde_static
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     15.073 μs (0.00% GC)
median time:      15.763 μs (0.00% GC)
mean time:        16.819 μs (0.00% GC)
maximum time:     68.479 μs (0.00% GC)
--------------
samples:          10000
evals/sample:     1

multiply_coordinatewise_squeezed_avx!, vandermonde_dynamic
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     955.130 ns (0.00% GC)
median time:      957.609 ns (0.00% GC)
mean time:        969.877 ns (0.00% GC)
maximum time:     2.853 μs (0.00% GC)
--------------
samples:          10000
evals/sample:     23
``````
1. Did I do anything wrong in the 3D case with `vandermonde::SMatrix`?
2. Is there an explanation why `@avx` is significantly faster for `vandermonde::Matrix` than for `vandermonde::SMatrix`?

Edit: Compared `vandermonde::Matrix` and `vandermonde::SMatrix` after reading @mcabbott’s comment below

I don’t see obvious mistakes, although I’m not sure staticarrays help here. But there’s an algorithmic opportunity: it’s much more efficient to do repeated matrix multiplication in series, than to write one big set of loops. Here’s my quick comparison:

``````using LoopVectorization, Tullio, BenchmarkTools, Test

# Functions as in the question, more compactly:
loops!(out, vand, data::Array{<:Any,3}) = @tullio out[v, i, j] = vand[i, ii] * vand[j, jj] * data[v, ii, jj] threads=false
loops!(out, vand, data::Array{<:Any,4}) = @tullio out[v, i, j, k] = vand[i, ii] * vand[j, jj] * vand[k, kk] * data[v, ii, jj, kk] threads=false

N = 4; vand = rand(N,N); data3 = randn(N,N,N); data4 = randn(N,N,N,N);
out3 = similar(data3); out4 = similar(data4);

@btime loops!(\$out3, \$vand, \$data3); #   136.288 ns [ 1.192 μs without @avx]
@btime loops!(\$out4, \$vand, \$data4); # 1.281 μs     [19.139 μs without]

# Matrix multiplication in series instead:
function series!(out, vand, data::Array{<:Any,3}, cache=similar(out))
@tullio cache[v, ii, j] = vand[j, jj] * data[v, ii, jj] threads=false
@tullio out[v, i, j] = vand[i, ii] * cache[v, ii, j] threads=false
end
function series!(out, vand, data::Array{<:Any,4}, cache=similar(out))
@tullio out[v, ii, jj, k] = vand[k, kk] * data[v, ii, jj, kk] threads=false
@tullio cache[v, ii, j, k] = vand[j, jj] * out[v, ii, jj, k] threads=false
@tullio out[v, i, j, k] = vand[i, ii] * cache[v, ii, j, k] threads=false
end

@test loops!(similar(out3), vand, data3) ≈ series!(similar(out3), vand, data3)
@test loops!(similar(out4), vand, data4) ≈ series!(similar(out4), vand, data4)

@btime series!(\$out3, \$vand, \$data3); # 128.236 ns (1 allocation: 624 bytes)
@btime series!(\$out4, \$vand, \$data4); # 593.971 ns (2 allocations: 2.17 KiB)

# and if you provide the buffer:
@btime series!(\$out3, \$vand, \$data3, \$(similar(out3))); #  79.198 ns
@btime series!(\$out4, \$vand, \$data4, \$(similar(out4))); # 316.962 ns (0 allocations) [3.506 μs without @avx]
``````

This issue has discussion of a similar problem with 4 multiplications, in which there was quite a lot of room to do even better. (Edit – reverted to a version with intermediate `cache`.)

Thanks a lot, @mcabbott! I know about the possibility to re-write the code using multiple matrix multiplications sequentially but didn’t want to clutter this question with another question on optimal implementations of that approach - but you’ve already seen that and provided a quite nice one, thanks! I will look through that issue and try to implement a version using `Base.Cartesian.@nexprs`. The only difficulty seems to be that my bounds are encoded as type parameters of the given `StaticArray`, but I hope to circumvent that by using `@generated` functions. Or is there a better approach?

OK, I should have just pasted the link I guess!

Having sizes in the type parameters is surely ideal for `@nexprs`. But whether this is faster, you will have to try I suppose. On my computer, today:

``````julia> @btime rotation!(\$Crot, \$Q, \$C, \$mid)
455.543 ns (0 allocations: 0 bytes)

julia> @btime rotation_avx!(\$Crot, \$Q, \$C, \$mid);
136.706 ns (0 allocations: 0 bytes)

julia> @btime rotation_nexpr!(\$Crot, \$Q, \$C);
180.331 ns (0 allocations: 0 bytes)

julia> QS = @SMatrix rand(3,3);

julia> @btime rotation!(\$Crot, \$QS, \$C, \$mid);
377.343 ns (0 allocations: 0 bytes)

julia> @btime rotation_avx!(\$Crot, \$QS, \$C, \$mid); # LoopVectorization.check_args(QS) == false
377.142 ns (0 allocations: 0 bytes)

julia> @btime rotation_nexpr!(\$Crot, \$QS, \$C);
167.569 ns (0 allocations: 0 bytes)
``````

Thanks again, @mcabbott! I’ve added a comparison of `Matrix` and `SMatrix` to my first post. Do you have an idea why `@avx` is significantly faster for `Matrix` than for `SMatrix`?

I don’t think `@avx` does anything for StaticArrays – `check_args` is how it decides whether to run a fallback version, which is just loops, instead of its own vectorised things. On your tests this seems to be slower than your loops, which is odd.

Okay, thanks!

I've also tested an implementation for the 3D case based on `Base.Cartesian.@nexprs` - that one can be significantly faster, even faster than `@avx` for a sequential implementation.
``````julia> using BenchmarkTools, LoopVectorization, StaticArrays

julia> function multiply_coordinatewise_squeezed!(
data_out::AbstractArray{T, 4}, data_in::AbstractArray{T, 4}, vandermonde, n_variables) where T
n_nodes_out = size(vandermonde, 1)
n_nodes_in  = size(vandermonde, 2)

@boundscheck begin
inbounds = (size(data_out, 1) >= n_variables) &&
(size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&
(size(data_in, 1) >= n_variables) &&
(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)
inbounds || throw(BoundsError())
end

@inbounds for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out
for v in 1:n_variables
acc = zero(eltype(data_out))
for kk in 1:n_nodes_in, jj in 1:n_nodes_in, ii in 1:n_nodes_in
acc += vandermonde[i, ii] * vandermonde[j, jj] * vandermonde[k, kk] * data_in[v, ii, jj, kk]
end
data_out[v, i, j, k] = acc
end
end

return data_out
end

julia> function multiply_coordinatewise_squeezed_avx!(
data_out::AbstractArray{T, 4}, data_in::AbstractArray{T, 4}, vandermonde, n_variables) where T
n_nodes_out = size(vandermonde, 1)
n_nodes_in  = size(vandermonde, 2)

@boundscheck begin
inbounds = (size(data_out, 1) >= n_variables) &&
(size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&
(size(data_in, 1) >= n_variables) &&
(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)
inbounds || throw(BoundsError())
end

@avx for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out
for v in 1:n_variables
acc = zero(eltype(data_out))
for kk in 1:n_nodes_in, jj in 1:n_nodes_in, ii in 1:n_nodes_in
acc += vandermonde[i, ii] * vandermonde[j, jj] * vandermonde[k, kk] * data_in[v, ii, jj, kk]
end
data_out[v, i, j, k] = acc
end
end

return data_out
end

julia> function multiply_coordinatewise_sequential_avx!(
data_out::AbstractArray{T, 4}, data_in::AbstractArray{T, 4}, vandermonde, n_variables;
tmp1=zeros(eltype(data_out), n_variables, size(vandermonde, 1), size(vandermonde, 2), size(vandermonde, 2)),
tmp2=zeros(eltype(data_out), n_variables, size(vandermonde, 1), size(vandermonde, 1), size(vandermonde, 2))) where T
n_nodes_out = size(vandermonde, 1)
n_nodes_in  = size(vandermonde, 2)
data_out .= zero(eltype(data_out))

@boundscheck begin
inbounds = (size(data_out, 1) >= n_variables) &&
(size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&
(size(data_in, 1)  >= n_variables) &&
(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)
inbounds || throw(BoundsError())
end

# Interpolate in x-direction
# tmp1 = zeros(eltype(data_out), n_variables, n_nodes_out, n_nodes_in, n_nodes_in)
@avx for k in 1:n_nodes_in, j in 1:n_nodes_in, i in 1:n_nodes_out
for ii in 1:n_nodes_in
for v in 1:n_variables
tmp1[v, i, j, k] += vandermonde[i, ii] * data_in[v, ii, j, k]
end
end
end

# Interpolate in y-direction
# tmp2 = zeros(eltype(data_out), n_variables, n_nodes_out, n_nodes_out, n_nodes_in)
@avx for k in 1:n_nodes_in, j in 1:n_nodes_out, i in 1:n_nodes_out
for jj in 1:n_nodes_in
for v in 1:n_variables
tmp2[v, i, j, k] += vandermonde[j, jj] * tmp1[v, i, jj, k]
end
end
end

# Interpolate in z-direction
@avx for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out
for kk in 1:n_nodes_in
for v in 1:n_variables
data_out[v, i, j, k] += vandermonde[k, kk] * tmp2[v, i, j, kk]
end
end
end

return data_out
end

julia> @generated function multiply_coordinatewise_sequential_nexpr!(
data_out::AbstractArray{T, 4}, data_in::AbstractArray{T, 4}, vandermonde::SMatrix{n_nodes_out,n_nodes_in}, ::Val{n_variables}) where {T,n_nodes_out,n_nodes_in,n_variables}
quote
@boundscheck begin
inbounds = (size(data_out, 1) >= \$n_variables) &&
(size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == \$n_nodes_out) &&
(size(data_in, 1) >= \$n_variables) &&
(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == \$n_nodes_in)
inbounds || throw(BoundsError())
end

# Interpolate in x-direction
# tmp1 = zeros(eltype(data_out), n_variables, \$n_nodes_out, \$n_nodes_in, \$n_nodes_in)
@inbounds Base.Cartesian.@nexprs \$n_nodes_in k -> begin
Base.Cartesian.@nexprs \$n_nodes_in j -> begin
Base.Cartesian.@nexprs \$n_nodes_out i -> begin
Base.Cartesian.@nexprs \$n_variables v -> begin
tmp1_v_i_j_k = zero(eltype(data_out))
Base.Cartesian.@nexprs \$n_nodes_in ii -> begin
tmp1_v_i_j_k += vandermonde[i, ii] * data_in[v, ii, j, k]
end
end
end
end
end

# Interpolate in y-direction
# tmp2 = zeros(eltype(data_out), n_variables, \$n_nodes_out, \$n_nodes_out, \$n_nodes_in)
@inbounds Base.Cartesian.@nexprs \$n_nodes_in k -> begin
Base.Cartesian.@nexprs \$n_nodes_out j -> begin
Base.Cartesian.@nexprs \$n_nodes_out i -> begin
Base.Cartesian.@nexprs \$n_variables v -> begin
tmp2_v_i_j_k = zero(eltype(data_out))
Base.Cartesian.@nexprs \$n_nodes_in jj -> begin
tmp2_v_i_j_k += vandermonde[j, jj] * tmp1_v_i_jj_k
end
end
end
end
end

# Interpolate in z-direction
@inbounds Base.Cartesian.@nexprs \$n_nodes_out k -> begin
Base.Cartesian.@nexprs \$n_nodes_out j -> begin
Base.Cartesian.@nexprs \$n_nodes_out i -> begin
Base.Cartesian.@nexprs \$n_variables v -> begin
tmp3_v_i_j_k = zero(eltype(data_out))
Base.Cartesian.@nexprs \$n_nodes_in kk -> begin
tmp3_v_i_j_k += vandermonde[k, kk] * tmp2_v_i_j_kk
end
data_out[v, i, j, k] = tmp3_v_i_j_k
end
end
end
end

return data_out
end
end

julia> function run_benchmarks_3d(;n_variables=4, n_nodes_in=4, n_nodes_out=n_nodes_in)
data_in  = randn(n_variables, n_nodes_in,  n_nodes_in,  n_nodes_in)
data_out = randn(n_variables, n_nodes_out, n_nodes_out, n_nodes_out)
vandermonde_dynamic = randn(n_nodes_out, n_nodes_in)
vandermonde_static  = SMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)

println("multiply_coordinatewise_squeezed!, vandermonde_static")
display(@benchmark multiply_coordinatewise_squeezed!(\$(data_out), \$(data_in), \$(vandermonde_static), \$(n_variables)))
println()

println("multiply_coordinatewise_squeezed!, vandermonde_dynamic")
display(@benchmark multiply_coordinatewise_squeezed!(\$(data_out), \$(data_in), \$(vandermonde_dynamic), \$(n_variables)))
println()

println("multiply_coordinatewise_squeezed_avx!, vandermonde_static")
display(@benchmark multiply_coordinatewise_squeezed_avx!(\$(data_out), \$(data_in), \$(vandermonde_static), \$(n_variables)))
println()

println("multiply_coordinatewise_squeezed_avx!, vandermonde_dynamic")
display(@benchmark multiply_coordinatewise_squeezed_avx!(\$(data_out), \$(data_in), \$(vandermonde_dynamic), \$(n_variables)))
println()

println("multiply_coordinatewise_sequential_avx!, vandermonde_static")
display(@benchmark multiply_coordinatewise_sequential_avx!(\$(data_out), \$(data_in), \$(vandermonde_static), \$(n_variables)))
println()

println("multiply_coordinatewise_sequential_avx!, vandermonde_dynamic")
display(@benchmark multiply_coordinatewise_sequential_avx!(\$(data_out), \$(data_in), \$(vandermonde_dynamic), \$(n_variables)))
println()

println("multiply_coordinatewise_sequential_nexpr!, vandermonde_static")
display(@benchmark multiply_coordinatewise_sequential_nexpr!(\$(data_out), \$(data_in), \$(vandermonde_static), \$(Val(n_variables))))
println()

nothing
end

julia> run_benchmarks_3d()
multiply_coordinatewise_squeezed!, vandermonde_static
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     10.938 μs (0.00% GC)
median time:      11.176 μs (0.00% GC)
mean time:        11.980 μs (0.00% GC)
maximum time:     179.530 μs (0.00% GC)
--------------
samples:          10000
evals/sample:     1

multiply_coordinatewise_squeezed!, vandermonde_dynamic
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     12.776 μs (0.00% GC)
median time:      12.793 μs (0.00% GC)
mean time:        12.960 μs (0.00% GC)
maximum time:     43.239 μs (0.00% GC)
--------------
samples:          10000
evals/sample:     1

multiply_coordinatewise_squeezed_avx!, vandermonde_static
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     15.056 μs (0.00% GC)
median time:      15.419 μs (0.00% GC)
mean time:        16.094 μs (0.00% GC)
maximum time:     51.966 μs (0.00% GC)
--------------
samples:          10000
evals/sample:     1

multiply_coordinatewise_squeezed_avx!, vandermonde_dynamic
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     952.609 ns (0.00% GC)
median time:      995.391 ns (0.00% GC)
mean time:        1.057 μs (0.00% GC)
maximum time:     3.710 μs (0.00% GC)
--------------
samples:          10000
evals/sample:     23

multiply_coordinatewise_sequential_avx!, vandermonde_static
BenchmarkTools.Trial:
memory estimate:  4.34 KiB
allocs estimate:  4
--------------
minimum time:     3.023 μs (0.00% GC)
median time:      3.180 μs (0.00% GC)
mean time:        3.447 μs (4.28% GC)
maximum time:     303.146 μs (86.07% GC)
--------------
samples:          10000
evals/sample:     8

multiply_coordinatewise_sequential_avx!, vandermonde_dynamic
BenchmarkTools.Trial:
memory estimate:  4.34 KiB
allocs estimate:  4
--------------
minimum time:     746.430 ns (0.00% GC)
median time:      823.140 ns (0.00% GC)
mean time:        1.124 μs (17.40% GC)
maximum time:     41.617 μs (83.04% GC)
--------------
samples:          10000
evals/sample:     107

multiply_coordinatewise_sequential_nexpr!, vandermonde_static
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     363.212 ns (0.00% GC)
median time:      364.120 ns (0.00% GC)
mean time:        368.505 ns (0.00% GC)
maximum time:     772.255 ns (0.00% GC)
--------------
samples:          10000
evals/sample:     208
``````

This is still slower for me, curiously:

``````vandS = SMatrix{N,N}(vand);
multiply_coordinatewise_sequential_nexpr!(out4, data4, vandS, Val(4));
@test out4 ≈ series!(similar(out4), vand, data4)

@btime multiply_coordinatewise_sequential_nexpr!(\$out4, \$data4, \$vandS, Val(4)); # 489.523 ns (0 allocations)
``````

Compared to 316.962 ns above (or 366.126 ns with your `multiply_coordinatewise_sequential_avx!`). But does avoid needing a buffer. And surely depends on details of your computer, and also on the size of the matrices of course.

What happens if you interpolate the `Val(4)`? I.e. if you use

``````@btime multiply_coordinatewise_sequential_nexpr!(\$out4, \$data4, \$vandS, \$(Val(4)));
``````

``````@btime multiply_coordinatewise_sequential_nexpr!(\$out4, \$data4, \$vandS, Val(4));