I am given a multi-dimensional Array
and have to multiply it by an SMatrix
along multiple dimensions. Using a straightforward approach in 2 dimensions, adding @avx
from the great package LoopVectorization.jl increases the performance as desired. However, doing the same in 3 dimensions, @avx
makes the code slower.
2D case
julia> using BenchmarkTools, LoopVectorization, StaticArrays
julia> function multiply_coordinatewise_squeezed!(
data_out::AbstractArray{T, 3}, data_in::AbstractArray{T, 3}, vandermonde, n_variables) where T
n_nodes_out = size(vandermonde, 1)
n_nodes_in = size(vandermonde, 2)
@boundscheck begin
inbounds = (size(data_out, 1) >= n_variables) &&
(size(data_out, 2) == size(data_out, 3) == n_nodes_out) &&
(size(data_in, 1) >= n_variables) &&
(size(data_in, 2) == size(data_in, 3) == n_nodes_in)
inbounds || throw(BoundsError())
end
@inbounds for j in 1:n_nodes_out, i in 1:n_nodes_out
for v in 1:n_variables
acc = zero(eltype(data_out))
for jj in 1:n_nodes_in, ii in 1:n_nodes_in
acc += vandermonde[i, ii]* vandermonde[j, jj] * data_in[v, ii, jj]
end
data_out[v, i, j] = acc
end
end
return data_out
end
julia> function multiply_coordinatewise_squeezed_avx!(
data_out::AbstractArray{T, 3}, data_in::AbstractArray{T, 3}, vandermonde, n_variables) where T
n_nodes_out = size(vandermonde, 1)
n_nodes_in = size(vandermonde, 2)
@boundscheck begin
inbounds = (size(data_out, 1) >= n_variables) &&
(size(data_out, 2) == size(data_out, 3) == n_nodes_out) &&
(size(data_in, 1) >= n_variables) &&
(size(data_in, 2) == size(data_in, 3) == n_nodes_in)
inbounds || throw(BoundsError())
end
@avx for j in 1:n_nodes_out, i in 1:n_nodes_out
for v in 1:n_variables
acc = zero(eltype(data_out))
for jj in 1:n_nodes_in, ii in 1:n_nodes_in
acc += vandermonde[i, ii] * vandermonde[j, jj] * data_in[v, ii, jj]
end
data_out[v, i, j] = acc
end
end
return data_out
end
julia> function run_benchmarks_2d(;n_variables=4, n_nodes_in=4, n_nodes_out=n_nodes_in)
data_in = randn(n_variables, n_nodes_in, n_nodes_in)
data_out = randn(n_variables, n_nodes_out, n_nodes_out)
vandermonde_dynamic = randn(n_nodes_out, n_nodes_in)
vandermonde_static = SMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)
println("multiply_coordinatewise_squeezed!, vandermonde_static")
display(@benchmark multiply_coordinatewise_squeezed!($(data_out), $(data_in), $(vandermonde_static), $(n_variables)))
println()
println("multiply_coordinatewise_squeezed!, vandermonde_dynamic")
display(@benchmark multiply_coordinatewise_squeezed!($(data_out), $(data_in), $(vandermonde_dynamic), $(n_variables)))
println()
println("multiply_coordinatewise_squeezed_avx!, vandermonde_static")
display(@benchmark multiply_coordinatewise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_static), $(n_variables)))
println()
println("multiply_coordinatewise_squeezed_avx!, vandermonde_dynamic")
display(@benchmark multiply_coordinatewise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_dynamic), $(n_variables)))
println()
nothing
end
julia> run_benchmarks_2d()
multiply_coordinatewise_squeezed!, vandermonde_static
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 421.402 ns (0.00% GC)
median time: 422.090 ns (0.00% GC)
mean time: 425.764 ns (0.00% GC)
maximum time: 956.447 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 199
multiply_coordinatewise_squeezed!, vandermonde_dynamic
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 738.446 ns (0.00% GC)
median time: 807.826 ns (0.00% GC)
mean time: 815.222 ns (0.00% GC)
maximum time: 1.771 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 92
multiply_coordinatewise_squeezed_avx!, vandermonde_static
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 331.277 ns (0.00% GC)
median time: 331.781 ns (0.00% GC)
mean time: 335.048 ns (0.00% GC)
maximum time: 805.723 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 224
multiply_coordinatewise_squeezed_avx!, vandermonde_dynamic
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 98.490 ns (0.00% GC)
median time: 100.468 ns (0.00% GC)
mean time: 101.441 ns (0.00% GC)
maximum time: 213.163 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 946
3D case
julia> using BenchmarkTools, LoopVectorization, StaticArrays
julia> function multiply_coordinatewise_squeezed!(
data_out::AbstractArray{T, 4}, data_in::AbstractArray{T, 4}, vandermonde, n_variables) where T
n_nodes_out = size(vandermonde, 1)
n_nodes_in = size(vandermonde, 2)
@boundscheck begin
inbounds = (size(data_out, 1) >= n_variables) &&
(size(data_out, 2) == size(data_out, 3) == size(data_out, 3) == n_nodes_out) &&
(size(data_in, 1) >= n_variables) &&
(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)
inbounds || throw(BoundsError())
end
@inbounds for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out
for v in 1:n_variables
acc = zero(eltype(data_out))
for kk in 1:n_nodes_in, jj in 1:n_nodes_in, ii in 1:n_nodes_in
acc += vandermonde[i, ii] * vandermonde[j, jj] * vandermonde[k, kk] * data_in[v, ii, jj, kk]
end
data_out[v, i, j, k] = acc
end
end
return data_out
end
julia> function multiply_coordinatewise_squeezed_avx!(
data_out::AbstractArray{T, 4}, data_in::AbstractArray{T, 4}, vandermonde, n_variables) where T
n_nodes_out = size(vandermonde, 1)
n_nodes_in = size(vandermonde, 2)
@boundscheck begin
inbounds = (size(data_out, 1) >= n_variables) &&
(size(data_out, 2) == size(data_out, 3) == size(data_out, 3) == n_nodes_out) &&
(size(data_in, 1) >= n_variables) &&
(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)
inbounds || throw(BoundsError())
end
@avx for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out
for v in 1:n_variables
acc = zero(eltype(data_out))
for kk in 1:n_nodes_in, jj in 1:n_nodes_in, ii in 1:n_nodes_in
acc += vandermonde[i, ii] * vandermonde[j, jj] * vandermonde[k, kk] * data_in[v, ii, jj, kk]
end
data_out[v, i, j, k] = acc
end
end
return data_out
end
julia> function run_benchmarks_3d(;n_variables=4, n_nodes_in=4, n_nodes_out=n_nodes_in)
data_in = randn(n_variables, n_nodes_in, n_nodes_in, n_nodes_in)
data_out = randn(n_variables, n_nodes_out, n_nodes_out, n_nodes_out)
vandermonde_dynamic = randn(n_nodes_out, n_nodes_in)
vandermonde_static = SMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)
println("multiply_coordinatewise_squeezed!, vandermonde_static")
display(@benchmark multiply_coordinatewise_squeezed!($(data_out), $(data_in), $(vandermonde_static), $(n_variables)))
println()
println("multiply_coordinatewise_squeezed!, vandermonde_dynamic")
display(@benchmark multiply_coordinatewise_squeezed!($(data_out), $(data_in), $(vandermonde_dynamic), $(n_variables)))
println()
println("multiply_coordinatewise_squeezed_avx!, vandermonde_static")
display(@benchmark multiply_coordinatewise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_static), $(n_variables)))
println()
println("multiply_coordinatewise_squeezed_avx!, vandermonde_dynamic")
display(@benchmark multiply_coordinatewise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_dynamic), $(n_variables)))
println()
nothing
end
julia> run_benchmarks_3d()
multiply_coordinatewise_squeezed!, vandermonde_static
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 10.969 μs (0.00% GC)
median time: 10.972 μs (0.00% GC)
mean time: 11.084 μs (0.00% GC)
maximum time: 28.741 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
multiply_coordinatewise_squeezed!, vandermonde_dynamic
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 12.767 μs (0.00% GC)
median time: 12.794 μs (0.00% GC)
mean time: 13.513 μs (0.00% GC)
maximum time: 61.101 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
multiply_coordinatewise_squeezed_avx!, vandermonde_static
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 15.073 μs (0.00% GC)
median time: 15.763 μs (0.00% GC)
mean time: 16.819 μs (0.00% GC)
maximum time: 68.479 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
multiply_coordinatewise_squeezed_avx!, vandermonde_dynamic
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 955.130 ns (0.00% GC)
median time: 957.609 ns (0.00% GC)
mean time: 969.877 ns (0.00% GC)
maximum time: 2.853 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 23
- Did I do anything wrong in the 3D case with
vandermonde::SMatrix
? - Is there an explanation why
@avx
is significantly faster forvandermonde::Matrix
than forvandermonde::SMatrix
?
Edit: Compared vandermonde::Matrix
and vandermonde::SMatrix
after reading @mcabbot’s comment below