I know I’m a broken record on LoopVectorization.jl, but I added it to your script. Results:

```
'Normal' arrays
f = function(...norm
646.984 ms (12625004 allocations: 1.31 GiB)
function gauss(... norm
641.349 ms (12500002 allocations: 1.30 GiB)
f = function(...dist
145.499 ms (125004 allocations: 2.86 MiB)
function gauss(...dist
141.914 ms (2 allocations: 976.64 KiB)
Loop Vectorization
18.552 ms (2 allocations: 976.64 KiB)
Static arrays
f = function(...norm
138.706 ms (250004 allocations: 6.68 MiB)
function gauss(... norm
135.619 ms (2 allocations: 976.64 KiB)
f = function(...dist
136.244 ms (250004 allocations: 6.68 MiB)
function gauss(...dist
134.148 ms (2 allocations: 976.64 KiB)
end
```

Over 7x faster than the next fastest version, which is about inline with what you’d expect for SIMD vs non-SIMD code on a computer with AVX512.

New script:

```
using LinearAlgebra
using BenchmarkTools
using StaticArrays
using LoopVectorization
dist(x,y) = sqrt((x[1]-y[1])^2+(x[2]-y[2])^2+(x[3]-y[3])^2)
old = function(x, c; B :: Float64 = -0.5, r :: Float64 = 1.0)
κ = B / r^2;
val = 0.0;
for i in 1:length(c)
val += exp(κ * ((norm(x - c[i])) - r^2))
end
return val
end
old_dist = function(x, c; B :: Float64 = -0.5, r :: Float64 = 1.0)
κ = B / r^2;
val = 0.0;
for i in 1:length(c)
val += exp(κ * (dist(x,c[i]) - r^2))
end
return val
end
function gaussian_kernel(x, c; B :: Float64 = -0.5, r :: Float64 = 1.0)
κ = B / r^2;
val = 0.0;
for i in 1:length(c)
val += exp(κ * ((norm(x - c[i])) - r^2))
end
return val
end
function gaussian_kernel_dist(x, c; B :: Float64 = -0.5, r :: Float64 = 1.0)
κ = B / r^2;
val = 0.0;
for i in 1:length(c)
val += exp(κ * (dist(x,c[i]) - r^2))
end
return val
end
function gaussian_kernel_loopvec(x, c; B :: Float64 = -0.5, r :: Float64 = 1.0)
κ = B / r^2;
val = 0.0;
@avx for i in axes(c,1)
d² = 0.0
for j ∈ 1:3
d² += abs2(x[j] - c[i,j])
end
val += exp(κ * (sqrt(d²) - r^2))
end
return val
end
grid_size = 50;
x_range = range(-2., 2.; length=grid_size);
y_range = range(-2., 2.; length=grid_size);
z_range = range(-2., 2.; length=grid_size);
println("'Normal' arrays")
grid = [collect(x) for x in Iterators.product(x_range, y_range, z_range)];
atoms = [randn(3) for i in 1:100];
atomsmat = reduce(vcat, adjoint.(atoms));
@assert map(x -> gaussian_kernel_dist(x, atoms; r=0.6), grid) ≈ map(x -> gaussian_kernel_loopvec(x, atomsmat; r=0.6), grid)
println(" f = function(...norm ")
@btime map(x -> old(x, $atoms; r=0.6), $grid);
println(" function gauss(... norm ")
@btime map(x -> gaussian_kernel(x, $atoms; r=0.6), $grid);
println(" f = function(...dist ")
@btime map(x -> old_dist(x, $atoms; r=0.6), $grid);
println(" function gauss(...dist ")
@btime map(x -> gaussian_kernel_dist(x, $atoms; r=0.6), $grid);
println("Loop Vectorization")
@btime map(x -> gaussian_kernel_loopvec(x, $atomsmat; r=0.6), $grid)
println("Static arrays")
atoms = [ @SVector randn(3) for i in 1:100];
grid = [ SVector(collect(x)...) for x in Iterators.product(x_range, y_range, z_range)];
println(" f = function(...norm ")
@btime map(x -> old(x, $atoms; r=0.6), $grid);
println(" function gauss(... norm ")
@btime map(x -> gaussian_kernel(x, $atoms; r=0.6), $grid);
println(" f = function(...dist ")
@btime map(x -> old_dist(x, $atoms; r=0.6), $grid);
println(" function gauss(...dist ")
@btime map(x -> gaussian_kernel_dist(x, $atoms; r=0.6), $grid);
println("end")
```