This looks great! You may want to harness some of @elrod’s work in LoopVectorization:
using StaticKernels, LoopVectorization, Images, OffsetArrays, BenchmarkTools
"""
    filter2davx!(out, A, kern)

2D correlation of `A` with `kern`, accumulated into `out` (which is returned).
For every `J in CartesianIndices(out)` it sums `A[I + J] * kern[I]` over all
`I in CartesianIndices(kern)`, so the caller must size/offset `out` such that
`I + J` is always a valid index of `A` — `@avx` (LoopVectorization) elides
bounds checks, so an undersized `A` is undefined behavior, not an error.
Works with `OffsetArray`s: only relative index arithmetic is used.
"""
function filter2davx!(out::AbstractMatrix, A::AbstractMatrix, kern)
@avx for J in CartesianIndices(out)
# type-stable accumulator matching the output element type
tmp = zero(eltype(out))
for I ∈ CartesianIndices(kern)
tmp += A[I + J] * kern[I]
end
out[J] = tmp
end
out
end
# StaticKernels
# 3x3 Laplacian stencil [0 1 0; 1 -4 1; 0 1 0] expressed as a window
# function over relative indices (-1:1, -1:1).
k = StaticKernels.Kernel{(-1:1,-1:1)}(w -> w[0,-1] + w[-1,0] - 4*w[0,0] + w[1,0] + w[0,1])
a = rand(1024, 1024)
# output shrinks by 2 per dimension: a 3x3 kernel has no valid boundary rows/cols
b = similar(a, (size(a) .- 2))
# LoopVectorization
# same Laplacian, materialized as an array with axes -1:1 × -1:1
kern = convert(AbstractArray, Images.Kernel.Laplacian())
A = copy(a)
# offset the output by (1, 1) so A[I + J] stays in bounds for the 3x3 stencil
B = OffsetArray(similar(A, size(A).-2), 1, 1)
@btime map!($k, $b, $a, inner=true) # 11.687 ms (0 allocations: 0 bytes)
@btime filter2davx!($B, $A, $kern) # 1.848 ms (0 allocations: 0 bytes)
** edit: with @inbounds @inline, your package edges out LoopVectorization:
# NOTE(review): the original edit used `+ 4*w[0,0]`, which disagrees with the
# StaticKernels kernel defined earlier (`- 4*w[0,0]`) and with
# Images.Kernel.Laplacian() — the Laplacian stencil's center coefficient is -4.
# Fixed the sign so both benchmarked code paths compute the same filter
# (the timing comparison is unaffected; the arithmetic cost is identical).
@inline function f(w)
    # 3x3 Laplacian [0 1 0; 1 -4 1; 0 1 0] via relative window indexing;
    # @inbounds is safe because the kernel declares its window extent.
    return @inbounds w[0,-1] + w[-1,0] - 4*w[0,0] + w[1,0] + w[0,1]
end
# rebuild the kernel around the @inline/@inbounds window function
k = StaticKernels.Kernel{(-1:1,-1:1)}(f)
@btime map!($k, $b, $a, inner=true) # 1.492 ms (0 allocations: 0 bytes)
*** edit 2: LoopVectorization is faster for fully-filled kernels:
# 3x3 Gaussian kernel with sigma (1, 1); every entry is nonzero ("fully filled")
kern = Images.Kernel.gaussian((1, 1), (3, 3))
# destructure the kernel (column-major iteration order) into global constants
# so the window function below closes over plain Float64 coefficients
const g11, g21, g31, g12, g22, g32, g13, g23, g33 = kern
# Fully-filled 3x3 Gaussian stencil: weight each window entry by the
# destructured `kern` coefficient and accumulate term by term, in the same
# left-to-right order as a single summed expression (identical float result).
@inline function f(w)
    @inbounds begin
        acc = g11 * w[-1, -1]
        acc += g21 * w[0, -1]
        acc += g31 * w[1, -1]
        acc += g12 * w[-1, 0]
        acc += g22 * w[0, 0]
        acc += g32 * w[1, 0]
        acc += g13 * w[-1, 1]
        acc += g23 * w[0, 1]
        acc += g33 * w[1, 1]
        return acc
    end
end
# rebuild the StaticKernels kernel around the dense Gaussian window function
k = StaticKernels.Kernel{(-1:1,-1:1)}(f)
@btime map!($k, $b, $a, inner=true) # 2.886 ms (0 allocations: 0 bytes)
@btime filter2davx!($B, $A, $kern) # 1.775 ms (0 allocations: 0 bytes)