[ANN]: StaticKernels.jl - Fast Kernel Operations on Arrays

This looks great! You may want to harness some of @elrod’s work in LoopVectorization:

using StaticKernels, LoopVectorization, Images, OffsetArrays, BenchmarkTools

function filter2davx!(out::AbstractMatrix, A::AbstractMatrix, kern)
    @avx for J in CartesianIndices(out)
        tmp = zero(eltype(out))
        for I ∈ CartesianIndices(kern)
            tmp += A[I + J] * kern[I]
        end
        out[J] = tmp
    end
    out
end

# StaticKernels
k = StaticKernels.Kernel{(-1:1,-1:1)}(w -> w[0,-1] + w[-1,0] - 4*w[0,0] + w[1,0] + w[0,1])
a = rand(1024, 1024)
b = similar(a, (size(a) .- 2))

# LoopVectorization
kern = convert(AbstractArray, Images.Kernel.Laplacian())
A = copy(a)
B = OffsetArray(similar(A, size(A).-2), 1, 1)
@btime map!($k, $b, $a, inner=true) # 11.687 ms (0 allocations: 0 bytes)
@btime filter2davx!($B, $A, $kern)  # 1.848 ms (0 allocations: 0 bytes)

** edit: with @inbounds @inline, your package edges out LoopVectorization:

@inline function f(w)
    return @inbounds w[0,-1] + w[-1,0] + 4*w[0,0] + w[1,0] + w[0,1]
end
k = StaticKernels.Kernel{(-1:1,-1:1)}(f)

@btime map!($k, $b, $a, inner=true) #  1.492 ms (0 allocations: 0 bytes)

*** edit 2: LoopVectorization is faster for fully-filled kernels:

kern = Images.Kernel.gaussian((1, 1), (3, 3))

const g11, g21, g31, g12, g22, g32, g13, g23, g33 = kern
@inline function f(w)
    return @inbounds g11*w[-1, -1] + g21*w[0, -1] + g31*w[1, -1] +
                     g12*w[-1,  0] + g22*w[0,  0] + g32*w[1,  0] +
                     g13*w[-1,  1] + g23*w[0,  1] + g33*w[1,  1]
end
k = StaticKernels.Kernel{(-1:1,-1:1)}(f)

@btime map!($k, $b, $a, inner=true) # 2.886 ms (0 allocations: 0 bytes)
@btime filter2davx!($B, $A, $kern)  # 1.775 ms (0 allocations: 0 bytes)
5 Likes