Performance of naive convolution against Python Numpy

The naive loop is faster than NumPy; just make sure the inner loop gets SIMD-vectorized:

"""
    naive_convol_full!(w, u, v)

Compute the full discrete convolution of the vectors `u` and `v` with a direct double
loop, store it in `w[1:length(u)+length(v)-1]`, and return `w`.

All three arguments are indexed starting at 1. If `u` is shorter than `v`, the two are
swapped before the computation so that `u` is always the longer input.

# Arguments
- `w`: output vector; entries `1` through `length(u) + length(v) - 1` are overwritten.
- `u`, `v`: input vectors.

# Examples
```julia
julia> naive_convol_full!(zeros(Int, 4), [1, 2, 3], [4, 5])
4-element Vector{Int64}:
  4
 13
 22
 15
```
"""
function naive_convol_full!(w, u, v)
    # Keep the longer input in `u`.
    if length(u) < length(v)
        return naive_convol_full!(w, v, u)
    end
    n = length(u)
    m = length(v)
    # Accumulate in the promoted element type so `s` keeps a single concrete type even
    # when `u` and `v` have different element types (e.g. Int and Float64).
    T = promote_type(eltype(u), eltype(v))
    for i in 1:(n + m - 1)
        s = zero(T)
        # `j` is a zero-based offset into `u`; the bounds keep `j + 1` inside `1:n`
        # and `i - j` inside `1:m`.
        @simd for j in max(i - m, 0):(min(n, i) - 1)
            s += u[j + 1] * v[i - j]
        end
        w[i] = s
    end
    return w
end

# Benchmark on two random length-10000 vectors built in `setup`; `D` is preallocated
# with length(A) + length(B) - 1 zeros to receive the result.
@btime naive_convol_full!(D,A,B) setup=(A=rand(10000); B=rand(10000); D=zeros(length(A)+length(B)-1)) evals=1
  15.383 ms (0 allocations: 0 bytes)

And if you use `@tturbo` from LoopVectorization.jl, it gets even faster:

    ...
        @tturbo for j = max(i-m,0):min(n,i)-1
            s += u[j+1] * v[i-j]
        end
    ...

# Same benchmark call and inputs (two random length-10000 vectors, preallocated `D`),
# timed with the inner loop using `@tturbo`.
@btime naive_convol_full!(D,A,B) setup=(A=rand(10000); B=rand(10000); D=zeros(length(A)+length(B)-1)) evals=1
  9.398 ms (0 allocations: 0 bytes)
11 Likes