The naive loop is faster than numpy, as long as you make sure it gets SIMD'ed:
function naive_convol_full!(w, u, v)
    # Keep the longer vector in u so the inner loop is as long as possible.
    if length(u) < length(v)
        return naive_convol_full!(w, v, u)
    end
    n = length(u)
    m = length(v)
    for i = 1:n+m-1
        s = zero(eltype(u))
        # Full convolution: w[i] = sum_j u[j+1] * v[i-j], with j restricted
        # so that both indices stay in bounds.
        @simd for j = max(i-m,0):min(n,i)-1
            s += u[j+1] * v[i-j]
        end
        w[i] = s
    end
    return w
end
@btime naive_convol_full!(D,A,B) setup=(A=rand(10000); B=rand(10000); D=zeros(length(A)+length(B)-1)) evals=1
15.383 ms (0 allocations: 0 bytes)
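As a quick sanity check (not part of the timing above), the result can be compared against DSP.jl's conv; this sketch assumes DSP.jl is installed:

using DSP: conv  # assumes the DSP.jl package is available

A = rand(10_000)
B = rand(10_000)
D = zeros(length(A) + length(B) - 1)
naive_convol_full!(D, A, B)
@assert D ≈ conv(A, B)  # full convolution, length(A)+length(B)-1 points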
And if you use @tturbo from LoopVectorization.jl, it gets even faster:
...
        @tturbo for j = max(i-m,0):min(n,i)-1
            s += u[j+1] * v[i-j]
        end
...
@btime naive_convol_full!(D,A,B) setup=(A=rand(10000); B=rand(10000); D=zeros(length(A)+length(B)-1)) evals=1
9.398 ms (0 allocations: 0 bytes)
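For reference, a complete version of the @tturbo variant might look like the sketch below. It only swaps @simd for @tturbo in the inner loop; the function name naive_convol_full_turbo! is made up here so both versions can coexist, and it assumes LoopVectorization.jl is installed:

using LoopVectorization  # provides @tturbo (threaded + SIMD)

function naive_convol_full_turbo!(w, u, v)
    if length(u) < length(v)
        return naive_convol_full_turbo!(w, v, u)
    end
    n = length(u)
    m = length(v)
    for i = 1:n+m-1
        s = zero(eltype(u))
        # Same loop body as above; @tturbo vectorizes and threads the reduction.
        @tturbo for j = max(i-m,0):min(n,i)-1
            s += u[j+1] * v[i-j]
        end
        w[i] = s
    end
    return w
end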