Hello all!

I’m dusting off a one-function package I made a while ago, but I want to confirm that the benchmarks are honest. There’s some calls to scipy objects, and I would like to know if I’m measuring any sort of unfair conversions in the code.

The code is from StagedFilters.jl, but it’s a single function here:

```
module StagedFilters
"""
Savitzky-Golay filter of window half-width M and degree
N. M is the number of points before and after to interpolate, i.e. the full
width of the window is 2M+1.
"""
abstract type AbstractStagedFilters end
struct SavitzkyGolayFilter{M,N} <: AbstractStagedFilters end
wrapL(i, n) = ifelse(1 ≤ i, i, i + n)
wrapR(i, n) = ifelse(i ≤ n, i, i - n)
"""
smooth!(filter,data, smoothed) -
apply `filter` to `data` writing result to `smoothed`.
Note that feeding `Int`s and not floats as data will result in a performance slowdown.
"""
@generated function smooth!(::Type{SavitzkyGolayFilter{M,N}}, data :: AbstractArray{T}, smoothed :: AbstractArray{S}) where {M,N,T,S}
J = T[(i - M - 1 )^(j - 1) for i = 1:2M + 1, j = 1:N + 1]
e₁ = [one(T); zeros(T,N)]
C = J' \ e₁
pre = :(for i = 1:$M end)
main = :(for i = $(M + 1):n - $M end)
post = :(for i = n - $(M - 1):n end)
for loop in (pre, main, post)
body = loop.args[2].args
idx = loop !== pre ? :(i - $M) : :(wrapL(i - $M, n)) # Manually start the first iteration. See the "false" branch below.
push!(body, :( x = muladd($(C[1]), data[$idx], $(zero(T))))) # Swap `muladd` instead of the additions. Note the index of 1.
for j = reverse(1:M-1) # Because we bumped out the first iteration, we have to reduce the for loop index by one.
idx = loop !== pre ? :(i - $j) : :(wrapL(i - $j, n))
push!(body, :( x = muladd($(C[M + 1 - j]),data[$idx],x))) # muladd
end
push!(body, :( x = muladd($(C[M + 1]), data[i], x))) # muladd
for j = 1:M
idx = loop !== post ? :(i + $j) : :(wrapR(i + $j, n))
push!(body, :( x = muladd($(C[M + 1 + j]), data[$idx], x))) # muladd
end
push!(body, :(smoothed[i] = x))
end
last_expr = quote
n = length(data)
n == length(smoothed) || throw(DimensionMismatch())
@inbounds $pre; @inbounds $main; @inbounds $post
return smoothed
end
return last_expr = Base.remove_linenums!(last_expr)
end;
export SavitzkyGolayFilter, smooth!
end # module
```

The benchmark script is the following:

```
using StagedFilters
using Test
using PyCall, BenchmarkTools;
N = 1_000_000
data = convert.(Float64,collect(range(1,N,length=N)));
smoothed = zeros(eltype(data),length(data)); # <--- wholesome, type stable code.
savgol = pyimport("scipy.signal")."savgol_filter";
x = PyObject(data);
@info "Julia f64x$N"
@btime smooth!(SavitzkyGolayFilter{2,2}, $data, $smoothed);
@info "SciPy f64x$N"
@btime pycall($savgol, PyObject, $x,5,2,mode="wrap");
data = convert.(Float32,collect(range(1,N,length=N)));
smoothed = zeros(eltype(data),length(data)); # <--- wholesome, type stable$
savgol = pyimport("scipy.signal")."savgol_filter";
x = PyObject(data);
@info "Julia f32x$N"
@btime smooth!(SavitzkyGolayFilter{2,2}, $data, $smoothed);
@info "SciPy f32x$N"
@btime pycall($savgol, PyObject, $x,5,2,mode="wrap");
nothing;
```

Which on my computer leads to:

```
[ Info: Julia f64x1000000
1.267 ms (0 allocations: 0 bytes)
[ Info: SciPy f64x1000000
11.432 ms (20 allocations: 1.02 KiB)
[ Info: Julia f32x1000000
491.619 μs (0 allocations: 0 bytes)
[ Info: SciPy f32x1000000
4.984 ms (20 allocations: 1.02 KiB)
```

Which is a ~10x speedup over Scipy on arrays that big… HOWEVER, When I run the same script under `]test StagedFilters`

, I get only a ~2x difference in performance. What gives?