I’m working on developing a solver that requires piecewise-defined polynomial functions to be evaluated very frequently. The easiest way to define such a function would be something like
function piecewise_hardcoded(x)
    if x < -1
        return 0.0
    elseif x < 0
        return 1.0 - x^2
    elseif x < 1
        return x^2 -1.0
    else
        return 0.0
    end
end
This performs very well:
julia> @btime piecewise_hardcoded(x) setup=(x=4*rand()-2)
2.499 ns (0 allocations: 0 bytes)
However, I’d like to be able to take arbitrary derivatives and integrals of such functions as well without rewriting the whole function, so I want to be able to store the functions and breakpoints in a struct like so:
using Polynomials
struct PiecewisePolynomial{P<:AbstractPolynomial, T<:Real} <: Function
    polynomials::Vector{P}
    breakpoints::Vector{T}
end
function evalpiecewise(functions, breakpoints, x)
    for (i, b) in enumerate(breakpoints)
        if b < x
            return functions[i](x)
        end
    end
    return functions[end](x)
end
(p::PiecewisePolynomial)(x) = evalpiecewise(p.polynomials, p.breakpoints, x)
const polynomials = [
    Polynomial([0.0]), Polynomial([1.0, 0.0, -1.0]), Polynomial([-1.0, 0.0, 1.0]), Polynomial([0.0])
]
const breakpoints = [-1.0, 0.0, 1.0]
const mypiecewise = PiecewisePolynomial(polynomials, breakpoints)
This is reasonably fast, but still 5-10 times slower than the hardcoded version
julia> @btime mypiecewise(x) setup=(x=4*rand()-2)
17.734 ns (0 allocations: 0 bytes)
My next idea was to compute a closure for the evaluation and store that in the struct, in the hopes that the compiler could do some constant propagation on evalfunc
struct PiecewisePolynomial_2{F<:Function, P<:AbstractPolynomial, T<:Real} <: Function
    polynomials::Vector{P}
    breakpoints::Vector{T}
    evalfunc::F
    function PiecewisePolynomial_2(polynomials::Vector{P}, breakpoints::Vector{T}) where {P, T}
        # define closure
        evalfunc = let p = polynomials, b = breakpoints
            x -> evalpiecewise(p, b, x)
        end
        return new{typeof(evalfunc), P, T}(polynomials, breakpoints, evalfunc)
    end
end
(p::PiecewisePolynomial_2)(x) = p.evalfunc(x)
const mypiecewise_withclosure = PiecewisePolynomial_2(polynomials, breakpoints)
However, this did not measurably impact performance
julia> @btime mypiecewise_withclosure(x) setup=(x=4*rand()-2)
17.734 ns (0 allocations: 0 bytes)
The last thing I tried was using NTuples instead of vectors so that the number of segments is inferrable from the type:
struct PiecewisePolynomial_Tuples{F<:Function, P<:AbstractPolynomial, T<:Real, M} <: Function
    polynomials::NTuple{N, P} where N
    breakpoints::NTuple{M, T}
    evalfunc::F
        
    function PiecewisePolynomial_Tuples(polynomials::NTuple{N, P}, breakpoints::NTuple{M, T}) where {P, N, T, M}
        @assert N == M + 1
        @assert issorted(breakpoints)
        evalfunc = let p = polynomials, b = breakpoints
           x -> evalpiecewise(polynomials, breakpoints, x)
        end
        return new{typeof(evalfunc), P, T, M}(polynomials, breakpoints, evalfunc) 
    end
end
const poly_tuple = (
    Polynomial([0.0]), Polynomial([1.0, 0.0, -1.0]), Polynomial([-1.0, 0.0, 1.0]), Polynomial([0.0])
)
const breakpoint_tuple = (-1.0, 0.0, 1.0)
const mypiecewise_tuples = PiecewisePolynomial_Tuples(poly_tuple, breakpoint_tuple)
and that got me another 5 ns over the vectors, but it’s still 6 times slower than the hardcoded version.
julia> @btime mypiecewise_tuples(x) setup=(x=4*rand()-2)
12.512 ns (0 allocations: 0 bytes)
Does anyone have any ideas for further speedups?