More accurate evalpoly

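The more accurate version (`exthorner`, defined below) is less than 2x slower than `evalpoly` for me with a degree 6 test polynomial (7 coefficients): roughly 63 ns versus 38 ns minimum time over 256 inputs.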

julia> function evalpolyloop!(f, y, x)
           # Benchmark driver: for every index shared by `y` and `x`, store
           # `f(x[i], coeffs)` into `y[i]`. Mutates `y` and returns `nothing`.
           #
           # The 7 coefficients of the degree-6 test polynomial handed to `f`.
           coeffs = (
               0.6666666666667333541, 0.3999999999635251990, 0.2857142932794299317,
               0.2222214519839380009, 0.1818605932937785996, 0.1525629051003428716,
               0.1532076988502701353,
           )
           # `eachindex(y, x)` only yields indices valid for both arrays, which is
           # what makes skipping the bounds checks acceptable here.
           @inbounds for k in eachindex(y, x)
               y[k] = f(x[k], coeffs)
           end
           return nothing
       end
evalpolyloop! (generic function with 1 method)

julia> x = rand(256) .+ 0.5; y = similar(x);

julia> @inline function exthorner(x, p::Tuple)
           # Horner's rule over the coefficient tuple `p` at the point `x`, carrying an
           # extra correction term alongside the main accumulator.
           # Returns `(hi, lo)`: `hi` is the running Horner value, `lo` accumulates the
           # rounding errors of each multiply and add, propagated through the recurrence.
           hi = p[end]
           lo = zero(x)
           for k in reverse(1:length(p)-1)
               coef = p[k]
               prod = hi * x
               # fma evaluates hi*x - prod with a single rounding, i.e. the part of
               # the product that was lost when `prod` was rounded.
               mulerr = fma(hi, x, -prod)
               top = coef + prod
               # Part of `prod` that did not make it into the rounded sum `top`.
               # NOTE(review): this short form recovers the error exactly only when
               # |coef| >= |prod| -- confirm that holds for the intended inputs.
               adderr = prod - (top - coef)
               lo = fma(lo, x, adderr + mulerr)
               hi = top
           end
           return hi, lo
       end
exthorner (generic function with 1 method)

julia> @benchmark evalpolyloop!(evalpoly, $y, $x)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     37.663 ns (0.00% GC)
  median time:      37.959 ns (0.00% GC)
  mean time:        38.003 ns (0.00% GC)
  maximum time:     90.504 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     992

julia> @benchmark evalpolyloop!((a,b) -> first(exthorner(a,b)), $y, $x)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     63.381 ns (0.00% GC)
  median time:      63.487 ns (0.00% GC)
  mean time:        63.574 ns (0.00% GC)
  maximum time:     99.733 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     980
5 Likes