I want to evaluate a polynomial of degree 8. Two ways of doing this:
@inline function standard_eval(x)
p = evalpoly(x, (0.16666666666666632, 0.04166666666666556, 0.008333333333401227,
0.001388888889068783, 0.00019841269447671544, 2.480157691845342e-5,
2.7558212415361945e-6, 2.758218402815439e-7, 2.4360682937111612e-8))
end
@inline function eval_ps(x)
xx=x * x
xxx=xx * x
X3_1=muladd(2.4360682937111612e-8,xx,muladd(2.758218402815439e-7,x,2.7558212415361945e-6))
X4_1=muladd(xx,2.480157691845342e-5,muladd(0.00019841269447671544,x,0.001388888889068783));
Y_1=muladd(0.008333333333401227,xx,muladd(0.04166666666666556,x,0.16666666666666632))
Y=muladd(xxx,muladd(xxx,X3_1,X4_1),Y_1)
return Y
end
The evalpoly
is based on Horner scheme and eval_ps
is a hardcoded version of the Paterson-Stockmayer approach. They correspond to the evaluatation of the same polynomial
julia> standard_eval(0.1)-eval_ps(0.1)
-2.7755575615628914e-17
By using @code_native
we see that standard_eval
contains 8 fma-operations and eval_ps
contains 6 fma-operations.
However, I don’t see a difference in performance:
julia> @btime eval_ps(0.3);
0.036 ns (0 allocations: 0 bytes)
julia> @btime standard_eval(0.3);
0.036 ns (0 allocations: 0 bytes)
Why is this? Can this be improved?
julia> @code_native eval_ps(22.0)
.text
; ┌ @ REPL[94]:2 within `eval_ps'
; │┌ @ float.jl:356 within `*'
vmulsd %xmm0, %xmm0, %xmm1
; │└
; │ @ REPL[94]:5 within `eval_ps'
; │┌ @ float.jl:363 within `muladd'
vmovddup %xmm0, %xmm2 # xmm2 = xmm0[0,0]
movabsq $.rodata.cst16, %rax
vmovapd (%rax), %xmm3
movabsq $140349345069936, %rax # imm = 0x7FA5A0DB7B70
vfmadd213pd (%rax), %xmm2, %xmm3 # xmm3 = (xmm2 * xmm3) + mem
vmovddup %xmm1, %xmm2 # xmm2 = xmm1[0,0]
movabsq $140349345069952, %rax # imm = 0x7FA5A0DB7B80
vfmadd132pd (%rax), %xmm3, %xmm2 # xmm2 = (xmm2 * mem) + xmm3
; │└
; │ @ REPL[94]:3 within `eval_ps'
; │┌ @ float.jl:356 within `*'
vmulsd %xmm0, %xmm1, %xmm3
movabsq $.rodata.cst8, %rax
vmovsd (%rax), %xmm4 # xmm4 = mem[0],zero
movabsq $140349345069976, %rax # imm = 0x7FA5A0DB7B98
; │└
; │ @ REPL[94]:7 within `eval_ps'
; │┌ @ float.jl:363 within `muladd'
vfmadd213sd (%rax), %xmm0, %xmm4 # xmm4 = (xmm0 * xmm4) + mem
movabsq $140349345069984, %rax # imm = 0x7FA5A0DB7BA0
vfmadd231sd (%rax), %xmm1, %xmm4 # xmm4 = (xmm1 * mem) + xmm4
; │└
; │ @ REPL[94]:8 within `eval_ps'
; │┌ @ float.jl:363 within `muladd'
vpermilpd $1, %xmm2, %xmm0 # xmm0 = xmm2[1,0]
vfmadd213sd %xmm2, %xmm3, %xmm0 # xmm0 = (xmm3 * xmm0) + xmm2
vfmadd213sd %xmm4, %xmm3, %xmm0 # xmm0 = (xmm3 * xmm0) + xmm4
; │└
; │ @ REPL[94]:9 within `eval_ps'
retq
nopl (%rax)
; â””