The code:
using BenchmarkTools
# Add with carry: returns (low word, carry-out). Shift is hard-coded for 64-bit limbs.
function adc(a::T, b::T, carry::T) where T
    T2 = widen(T)
    ret = (a % T2) + (b % T2) + (carry % T2)
    ret % T, (ret >> 64) % T
end

# Multiply-accumulate with carry: a + b*c + carry, returned as (low word, carry-out).
function mac(a::T, b::T, c::T, carry::T) where T
    T2 = widen(T)
    ret = (a % T2) + ((b % T2) * (c % T2)) + (carry % T2)
    ret % T, (ret >> 64) % T
end
# 4-limb multiply with interleaved reduction by m (Montgomery-style), using a
# sliding window of accumulators a1..a6 that shifts down one limb per iteration.
function generic(x::NTuple{4, T}, y::NTuple{4, T}, m::NTuple{4, T}, m_p::T) where T
    a1 = zero(T)
    a2 = zero(T)
    a3 = zero(T)
    a4 = zero(T)
    a5 = zero(T)
    a6 = zero(T)
    for i in 1:4
        # reduction pass with m, using the factor u1 derived from m_p
        u1 = (a1 + x[i] * y[1]) * m_p
        (a1, carry) = mac(a1, u1, m[1], zero(T))
        (a2, carry) = mac(a2, u1, m[2], carry)
        (a3, carry) = mac(a3, u1, m[3], carry)
        (a4, carry) = mac(a4, u1, m[4], carry)
        (a5, carry2) = adc(a5, zero(T), carry)
        # multiplication pass with x[i] * y
        (a1, carry) = mac(a1, x[i], y[1], zero(T))
        (a2, carry) = mac(a2, x[i], y[2], carry)
        (a3, carry) = mac(a3, x[i], y[3], carry)
        (a4, carry) = mac(a4, x[i], y[4], carry)
        (a5, a6) = adc(a5, carry2, carry)
        # shift the accumulator window down one limb
        a1 = a2
        a2 = a3
        a3 = a4
        a4 = a5
        a5 = a6
    end
    (a1, a2, a3, a4, a5)
end
# Same body, but with the argument types pinned to UInt64 in the signature.
specialized(x::NTuple{4, UInt64}, y::NTuple{4, UInt64}, m::NTuple{4, UInt64}, m_p::UInt64) =
    generic(x, y, m, m_p)
x = tuple(rand(UInt64, 4)...)
y = tuple(rand(UInt64, 4)...)
m = tuple(rand(UInt64, 4)...)
m_p = rand(UInt64)
display(@benchmark generic(x, y, m, m_p))
println()
display(@benchmark specialized(x, y, m, m_p))
println()
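(Aside: a quick sanity check of the two helpers, not part of the benchmark; the inputs below are just illustrative values chosen to force a carry out of the low word.)
@assert adc(typemax(UInt64), UInt64(1), UInt64(0)) == (UInt64(0), UInt64(1))              # 2^64 overflows the low word: low = 0, carry = 1
@assert mac(UInt64(1), typemax(UInt64), UInt64(2), UInt64(0)) == (typemax(UInt64), UInt64(1))  # 1 + (2^64 - 1)*2 = 2^65 - 1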
When I run this, the first (generic) call benchmarks at 78 ns with one 48-byte allocation, while the second (specialized) call takes 52 ns with zero allocations. I was under the impression that the two should perform identically, since the JIT compiler specializes generic on the concrete argument types at its first call. What am I missing here?
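In case it helps narrow this down, here is a sketch of how I could check whether the generic call really gets specialized and fully inferred (assuming @code_warntype is the appropriate tool for this):
using InteractiveUtils  # for @code_warntype (loaded automatically in the REPL)
generic(x, y, m, m_p)        # call once so the UInt64 specialization is compiled
specialized(x, y, m, m_p)
@code_warntype generic(x, y, m, m_p)      # inspect the inferred types for the generic method
@code_warntype specialized(x, y, m, m_p)  # compare with the UInt64-restricted wrapper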