Manually specialized function is faster than the generic version

The code:

using BenchmarkTools


"""
    adc(a::T, b::T, carry::T) where T -> (lo::T, hi::T)

Add with carry. Compute `a + b + carry` in the widened type `widen(T)` and return the
sum split into two `T` words: the low word and the carry-out (high) word.

Intended for fixed-width integer types; the split point is the bit width of `T`.
"""
function adc(a::T, b::T, carry::T) where T
    W = widen(T)
    # Split at the actual width of T rather than a fixed 64 bits, so that the
    # carry-out is also correct for UInt8/UInt16/UInt32 (identical for UInt64).
    nbits = 8 * sizeof(T)
    total = (a % W) + (b % W) + (carry % W)
    return total % T, (total >> nbits) % T
end


"""
    mac(a::T, b::T, c::T, carry::T) where T -> (lo::T, hi::T)

Multiply-accumulate with carry. Compute `a + b * c + carry` in the widened type
`widen(T)` and return the result split into two `T` words: the low word and the
carry-out (high) word.

Intended for fixed-width integer types; the split point is the bit width of `T`.
"""
function mac(a::T, b::T, c::T, carry::T) where T
    W = widen(T)
    # Split at the actual width of T rather than a fixed 64 bits, so that the
    # high word is also correct for UInt8/UInt16/UInt32 (identical for UInt64).
    nbits = 8 * sizeof(T)
    total = (a % W) + ((b % W) * (c % W)) + (carry % W)
    return total % T, (total >> nbits) % T
end


"""
    generic(x, y, m, m_p)

Run four rounds of multiply-accumulate over the limbs of `x`, `y` and `m` (one round
per element of `x`) and return the first five accumulator limbs as a tuple.

Each round:
1. derives a factor `q` from the lowest accumulator limb, the current element of `x`,
   `y[1]` and `m_p`;
2. adds `q * m` to the accumulator limb by limb via `mac`;
3. adds `x[i] * y` to the accumulator limb by limb via `mac`;
4. shifts the accumulator down by one limb.

NOTE(review): this has the shape of a word-by-word Montgomery multiplication with `m`
as the modulus and `m_p` as a precomputed per-modulus constant — confirm with the caller.
"""
function generic(x::NTuple{4, T}, y::NTuple{4, T}, m::NTuple{4, T}, m_p::T) where T
    nil = zero(T)
    y1, y2, y3, y4 = y
    m1, m2, m3, m4 = m

    # Six accumulator limbs, lowest first.
    t1 = t2 = t3 = t4 = t5 = t6 = nil

    for xi in x
        q = (t1 + xi * y1) * m_p

        # Accumulate q * m; `spill` is the carry out of the fifth limb.
        t1, c = mac(t1, q, m1, nil)
        t2, c = mac(t2, q, m2, c)
        t3, c = mac(t3, q, m3, c)
        t4, c = mac(t4, q, m4, c)
        t5, spill = adc(t5, nil, c)

        # Accumulate xi * y.
        t1, c = mac(t1, xi, y1, nil)
        t2, c = mac(t2, xi, y2, c)
        t3, c = mac(t3, xi, y3, c)
        t4, c = mac(t4, xi, y4, c)
        # NOTE(review): `spill` came out of the fifth limb but is added back into the
        # fifth limb here rather than into the sixth — confirm this is intended.
        t5, t6 = adc(t5, spill, c)

        # Drop the lowest limb and shift the remaining ones down by one position.
        t1, t2, t3, t4, t5 = t2, t3, t4, t5, t6
    end

    return (t1, t2, t3, t4, t5)
end


"""
    specialized(x, y, m, m_p)

`UInt64`-only entry point: forwards its arguments unchanged to `generic` and returns
whatever `generic` returns.
"""
function specialized(
    x::NTuple{4, UInt64},
    y::NTuple{4, UInt64},
    m::NTuple{4, UInt64},
    m_p::UInt64,
)
    return generic(x, y, m, m_p)
end


# Random benchmark inputs: three 4-tuples of UInt64 and one UInt64 scalar.
# These are plain (non-`const`) assignments.
x = tuple(rand(UInt64, 4)...)
y = tuple(rand(UInt64, 4)...)
m = tuple(rand(UInt64, 4)...)
m_p = rand(UInt64)


# Benchmark the generic method. The arguments are referenced by name inside the
# benchmark expression; they are not `$`-interpolated.
display(@benchmark generic(x, y, m, m_p))
println()

# Benchmark the UInt64-only wrapper with the same (non-interpolated) arguments.
display(@benchmark specialized(x, y, m, m_p))
println()

When I run this, I get 78 ns with one 48-byte allocation for the first (generic) call, and 52 ns with 0 allocations for the second (specialized) call. I was under the impression that the results should be identical, because the JIT compiler specializes the function on the first call. What am I missing here?

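You have to interpolate variables with `$` when using BenchmarkTools. Without it, the benchmark expression reads `x`, `y`, `m` and `m_p` as non-constant globals, so the measurement includes the overhead of working with untyped globals rather than just the function call. With interpolation, both versions perform identically: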

julia> @benchmark generic($x, $y, $m, $m_p)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     39.406 ns (0.00% GC)
  median time:      39.665 ns (0.00% GC)
  mean time:        42.901 ns (0.00% GC)
  maximum time:     216.663 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     991

julia> @benchmark specialized($x, $y, $m, $m_p)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     39.321 ns (0.00% GC)
  median time:      39.838 ns (0.00% GC)
  mean time:        42.475 ns (0.00% GC)
  maximum time:     541.635 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     991

Thank you! Usually I have these benchmarks inside functions, and @benchmark complains if variables are not interpolated with `$`.