The 4 is for SIMD?
This is very neat:
using ForwardDiff, BenchmarkTools, LinearAlgebra
# Build a ForwardDiff dual number whose value is `x` and whose four partials
# are `x + 1`, `x + 2`, `x + 3` and `x + 4`.
function _dual(x)
    partials = ntuple(k -> x + k, Val(4))
    return ForwardDiff.Dual(x, partials...)
end
"""
    dot4(a, b, len_trunc)

Sum the products `a[i] * b[i]` over the leading entries of `a` and `b`, four
indices at a time, keeping four independent partial sums that are added
together at the end.

Only complete groups of four indices within `1:len_trunc` are processed. If
`len_trunc` is not a multiple of 4, the trailing `len_trunc % 4` indices are
left for the caller to handle.

If either array is empty, the zero of the product of the element types is
returned.

Element access is not bounds-checked (`@inbounds`). The caller must make sure
both arrays use 1-based indexing and hold at least `len_trunc` elements.
"""
function dot4(a, b, len_trunc)
    # An empty array has no first element to derive the accumulator type from,
    # so fall back to the element types.
    if isempty(a) || isempty(b)
        return zero(eltype(a)) * zero(eltype(b))
    end
    z1 = z2 = z3 = z4 = zero(a[1] * b[1])
    i = 1
    # Require the whole group `i:i+3` to lie within `1:len_trunc`, so that no
    # element past `len_trunc` is read even when it is not a multiple of 4.
    @inbounds while i + 3 <= len_trunc
        z1 += a[i] * b[i]
        z2 += a[i+1] * b[i+1]
        z3 += a[i+2] * b[i+2]
        z4 += a[i+3] * b[i+3]
        i += 4
    end
    return z1 + z2 + z3 + z4
end
"""
    dot_tail(z, a, b, i, len)

Add the products `a[k] * b[k]` for `k = i + 1, ..., len` to the running total
`z`, one index at a time, and return the result. When `i >= len` nothing is
added and `z` is returned as is.

Element access is not bounds-checked (`@inbounds`).
"""
function dot_tail(z, a, b, i, len)
    total = z
    @inbounds for k in (i + 1):len
        total += a[k] * b[k]
    end
    return total
end
"""
    mydot(a, b)

Return the sum of `a[i] * b[i]` over all indices of the equally long,
1-based-indexed arrays `a` and `b`. No conjugation is applied to either
argument.

The largest multiple-of-4 prefix is handled by `dot4`; the remaining (at most
three) elements are added by `dot_tail`.

# Throws
- `ArgumentError` if either array has offset (non-1-based) axes.
- `DimensionMismatch` if the arrays have different lengths.
"""
function mydot(a, b)
    # Validate with explicit throws rather than `@assert`: the kernels below
    # index with `@inbounds`, so these checks must never be compiled away.
    Base.has_offset_axes(a, b) &&
        throw(ArgumentError("mydot requires 1-based indexing for both arguments"))
    len = length(a)
    len == length(b) || throw(
        DimensionMismatch(
            "first array has length $len, second array has length $(length(b))"
        ),
    )
    len_trunc = len - len % 4
    z = dot4(a, b, len_trunc)
    return dot_tail(z, a, b, len_trunc, len)
end
# Benchmark inputs: a vector of 69 normally distributed Float64 values and the
# vector of ForwardDiff duals built from it.
o = randn(69)
Do = _dual.(o)
# Interpolate the globals with `$` so that BenchmarkTools times the call
# itself and not the access to untyped global variables. The timings in the
# trailing comments are the originally reported ones, which were measured
# without interpolation.
@btime dot($o, $Do) # 97ns
@btime mydot($o, $Do) # 69ns