It is my understanding that hardware operations do not always take the same number of cycles on a given type. For instance, it seems that `fma`
is slower on subnormal floats than on normal floats.
I wanted to observe this effect in a hot SIMD loop. Here is my benchmark:
"""
    _reduce(op, v)

Combine the elements of `v` with the binary operator `op` and return the result.

The accumulator is seeded with `first(v)` and every remaining element is then
folded in exactly once, so `v` must be non-empty and support linear indexing.
"""
function _reduce(op, v)
    ret = first(v)
    # Start at the second element: the first one already seeds `ret`, so looping
    # over all of `v` would apply it twice (e.g. `+` over [1, 2, 3] would give 7).
    @inbounds @simd for i in (firstindex(v) + 1):lastindex(v)
        ret = op(ret, v[i])
    end
    return ret
end
"""
    printop([io::IO,] op, args...)

Write a call-like rendering of `op` applied to `args` to `io`, followed by a
newline. Each argument is followed by `", "`, including the last one.
When `io` is omitted, the text goes to `stdout`.
"""
function printop(io::IO, op, args...)
    print(io, op, "(")
    foreach(a -> print(io, a, ", "), args)
    return println(io, ")")
end

function printop(op, args...)
    return printop(stdout, op, args...)
end
"""
    reducebench(op, N::Integer, arg)

Time `_reduce(op, v)` on a vector `v` holding `N` copies of `arg`.

Prints a label for the case via `printop`, then the `@time` report, and returns
the value of the timed reduction.
"""
function reducebench(op, N::Integer, arg)
    data = fill(arg, N)
    # Untimed first call so that the `@time` below measures a second run.
    _reduce(op, data)
    printop(op, arg)
    return @time _reduce(op, data)
end
"""
    fma112(a, b)

Return `fma(a, a, b)`: a two-argument wrapper around `fma` that passes `a` as
both multiplicands and `b` as the addend.
"""
function fma112(a, b)
    return fma(a, a, b)
end
# Number of elements in each benchmarked vector.
N = 10^7

# Run every operator against every special input value, one banner per operator.
for T in [Float32], op in [+, *, min, fma112]
    banner = '*'^20
    println(banner, " $op(::$T, ::$T) ", banner)
    # Inputs: smallest positive subnormal, one, NaN and infinity of type `T`.
    inputs = [nextfloat(zero(T)), one(T), T(NaN), T(Inf)]
    for x in inputs
        reducebench(op, N, x)
    end
end
So it seems only `fma`
on subnormals is slow; there are no other bad combinations. Does that sound right? Is this benchmark sane, or am I measuring something wrong? Are there other interesting cases of operations being slow on certain arguments?
******************** +(::Float32, ::Float32) ********************
+(1.0e-45, )
0.009499 seconds (1 allocation: 16 bytes)
+(1.0, )
0.009120 seconds (1 allocation: 16 bytes)
+(NaN, )
0.010403 seconds (1 allocation: 16 bytes)
+(Inf, )
0.009389 seconds (1 allocation: 16 bytes)
******************** *(::Float32, ::Float32) ********************
*(1.0e-45, )
0.014134 seconds (1 allocation: 16 bytes)
*(1.0, )
0.013916 seconds (1 allocation: 16 bytes)
*(NaN, )
0.014362 seconds (1 allocation: 16 bytes)
*(Inf, )
0.013973 seconds (1 allocation: 16 bytes)
******************** min(::Float32, ::Float32) ********************
min(1.0e-45, )
0.019460 seconds (1 allocation: 16 bytes)
min(1.0, )
0.019750 seconds (1 allocation: 16 bytes)
min(NaN, )
0.022992 seconds (1 allocation: 16 bytes)
min(Inf, )
0.019744 seconds (1 allocation: 16 bytes)
******************** fma112(::Float32, ::Float32) ********************
fma112(1.0e-45, )
0.411726 seconds (1 allocation: 16 bytes)
fma112(1.0, )
0.013925 seconds (1 allocation: 16 bytes)
fma112(NaN, )
0.013971 seconds (1 allocation: 16 bytes)
fma112(Inf, )
0.013883 seconds (1 allocation: 16 bytes)