I’m trying to compute Float32 log(x) where x = (u + 0.5) / 2³² and u is a integer from 0 to 2³² - 1
I started with:
@inline function fastlog1(u::UInt32)::Float32
if u < UInt32(2)^31
x = fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33))
log(x)
else
x = fma(Float32(~u), Float32(2)^(-32), Float64(2)^(-33))
log1p(-x)
end
end
This has the nice property of never returning 0 or -Inf
Sadly this has bad effects and doesn’t seem to SIMD:
julia> Base.infer_effects(fastlog1, (UInt32,))
(+c,+e,!n,+t,+s,+m,+u,+o,+r)
Claude was able to generate the following code, does this make sense and are there any obvious ways to make this even faster?
const _SQRT_HALF_I32 = reinterpret(Int32, Float32(sqrt(0.5)))
@inline function fastlog2(u::UInt32)::Float32
x = fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33))
ix = reinterpret(Int32, x) - _SQRT_HALF_I32
k = ix >> Int32(23)
f_std = reinterpret(Float32, (ix & Int32(0x007fffff)) + _SQRT_HALF_I32) - 1.0f0
f_comp = -fma(Float32(~u), Float32(2)^(-32), Float32(2)^(-33))
f = ifelse(k == Int32(0), f_comp, f_std)
s = f / (2.0f0 + f)
z = s * s; w = z * z
R = z * (reinterpret(Float32, Int32(0x3f2aaaaa)) + # ≈ 2/3
w * reinterpret(Float32, Int32(0x3e91e9ee))) + # ≈ 2/7
w * (reinterpret(Float32, Int32(0x3eccce13)) + # ≈ 2/5
w * reinterpret(Float32, Int32(0x3e789e26))) # ≈ 2/9
hfsq = 0.5f0 * f * f
Float32(k) * reinterpret(Float32, Int32(0x3f317180)) -
((hfsq - (s * (hfsq + R) +
Float32(k) * reinterpret(Float32, Int32(0x3717f7d1)))) - f)
end
julia> Base.infer_effects(fastlog2, (UInt32,))
(+c,+e,+n,+t,+s,+m,+u,+o,+r)
Here is a comparison with a reference Float64 version:
function fastlogref(u::UInt32)::Float32
if u < UInt32(2)^31
x = fma(Float64(u), Float64(2)^(-32), Float64(2)^(-33))
log(x)
else
x = fma(Float64(~u), Float64(2)^(-32), Float64(2)^(-33))
log1p(-x)
end
end
julia> maximum(i -> abs(fastlogref(i) - fastlog2(i)), typemin(UInt32):typemax(UInt32))
1.9073486f-6
julia> using Chairmarks
julia> @b rand(UInt32, 1000_000) sum(fastlog1, _)
7.215 ms
julia> @b rand(UInt32, 1000_000) sum(fastlog2, _)
198.149 μs