Fast fixed point log

I’m trying to compute Float32 log(x) where x = (u + 0.5) / 2³² and u is a integer from 0 to 2³² - 1
I started with:

@inline function fastlog1(u::UInt32)::Float32
    if u < UInt32(2)^31
        x = fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33))
        log(x)
    else
        x = fma(Float32(~u), Float32(2)^(-32), Float64(2)^(-33))
        log1p(-x)
    end
end

This has the nice property of never returning 0 or -Inf

Sadly this has bad effects and doesn’t seem to SIMD:

julia> Base.infer_effects(fastlog1, (UInt32,))
(+c,+e,!n,+t,+s,+m,+u,+o,+r)

Claude was able to generate the following code, does this make sense and are there any obvious ways to make this even faster?

const _SQRT_HALF_I32 = reinterpret(Int32, Float32(sqrt(0.5)))

@inline function fastlog2(u::UInt32)::Float32
    x = fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33))
    ix = reinterpret(Int32, x) - _SQRT_HALF_I32
    k = ix >> Int32(23)
    f_std = reinterpret(Float32, (ix & Int32(0x007fffff)) + _SQRT_HALF_I32) - 1.0f0
    f_comp = -fma(Float32(~u), Float32(2)^(-32), Float32(2)^(-33))
    f = ifelse(k == Int32(0), f_comp, f_std)
    s = f / (2.0f0 + f)
    z = s * s; w = z * z
    R = z * (reinterpret(Float32, Int32(0x3f2aaaaa)) +       # ≈ 2/3
             w * reinterpret(Float32, Int32(0x3e91e9ee))) +   # ≈ 2/7
        w * (reinterpret(Float32, Int32(0x3eccce13)) +        # ≈ 2/5
             w * reinterpret(Float32, Int32(0x3e789e26)))      # ≈ 2/9
    hfsq = 0.5f0 * f * f
    Float32(k) * reinterpret(Float32, Int32(0x3f317180)) -
        ((hfsq - (s * (hfsq + R) +
          Float32(k) * reinterpret(Float32, Int32(0x3717f7d1)))) - f)
end
julia> Base.infer_effects(fastlog2, (UInt32,))
(+c,+e,+n,+t,+s,+m,+u,+o,+r)

Here is a comparison with a reference Float64 version:

function fastlogref(u::UInt32)::Float32
    if u < UInt32(2)^31
        x = fma(Float64(u), Float64(2)^(-32), Float64(2)^(-33))
        log(x)
    else
        x = fma(Float64(~u), Float64(2)^(-32), Float64(2)^(-33))
        log1p(-x)
    end
end
julia> maximum(i -> abs(fastlogref(i) - fastlog2(i)), typemin(UInt32):typemax(UInt32))
1.9073486f-6

julia> using Chairmarks

julia> @b rand(UInt32, 1000_000) sum(fastlog1, _)
7.215 ms

julia> @b rand(UInt32, 1000_000) sum(fastlog2, _)
198.149 μs