OK, here’s another flavor of an equal-bit-width rounding down algorithm that seems really promising performance-wise. And it’s super simple. You do a bit of bit-twiddling so only the least-significant set bit remains (an exact power of two!), and then convert that to a Float. So you can easily grab the resulting exponent! Make that negative at the right offset and we’re done.
function f32bits(r::UInt32)
last_bit = r & -r
exponent = UInt32(253)<<23 - reinterpret(UInt32, Float32(last_bit))
exponent *= !iszero(r)
fraction = (r ⊻ last_bit)>>9 # unset the trailing one and shift
return reinterpret(Float32, exponent | fraction)
end
The only expensive part here is a single Int->Float conversion — which we already do! I see about the same speed as @mikmoore’s algorithms on my M1… but the simplicity here made me curious about Intel. With a cloud Intel AVX512 system, I see no performance hit… actually with either Mikmoore’s algorithm or mine. And I don’t even need to do the LLVM code thing there (although it is helpful on my Mac for some unknown reason; and with it it’s slightly faster than the LLVM-ized version of Mikmoore’s alg).
I’ve not yet fiddled with iteratively adding more bits at the low-end, but I don’t think that should be too tough to add directly into XoshiroSimd.xoshiro_bulk_simd
.
REPL sessions
Intel with AVX
julia> using Random, BenchmarkTools
julia> v = Vector{Float32}(undef, 2^24);
julia> @benchmark rand!($v)
BenchmarkTools.Trial: 904 samples with 1 evaluation.
Range (min … max): 5.120 ms … 8.815 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 5.480 ms ┊ GC (median): 0.00%
Time (mean ± σ): 5.518 ms ± 257.506 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▁▅▅▆▃▅▆█▇▆▃▅▇▄▇▁▃▄▆▂ ▂ ▃
▃▂▁▃▄▄▆▇▇████████████████████▆█▇▇██▅▇▇▇▄▆▅▃▄▃▃▃▃▃▃▂▂▃▂▁▁▁▃▂ ▅
5.12 ms Histogram: frequency by time 6.12 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> @eval Random.XoshiroSimd begin
@inline _bits2float(x::UInt64, ::Type{Float32}) = _f32bits(x)
@inline function _f32bits(r::UInt32)
last_bit = r & -r
exponent = UInt32(253)<<23 - reinterpret(UInt32, Float32(last_bit))
exponent *= !iszero(r)
fraction = (r ⊻ last_bit)>>9 # unset the trailing one and shift
return exponent | fraction
end
@inline function _f32bits(x::UInt64)
ui = (x>>>32) % UInt32
li = x % UInt32
return (UInt64(_f32bits(ui)) << 32) | UInt64(_f32bits(li))
end
@inline _f32bits(x::VecElement{UInt64}) = _f32bits(x.value)
end
_f32bits (generic function with 3 methods)
julia> for N in [4,8,16]
VT = :(NTuple{$N, VecElement{UInt64}})
@eval Random.XoshiroSimd @inline _bits2float(x::$VT, ::Type{Float32}) = map(_f32bits, x)
end
julia> @benchmark rand!($v)
BenchmarkTools.Trial: 946 samples with 1 evaluation.
Range (min … max): 5.048 ms … 13.644 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 5.169 ms ┊ GC (median): 0.00%
Time (mean ± σ): 5.275 ms ± 499.710 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▆███▇▆▅▄▃▃▂▂▁▁
██████████████▇█▇▆▆▅▅▆▆▁▁▆▆▆▄▄▆▆▆▁▁▄▅▆▁▁▅▁▁▅▅▄▁▁▄▁▁▁▁▁▁▁▁▁▄ █
5.05 ms Histogram: log(frequency) by time 6.89 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
Mac M1
julia> using Random, BenchmarkTools
julia> v = Vector{Float32}(undef, 2^24);
julia> @benchmark rand!($v)
BenchmarkTools.Trial: 942 samples with 1 evaluation.
Range (min … max): 5.225 ms … 6.515 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 5.275 ms ┊ GC (median): 0.00%
Time (mean ± σ): 5.311 ms ± 125.292 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▂█▇▆▆▅▅▅▄▂▁▁▁ ▁
██████████████▇▅█▇▆▇██▅▅▆▅▅▆▆▅▆▅▁▁▆▄▄▁▄▁▄▁▁▄▁▁▁▅▄▁▄▁▁▁▁▁▁▁▄ █
5.23 ms Histogram: log(frequency) by time 5.93 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> @eval Random.XoshiroSimd begin
@inline _bits2float(x::UInt64, ::Type{Float32}) = _f32bits(x)
@inline function _f32bits(r::UInt32)
last_bit = r & -r
exponent = UInt32(253)<<23 - reinterpret(UInt32, Float32(last_bit))
exponent *= !iszero(r)
fraction = (r ⊻ last_bit)>>9 # unset the trailing one and shift
return exponent | fraction
end
@inline function _f32bits(x::UInt64)
ui = (x>>>32) % UInt32
li = x % UInt32
return (UInt64(_f32bits(ui)) << 32) | UInt64(_f32bits(li))
end
@inline _f32bits(x::VecElement{UInt64}) = _f32bits(x.value)
end
_f32bits (generic function with 3 methods)
julia> for N in [4,8,16]
VT = :(NTuple{$N, VecElement{UInt64}})
@eval Random.XoshiroSimd @inline _bits2float(x::$VT, ::Type{Float32}) = map(_f32bits, x)
end
julia> @benchmark rand!($v)
BenchmarkTools.Trial: 569 samples with 1 evaluation.
Range (min … max): 8.646 ms … 13.928 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 8.689 ms ┊ GC (median): 0.00%
Time (mean ± σ): 8.792 ms ± 399.898 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
█▇▅▅▃▁▁
████████▇▇▇█▆▆▆▇▅▄▆▄▅▅▁▆▅▅▄▁▁▄▄▁▁▄▄▁▁▁▁▁▄▁▁▁▄▁▁▁▁▁▄▅▅▁▁▄▁▁▄ ▇
8.65 ms Histogram: log(frequency) by time 10.2 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> for N in [4,8,16]
VT = :(NTuple{$N, VecElement{UInt64}})
code = """
%i1 = bitcast <$N x i64> %0 to <$(2N) x i32>
%i2 = sub <$(2N) x i32> zeroinitializer, %i1
%i3 = and <$(2N) x i32> %i1, %i2
%i4 = uitofp <$(2N) x i32> %i3 to <$(2N) x float>
%i5 = bitcast <$(2N) x float> %i4 to <$(2N) x i32>
%i7 = sub <$(2N) x i32> <$(join(fill("i32 2122317824", 2N), ", "))>, %i5
%i8 = icmp eq <$(2N) x i32> %i1, zeroinitializer
%i9 = select <$(2N) x i1> %i8, <$(2N) x i32> zeroinitializer, <$(2N) x i32> %i7
%i10 = xor <$(2N) x i32> %i1, %i3
%i11 = lshr <$(2N) x i32> %i10, <$(join(fill("i32 9", 2N), ", "))>
%i12 = or <$(2N) x i32> %i9, %i11
%i13 = bitcast <$(2N) x i32> %i12 to <$N x i64>
ret <$N x i64> %i13
"""
@eval Random.XoshiroSimd @inline _bits2float(x::$VT, ::Type{Float32}) = llvmcall($code, $VT, Tuple{$VT}, x)
end
julia> @benchmark rand!($v)
BenchmarkTools.Trial: 689 samples with 1 evaluation.
Range (min … max): 7.152 ms … 10.568 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 7.201 ms ┊ GC (median): 0.00%
Time (mean ± σ): 7.260 ms ± 205.569 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▁██▆▄▁
███████▇▇▇▄▆▄▆▆▄▆▄▆▄▄▆▆▅▅▅▁▄▄▄▁▄▄▁▅▁▅▅▆▄▄▅▄▄▆▆▆▆▁▅▁▁▄▁▄▁▅▁▄ ▇
7.15 ms Histogram: log(frequency) by time 8.01 ms <
Memory estimate: 0 bytes, allocs estimate: 0.