Output distribution of rand(Float32) and rand(Float64), thread 2

OK, here’s another flavor of an equal-bit-width rounding down algorithm that seems really promising performance-wise. And it’s super simple. You do a bit of bit-twiddling so only the least-significant set bit remains (an exact power of two!), and then convert that to a Float. So you can easily grab the resulting exponent! Make that negative at the right offset and we’re done.

function f32bits(r::UInt32)
    last_bit = r & -r
    exponent = UInt32(253)<<23 - reinterpret(UInt32, Float32(last_bit))
    exponent *= !iszero(r)
    fraction = (r ⊻ last_bit)>>9 # unset the trailing one and shift
    return reinterpret(Float32, exponent | fraction)
end

The only expensive part here is a single Int->Float conversion — which we already do! I see about the same speed as @mikmoore’s algorithms on my M1… but the simplicity here made me curious about Intel. With a cloud Intel AVX512 system, I see no performance hit… actually with either Mikmoore’s algorithm or mine. And I don’t even need to do the LLVM code thing there (although it is helpful on my Mac for some unknown reason; and with it it’s slightly faster than the LLVM-ized version of Mikmoore’s alg).

I’ve not yet fiddled with iteratively adding more bits at the low-end, but I don’t think that should be too tough to add directly into XoshiroSimd.xoshiro_bulk_simd.

REPL sessions

Intel with AVX

julia> using Random, BenchmarkTools

julia> v = Vector{Float32}(undef, 2^24);

julia> @benchmark rand!($v)
BenchmarkTools.Trial: 904 samples with 1 evaluation.
 Range (min … max):  5.120 ms …   8.815 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     5.480 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   5.518 ms ± 257.506 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

           ▁▅▅▆▃▅▆█▇▆▃▅▇▄▇▁▃▄▆▂ ▂  ▃                           
  ▃▂▁▃▄▄▆▇▇████████████████████▆█▇▇██▅▇▇▇▄▆▅▃▄▃▃▃▃▃▃▂▂▃▂▁▁▁▃▂ ▅
  5.12 ms         Histogram: frequency by time        6.12 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @eval Random.XoshiroSimd begin
       @inline _bits2float(x::UInt64, ::Type{Float32}) = _f32bits(x)

       @inline function _f32bits(r::UInt32)
                  last_bit = r & -r
                  exponent = UInt32(253)<<23 - reinterpret(UInt32, Float32(last_bit))
                  exponent *= !iszero(r)
                  fraction = (r ⊻ last_bit)>>9 # unset the trailing one and shift
                  return exponent | fraction
       end
       @inline function _f32bits(x::UInt64)
           ui = (x>>>32) % UInt32
           li = x % UInt32
           return (UInt64(_f32bits(ui)) << 32) | UInt64(_f32bits(li))
       end
       @inline _f32bits(x::VecElement{UInt64}) = _f32bits(x.value)
       end
_f32bits (generic function with 3 methods)

julia> for N in [4,8,16]
                         VT = :(NTuple{$N, VecElement{UInt64}})
                         @eval Random.XoshiroSimd @inline _bits2float(x::$VT, ::Type{Float32}) = map(_f32bits, x)
                     end

julia> @benchmark rand!($v)
BenchmarkTools.Trial: 946 samples with 1 evaluation.
 Range (min … max):  5.048 ms …  13.644 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     5.169 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   5.275 ms ± 499.710 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▆███▇▆▅▄▃▃▂▂▁▁                                               
  ██████████████▇█▇▆▆▅▅▆▆▁▁▆▆▆▄▄▆▆▆▁▁▄▅▆▁▁▅▁▁▅▅▄▁▁▄▁▁▁▁▁▁▁▁▁▄ █
  5.05 ms      Histogram: log(frequency) by time      6.89 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

Mac M1

julia> using Random, BenchmarkTools

julia> v = Vector{Float32}(undef, 2^24);

julia> @benchmark rand!($v)
BenchmarkTools.Trial: 942 samples with 1 evaluation.
 Range (min … max):  5.225 ms …   6.515 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     5.275 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   5.311 ms ± 125.292 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▂█▇▆▆▅▅▅▄▂▁▁▁        ▁
  ██████████████▇▅█▇▆▇██▅▅▆▅▅▆▆▅▆▅▁▁▆▄▄▁▄▁▄▁▁▄▁▁▁▅▄▁▄▁▁▁▁▁▁▁▄ █
  5.23 ms      Histogram: log(frequency) by time      5.93 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @eval Random.XoshiroSimd begin
       @inline _bits2float(x::UInt64, ::Type{Float32}) = _f32bits(x)

       @inline function _f32bits(r::UInt32)
                  last_bit = r & -r
                  exponent = UInt32(253)<<23 - reinterpret(UInt32, Float32(last_bit))
                  exponent *= !iszero(r)
                  fraction = (r ⊻ last_bit)>>9 # unset the trailing one and shift
                  return exponent | fraction
       end
       @inline function _f32bits(x::UInt64)
           ui = (x>>>32) % UInt32
           li = x % UInt32
           return (UInt64(_f32bits(ui)) << 32) | UInt64(_f32bits(li))
       end
       @inline _f32bits(x::VecElement{UInt64}) = _f32bits(x.value)
       end
_f32bits (generic function with 3 methods)

julia> for N in [4,8,16]
                         VT = :(NTuple{$N, VecElement{UInt64}})
                         @eval Random.XoshiroSimd @inline _bits2float(x::$VT, ::Type{Float32}) = map(_f32bits, x)
                     end

julia> @benchmark rand!($v)
BenchmarkTools.Trial: 569 samples with 1 evaluation.
 Range (min … max):  8.646 ms …  13.928 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     8.689 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   8.792 ms ± 399.898 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  █▇▅▅▃▁▁
  ████████▇▇▇█▆▆▆▇▅▄▆▄▅▅▁▆▅▅▄▁▁▄▄▁▁▄▄▁▁▁▁▁▄▁▁▁▄▁▁▁▁▁▄▅▅▁▁▄▁▁▄ ▇
  8.65 ms      Histogram: log(frequency) by time      10.2 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> for N in [4,8,16]
           VT = :(NTuple{$N, VecElement{UInt64}})
           code = """
               %i1 = bitcast <$N x i64> %0 to <$(2N) x i32>
               %i2 = sub <$(2N) x i32> zeroinitializer, %i1
               %i3 = and <$(2N) x i32> %i1, %i2
               %i4 = uitofp <$(2N) x i32> %i3 to <$(2N) x float>
               %i5 = bitcast <$(2N) x float> %i4 to <$(2N) x i32>
               %i7 = sub <$(2N) x i32> <$(join(fill("i32 2122317824", 2N), ", "))>, %i5
               %i8 = icmp eq <$(2N) x i32> %i1, zeroinitializer
               %i9 = select <$(2N) x i1> %i8, <$(2N) x i32> zeroinitializer, <$(2N) x i32> %i7
               %i10 = xor <$(2N) x i32> %i1, %i3
               %i11 = lshr <$(2N) x i32> %i10, <$(join(fill("i32 9", 2N), ", "))>
               %i12 = or <$(2N) x i32> %i9, %i11
               %i13 = bitcast <$(2N) x i32> %i12 to <$N x i64>
               ret <$N x i64> %i13
           """
           @eval Random.XoshiroSimd @inline _bits2float(x::$VT, ::Type{Float32}) = llvmcall($code, $VT, Tuple{$VT}, x)
       end

julia> @benchmark rand!($v)
BenchmarkTools.Trial: 689 samples with 1 evaluation.
 Range (min … max):  7.152 ms …  10.568 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     7.201 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   7.260 ms ± 205.569 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

   ▁██▆▄▁
  ███████▇▇▇▄▆▄▆▆▄▆▄▆▄▄▆▆▅▅▅▁▄▄▄▁▄▄▁▅▁▅▅▆▄▄▅▄▄▆▆▆▆▁▅▁▁▄▁▄▁▅▁▄ ▇
  7.15 ms      Histogram: log(frequency) by time      8.01 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.
5 Likes