Output distribution of rand(Float32) and rand(Float64), thread 2

Here’s the magic to jam @mikmoore’s latest Float32 N=32 version into XoshiroSimd, using a very naive copy-the-code_llvm approach:

magic

Here’s the complete repl session:

▶ ./julia
               _
   _       _ _(_)_     |  Documentation: https://docs.julialang.org
  (_)     | (_) (_)    |
   _ _   _| |_  __ _   |  Type "?" for help, "]?" for Pkg help.
  | | | | | | |/ _` |  |
  | | |_| | | | (_| |  |  Version 1.11.0-DEV.595 (2023-10-03)
 _/ |\__'_|_|_|\__'_|  |  Commit b790cf8f0c* (26 days old master)
|__/                   |

julia> using Random, BenchmarkTools

julia> A = Array{Float32}(undef, 2^24);

julia> @benchmark rand!(A)
BenchmarkTools.Trial: 949 samples with 1 evaluation.
 Range (min … max):  5.200 ms …  6.274 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     5.240 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   5.268 ms ± 83.680 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

     ▃█▅▃▁▂
  ▄▅▅███████▆▇▆▆▆▇▅▆▆▆▄▆▅▇▆▆▅▄▄▅▁▅▄▄▅▄▁▄▄▁▁▄▄▄▁▄▁▁▁▁▄▁▄▁▁▄▁▄ ▇
  5.2 ms       Histogram: log(frequency) by time      5.7 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @eval Random.XoshiroSimd begin
       @inline _bits2float(x::UInt64, ::Type{Float32}) = _f32bits(x)

       @inline function _f32bits(u::UInt32)
               # map UIntXX linearly into FloatXX on range [0,1)
               U = UInt32
               F = Base.floattype(U)
               bitwidth = 8 * sizeof(U) # bits in type U
               lomask = typemax(U) >>> (bitwidth ÷ 2) # mask of low bits of u
               scale = F(exp2(-bitwidth))
               lo = F(u &  lomask) * scale # must scale here for UInt16, because Float16(typemax(UInt16)) overflows
               hi = F(u & ~lomask) * scale # must scale here for UInt16, because Float16(typemax(UInt16)) overflows
               f0 = lo + hi # full-width value
               efflo = f0 - hi # get effective value of lo used
               roundedup = efflo > lo # determine whether f0 was rounded upward
               fu = reinterpret(U, f0)
               fu -= roundedup # if we rounded up, decrement to round down instead
               return fu
       end
       @inline function _f32bits(x::UInt64)
           ui = (x>>>32) % UInt32
           li = x % UInt32
           return (UInt64(_f32bits(ui)) << 32) | UInt64(_f32bits(li))
       end
       @inline _f32bits(x::VecElement{UInt64}) = _f32bits(x.value)
       end
       for N in [4,8,16]
           VT = :(NTuple{$N, VecElement{UInt64}})
           code = """
               %a = bitcast <$N x i64> %0 to <$(2N) x i32>
               %lowmask = shufflevector <1 x i32> <i32 65535>, <1 x i32> undef, <$(2N) x i32> zeroinitializer
               %b = and <$(2N) x i32> %a, %lowmask
               %c = uitofp <$(2N) x i32> %b to <$(2N) x float>
               %scale = shufflevector <1 x float> <float 0x3DF0000000000000>, <1 x float> undef, <$(2N) x i32> zeroinitializer
               %d = fmul <$(2N) x float> %c, %scale
               %himask = shufflevector <1 x i32> <i32 -65536>, <1 x i32> undef, <$(2N) x i32> zeroinitializer
               %e = and <$(2N) x i32> %a, %himask
               %f = uitofp <$(2N) x i32> %e to <$(2N) x float>
               %g = fmul <$(2N) x float> %f, %scale
               %h = fadd <$(2N) x float> %d, %g
               %i = fsub <$(2N) x float> %h, %g
               %j = fcmp olt <$(2N) x float> %d, %i
               %k = bitcast <$(2N) x float> %h to <$(2N) x i32>
               %l = sext <$(2N) x i1> %j to <$(2N) x i32>
               %m = add <$(2N) x i32> %l, %k
               %n = bitcast <$(2N) x i32> %m to <$N x i64>
               ret <$N x i64> %n
           """
           @eval Random.XoshiroSimd @inline _bits2float(x::$VT, ::Type{Float32}) = llvmcall($code, $VT, Tuple{$VT}, x)
       end

julia> @benchmark rand!(A)
BenchmarkTools.Trial: 665 samples with 1 evaluation.
 Range (min … max):  7.472 ms …   9.067 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     7.492 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   7.523 ms ± 140.133 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▇█▇▄▁
  █████▇█▇▆▆▄▅▆▅▅▁▅▅▅▄▁▄▆▁▁▄▁▅▁▄▄▄▄▁▄▁▁▁▁▄▄▄▄▁▅▁▁▁▁▁▁▁▁▁▄▁▄▄▅ ▇
  7.47 ms      Histogram: log(frequency) by time      8.14 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

status quo:

julia> A = Array{Float32}(undef, 2^24);

julia> @benchmark rand!(A)
BenchmarkTools.Trial: 949 samples with 1 evaluation.
 Range (min … max):  5.200 ms …  6.274 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     5.240 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   5.268 ms ± 83.680 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

     ▃█▅▃▁▂
  ▄▅▅███████▆▇▆▆▆▇▅▆▆▆▄▆▅▇▆▆▅▄▄▅▁▅▄▄▅▄▁▄▄▁▁▄▄▄▁▄▁▁▁▁▄▁▄▁▁▄▁▄ ▇
  5.2 ms       Histogram: log(frequency) by time      5.7 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

After the above patch:

julia> @benchmark rand!(A)
BenchmarkTools.Trial: 665 samples with 1 evaluation.
 Range (min … max):  7.472 ms …   9.067 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     7.492 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   7.523 ms ± 140.133 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▇█▇▄▁
  █████▇█▇▆▆▄▅▆▅▅▁▅▅▅▄▁▄▆▁▁▄▁▅▁▄▄▄▄▁▄▁▁▁▁▄▄▄▄▁▅▁▁▁▁▁▁▁▁▁▄▁▄▄▅ ▇
  7.47 ms      Histogram: log(frequency) by time      8.14 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.
4 Likes