Here’s the magic to jam @mikmoore’s latest Float32 N=32 version into XoshiroSimd, using a very naive copy-the-code_llvm approach:
magic
Here’s the complete repl session:
▶ ./julia
_
_ _ _(_)_ | Documentation: https://docs.julialang.org
(_) | (_) (_) |
_ _ _| |_ __ _ | Type "?" for help, "]?" for Pkg help.
| | | | | | |/ _` | |
| | |_| | | | (_| | | Version 1.11.0-DEV.595 (2023-10-03)
_/ |\__'_|_|_|\__'_| | Commit b790cf8f0c* (26 days old master)
|__/ |
julia> using Random, BenchmarkTools
julia> A = Array{Float32}(undef, 2^24);
julia> @benchmark rand!(A)
BenchmarkTools.Trial: 949 samples with 1 evaluation.
Range (min … max): 5.200 ms … 6.274 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 5.240 ms ┊ GC (median): 0.00%
Time (mean ± σ): 5.268 ms ± 83.680 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▃█▅▃▁▂
▄▅▅███████▆▇▆▆▆▇▅▆▆▆▄▆▅▇▆▆▅▄▄▅▁▅▄▄▅▄▁▄▄▁▁▄▄▄▁▄▁▁▁▁▄▁▄▁▁▄▁▄ ▇
5.2 ms Histogram: log(frequency) by time 5.7 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> @eval Random.XoshiroSimd begin
@inline _bits2float(x::UInt64, ::Type{Float32}) = _f32bits(x)
@inline function _f32bits(u::UInt32)
# map UIntXX linearly into FloatXX on range [0,1)
U = UInt32
F = Base.floattype(U)
bitwidth = 8 * sizeof(U) # bits in type U
lomask = typemax(U) >>> (bitwidth ÷ 2) # mask of low bits of u
scale = F(exp2(-bitwidth))
lo = F(u & lomask) * scale # must scale here for UInt16, because Float16(typemax(UInt16)) overflows
hi = F(u & ~lomask) * scale # must scale here for UInt16, because Float16(typemax(UInt16)) overflows
f0 = lo + hi # full-width value
efflo = f0 - hi # get effective value of lo used
roundedup = efflo > lo # determine whether f0 was rounded upward
fu = reinterpret(U, f0)
fu -= roundedup # if we rounded up, decrement to round down instead
return fu
end
@inline function _f32bits(x::UInt64)
ui = (x>>>32) % UInt32
li = x % UInt32
return (UInt64(_f32bits(ui)) << 32) | UInt64(_f32bits(li))
end
@inline _f32bits(x::VecElement{UInt64}) = _f32bits(x.value)
end
for N in [4,8,16]
VT = :(NTuple{$N, VecElement{UInt64}})
code = """
%a = bitcast <$N x i64> %0 to <$(2N) x i32>
%lowmask = shufflevector <1 x i32> <i32 65535>, <1 x i32> undef, <$(2N) x i32> zeroinitializer
%b = and <$(2N) x i32> %a, %lowmask
%c = uitofp <$(2N) x i32> %b to <$(2N) x float>
%scale = shufflevector <1 x float> <float 0x3DF0000000000000>, <1 x float> undef, <$(2N) x i32> zeroinitializer
%d = fmul <$(2N) x float> %c, %scale
%himask = shufflevector <1 x i32> <i32 -65536>, <1 x i32> undef, <$(2N) x i32> zeroinitializer
%e = and <$(2N) x i32> %a, %himask
%f = uitofp <$(2N) x i32> %e to <$(2N) x float>
%g = fmul <$(2N) x float> %f, %scale
%h = fadd <$(2N) x float> %d, %g
%i = fsub <$(2N) x float> %h, %g
%j = fcmp olt <$(2N) x float> %d, %i
%k = bitcast <$(2N) x float> %h to <$(2N) x i32>
%l = sext <$(2N) x i1> %j to <$(2N) x i32>
%m = add <$(2N) x i32> %l, %k
%n = bitcast <$(2N) x i32> %m to <$N x i64>
ret <$N x i64> %n
"""
@eval Random.XoshiroSimd @inline _bits2float(x::$VT, ::Type{Float32}) = llvmcall($code, $VT, Tuple{$VT}, x)
end
julia> @benchmark rand!(A)
BenchmarkTools.Trial: 665 samples with 1 evaluation.
Range (min … max): 7.472 ms … 9.067 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 7.492 ms ┊ GC (median): 0.00%
Time (mean ± σ): 7.523 ms ± 140.133 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▇█▇▄▁
█████▇█▇▆▆▄▅▆▅▅▁▅▅▅▄▁▄▆▁▁▄▁▅▁▄▄▄▄▁▄▁▁▁▁▄▄▄▄▁▅▁▁▁▁▁▁▁▁▁▄▁▄▄▅ ▇
7.47 ms Histogram: log(frequency) by time 8.14 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
status quo:
julia> A = Array{Float32}(undef, 2^24);
julia> @benchmark rand!(A)
BenchmarkTools.Trial: 949 samples with 1 evaluation.
Range (min … max): 5.200 ms … 6.274 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 5.240 ms ┊ GC (median): 0.00%
Time (mean ± σ): 5.268 ms ± 83.680 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▃█▅▃▁▂
▄▅▅███████▆▇▆▆▆▇▅▆▆▆▄▆▅▇▆▆▅▄▄▅▁▅▄▄▅▄▁▄▄▁▁▄▄▄▁▄▁▁▁▁▄▁▄▁▁▄▁▄ ▇
5.2 ms Histogram: log(frequency) by time 5.7 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
After the above patch:
julia> @benchmark rand!(A)
BenchmarkTools.Trial: 665 samples with 1 evaluation.
Range (min … max): 7.472 ms … 9.067 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 7.492 ms ┊ GC (median): 0.00%
Time (mean ± σ): 7.523 ms ± 140.133 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▇█▇▄▁
█████▇█▇▆▆▄▅▆▅▅▁▅▅▅▄▁▄▆▁▁▄▁▅▁▄▄▄▄▁▄▁▁▁▁▄▄▄▄▁▅▁▁▁▁▁▁▁▁▁▄▁▄▄▅ ▇
7.47 ms Histogram: log(frequency) by time 8.14 ms <
Memory estimate: 0 bytes, allocs estimate: 0.