Regarding the RNG point, VectorizedRNG
should be faster.
julia> x = rand(Float32, n, n);
julia> @btime rand!(local_rng(), $x);
7.223 μs (0 allocations: 0 bytes)
julia> @btime $x .= rand.();
323.120 μs (0 allocations: 0 bytes)
But the RNG makes little difference once matrix multiplication dominates the cost (compare against the earlier `test_fast`, which is identical except for using the default RNG):
julia> n = 300; k = 30; tmp2=zeros(Float32,n*k,n*k);
julia> function test_fast_rng(n,k,tmp2)
       # Fill the (n*k)×(n*k) buffer `tmp2` block-by-block with products of
       # random n×n matrices, drawing all randoms from VectorizedRNG's
       # thread-local generator (`local_rng()`); mutates `tmp2` in place.
       # Work buffers are allocated once, outside the loops.
t = Matrix{eltype(tmp2)}(undef,n,n)
tmp1 = similar(t); rng = local_rng();
for i in 1:k
       # Reuse one left factor `t` per row of blocks; overwrite in place.
rand!(rng, t)
for j in 1:k
rand!(rng, tmp1)
       # Non-allocating view of the (i,j) block of tmp2.
buf = @view(tmp2[(i-1)*n+1:i*n,(j-1)*n+1:j*n])
       # Octavian's matmul!: multithreaded in-place buf = t * tmp1.
matmul!(buf,t,tmp1)
end
end
end
test_fast_rng (generic function with 1 method)
julia> @btime test_fast($n,$k,$tmp2)
33.926 ms (4 allocations: 703.28 KiB)
julia> @btime test_fast_rng($n,$k,$tmp2)
33.556 ms (4 allocations: 703.28 KiB)
Also, because this is Julia, we can thread our loops, and not just BLAS:
julia> using CheapThreads, Octavian, BenchmarkTools
julia> using Random, VectorizedRNG
julia> function test_batch(n,k,tmp2)
       # Same computation as test_fast_rng, but the outer loop is threaded
       # with CheapThreads' @batch, so each task gets its own work buffers
       # and its own thread-local RNG, and the serial matmul is used inside.
@batch for i in 1:k
       # Per-task buffers: allocated inside the threaded loop body so tasks
       # don't share mutable state (this accounts for the extra allocations
       # reported by @btime below).
t = Matrix{eltype(tmp2)}(undef, n, n);
tmp1 = similar(t);
rng = local_rng()
rand!(rng, t)
for j in 1:k
rand!(rng, tmp1)
       # NOTE(review): block indices are (j,i) here vs (i,j) in
       # test_fast_rng — presumably so each task writes a contiguous
       # column-block of the column-major tmp2; verify intent.
buf = @view(tmp2[(j-1)*n+1:j*n,(i-1)*n+1:i*n])
       # Serial matmul: threading already happens at the @batch level.
matmul_serial!(buf,t,tmp1)
end
end
end
test_batch (generic function with 1 method)
julia> n = 300; k = 30; tmp2=zeros(Float32,n*k,n*k);
julia> @btime test_batch($n,$k,$tmp2)
20.605 ms (125 allocations: 20.60 MiB)
Edit:
Also, "32- vs 64-bit" BLAS refers to the size of the integers used for array indices (LP64 vs ILP64 builds). MKL and OpenBLAS both handle `Float32` and `Float64` elements whether they're 32-bit or 64-bit builds.
But Octavian handles multiplying `Int32` and `Int64` matrices, while those BLAS libraries don't.
For something more exotic, it takes just a few extensions to get a TropicalGEMM, where `*` is `+` and `+` is `max`.