First of all, note that InverseWishart sampling is slower than Wishart sampling!

Wisharts can be sampled via the Bartlett’s decomposition, as I do below. Simply multiplying the decomposition by itself generates the Wishart. For the InverseWishart, you also need to invert the matrix.

That is my approach below. I also use a different (faster) gamma random sampler than that provided by `Distributions.jl`

, to generate the diagonal elements of the Bartlett decomposition.

First, baseline time:

```
julia> using BenchmarkTools, Distributions, LinearAlgebra
julia> @benchmark rand(InverseWishart(20, Matrix{Float64}(I,20,20)))
BenchmarkTools.Trial:
memory estimate: 33.50 KiB
allocs estimate: 48
--------------
minimum time: 46.890 μs (0.00% GC)
median time: 56.950 μs (0.00% GC)
mean time: 64.514 μs (10.63% GC)
maximum time: 46.167 ms (99.76% GC)
--------------
samples: 10000
evals/sample: 1
```

Now, a faster inverse Wishart:

```
using Random, LinearAlgebra
@inline randgamma(α) = randgamma(Random.GLOBAL_RNG, α)
@inline function randgamma(rng::AbstractRNG, α::T) where T
α < one(T) ? exp(-randexp(rng, T)/α) * randgamma_g1(rng, α+one(T)) : randgamma_g1(rng, α)
end
@inline function randgamma_g1(rng::AbstractRNG, α::T) where {T}
OneThird = one(T)/T(3)
d = α - OneThird
@fastmath c = OneThird / sqrt(d)
@fastmath while true
x = randn(rng, T)
v = one(T) + c*x
v < zero(T) && continue
v3 = v^3
dv3 = d*v3
randexp(rng, T) > T(-0.5)*x^2 - d + dv3 - d*log(v3) && return dv3
end
end
@inline randchisq(rng::AbstractRNG, ν::T) where {T} = T(2.0) * randgamma(rng, T(0.5)ν)
@inline randchisq(ν::T) where {T} = T(2.0) * randgamma(T(0.5)ν)
@inline randchisq(rng::AbstractRNG, ν::T) where {T <: Integer} = T(2) * randgamma(rng, ν/2)
@inline randchisq(ν::T) where {T <: Integer} = T(2) * randgamma(ν/2)
@inline randchi(rng::AbstractRNG, ν) = @fastmath sqrt(randchisq(rng, ν))
@inline randchi(ν) = @fastmath sqrt(randchisq(ν))
function randinversewishart(ν, N, Σ::UniformScaling{Bool})
I.λ == false && return zeros(N,N)
out = Matrix{Float64}(undef, N, N)
for nc ∈ 1:N
out[nc,nc] = randchi(ν - nc + 1)
for nr ∈ nc+1:N
out[nr, nc] = randn()
end
end
Symmetric(LinearAlgebra.LAPACK.potri!('L', out), :L)
end
```

and a Wishart:

```
function randwishart(ν, N, Σ::UniformScaling)
A = zeros(N, N)
for nc ∈ 1:N
A[nc,nc] = randchi(ν - nc + 1)
for nr ∈ nc+1:N
A[nr, nc] = randn()
end
end
W = Matrix{Float64}(undef, N, N)
LinearAlgebra.BLAS.syrk!('L', 'N', Float64(I.λ), A, 0.0, W)
Symmetric(W, :L)
end
```

These can be optimized further, but we already get a decent speed boost, without needing anything outside of the standard library:

```
julia> @benchmark randinversewishart(20, 20, I)
BenchmarkTools.Trial:
memory estimate: 4.22 KiB
allocs estimate: 62
--------------
minimum time: 10.810 μs (0.00% GC)
median time: 15.220 μs (0.00% GC)
mean time: 21.230 μs (27.15% GC)
maximum time: 46.320 ms (99.92% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark randwishart(20, 20, I)
BenchmarkTools.Trial:
memory estimate: 7.47 KiB
allocs estimate: 63
--------------
minimum time: 5.357 μs (0.00% GC)
median time: 5.940 μs (0.00% GC)
mean time: 7.267 μs (16.55% GC)
maximum time: 7.478 ms (99.85% GC)
--------------
samples: 10000
evals/sample: 6
```

Or, running the `timing`

functions:

```
julia> function timing1(samples)
@time for i in 1:samples
inv_wish = rand(InverseWishart(20, Matrix{Float64}(I,20,20)))
end
end
timing1 (generic function with 1 method)
julia> timing1(50000)
3.008724 seconds (3.03 M allocations: 1.607 GiB, 2.48% gc time)
julia> function timing2(samples)
@time for i in 1:samples
inv_wish = randinversewishart(20, 20, I)
end
end
timing2 (generic function with 1 method)
julia> timing2(50000)
0.789239 seconds (3.10 M allocations: 205.994 MiB, 1.43% gc time)
julia> function timing3(samples)
@time for i in 1:samples
inv_wish = randwishart(20, 20, I)
end
end
timing3 (generic function with 1 method)
julia> timing3(50000)
0.316246 seconds (3.15 M allocations: 364.685 MiB, 5.36% gc time)
```