This post is about sampling binomial random variates on the GPU.
Thanks for mentioning this reference, I hadn’t looked at it.
Yes, the binomial samplers tend to look a bit scary, this one especially.
For what it’s worth, here is a completely naive algorithm (summing Bernoulli samples). I should say that for my application I want to sample a matrix whose i-th row shares the same parameter p[i], but other variants should be easy to accommodate. Using the same design as for myrand above:
# 2D grid-stride loop; each thread sums n[i, j] Bernoulli(p[i]) draws.
# (Assumes the same setup as the myrand example above: using CUDA plus the
#  gpu_rand / default_rng / CuKernelContext RNG machinery.)
function kernel_binomial_naive!(n, k, p, r)
    index1  = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride1 = blockDim().x * gridDim().x
    index2  = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    stride2 = blockDim().y * gridDim().y
    for i in index1:stride1:size(k, 1), j in index2:stride2:size(k, 2)
        k[i, j] = 0
        for m in 1:n[i, j]
            @inbounds k[i, j] += gpu_rand(Float32, CuKernelContext(), r) < p[i]
        end
    end
    return
end
function rand_binomial_naive(n, p)
    m1, m2 = size(n)
    k = CUDA.zeros(m1, m2)
    # 16×16 thread blocks, with enough blocks to cover the whole matrix
    numblocks1 = ceil(Int, m1 / 16)
    numblocks2 = ceil(Int, m2 / 16)
    r = default_rng(CuArray{Float32, 2})
    @cuda threads=(16, 16) blocks=(numblocks1, numblocks2) kernel_binomial_naive!(n, k, p, r.state)
    return k
end
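To time it, something along these lines should work (a minimal sketch; the exact inputs are my assumption, chosen to mirror the PyTorch snippet below: counts of 128 everywhere and one random probability per row, with CUDA.@sync so the kernel actually finishes before the timer stops):

using CUDA, BenchmarkTools

n = CUDA.fill(128, 1024, 1024)   # number of trials for each entry
p = CUDA.rand(Float32, 1024)     # one success probability per row
@btime CUDA.@sync rand_binomial_naive($n, $p)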
It takes about 12 ms to sample a 1024×1024 array. This is actually competitive with PyTorch (approx. 11 ms, if I’m measuring correctly),
# Python 3
import torch

shape = [1024, 1024]
# 1024x1024 matrix of counts (128 trials each) and a vector of random probabilities
counts = torch.cuda.FloatTensor([[128.] * shape[0]] * shape[1])
probs = torch.rand([shape[0]], device='cuda:0')
# time the sampler with CUDA events
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
samples = torch.binomial(count = counts, prob = probs)
end.record()
torch.cuda.synchronize()
print(start.elapsed_time(end))  # milliseconds
which uses the more advanced BTRS algorithm.
Note that because the n parameter may be different for each sample, table methods, which are fast if you need many samples from the same distribution, are not quite appropriate for my use case.
I implemented the similar BTRD algorithm (taken almost straight from TensorFlow) below, which is about 30% faster than the naive version and therefore faster than PyTorch. My goal is now to find out how much faster I can get.
# small helpers used by the BTRD kernel below: x * (1 - x) and x / (1 - x)
xt1mx(x) = x * (1 - x)
xd1mx(x) = x / (1 - x)
# Tail of Stirling's series for log(k!): tabulated for k <= 9, closed-form approximation otherwise
function stirling_approx_tail(k)
    if k == 0
        return 0.0810614667953272
    elseif k == 1
        return 0.0413406959554092
    elseif k == 2
        return 0.0276779256849983
    elseif k == 3
        return 0.0207906721037650
    elseif k == 4
        return 0.0166446911898211
    elseif k == 5
        return 0.0138761288230707
    elseif k == 6
        return 0.0118967099458917
    elseif k == 7
        return 0.0104112652619720
    elseif k == 8
        return 0.00925546218271273
    elseif k == 9
        return 0.00833056343336287
    end
    kp1sq = (k + 1) * (k + 1)
    return (1.0 / 12 - (1.0 / 360 - 1.0 / 1260 / kp1sq) / kp1sq) / (k + 1)
end
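As an aside (not part of the kernel): the tabulated constants are the tail of Stirling's series for log(k!), which the closed-form branch approximates for k >= 10. A quick CPU-side check reproduces them; exact_tail is just an illustrative name:

exact_tail(k) = log(factorial(k)) - ((k + 0.5) * log(k + 1) - (k + 1) + 0.5 * log(2π))
exact_tail(0)   # 0.0810614667953272
exact_tail(9)   # 0.00833056343336287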
function kernel_BTRD!(n, k, p, randstates)
    index1  = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride1 = blockDim().x * gridDim().x
    index2  = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    stride2 = blockDim().y * gridDim().y
    @inbounds for i in index1:stride1:size(k, 1)
        # BTRD approximations work well for p <= 0.5
        pp = min(p[i], 1 - p[i])
        # edge case
        if pp == 0
            for j in index2:stride2:size(k, 2)
                k[i, j] = 0
            end
            continue
        end
        # general case: constants shared by the whole row
        logp = log(1 - pp)
        r = xd1mx(pp)
        s = xt1mx(pp)
        @inbounds for j in index2:stride2:size(k, 2)
            # trivial case
            if n[i, j] == 0
                k[i, j] = 0
                continue
            end
            # use the inversion algorithm (sum of geometric waiting times) for n*p < 10
            if n[i, j] * pp < 10
                geom_sum = 0.
                num_geom = 0
                while true
                    geom = ceil(log(gpu_rand(Float32, CuKernelContext(), randstates)) / logp)
                    geom_sum += geom
                    if geom_sum > n[i, j]
                        break
                    end
                    num_geom += 1
                end
                k[i, j] = num_geom
                continue
            end
            # BTRD algorithm (transformed rejection with a dominating hat)
            k[i, j] = -1   # sentinel: no candidate accepted yet
            stddev = sqrt(n[i, j] * s)
            b = 1.15 + 2.53 * stddev
            a = -0.0873 + 0.0248 * b + 0.01 * pp
            c = n[i, j] * pp + 0.5
            v_r = 0.92 - 4.2 / b
            alpha = (2.83 + 5.1 / b) * stddev
            m = floor((n[i, j] + 1) * pp)
            while true
                if k[i, j] >= 0   # a candidate has been accepted
                    break
                end
                usample = gpu_rand(Float32, CuKernelContext(), randstates) - 0.5
                vsample = gpu_rand(Float32, CuKernelContext(), randstates)
                us = 0.5 - abs(usample)
                ks = floor((2 * a / us + b) * usample + c)
                # cheap early acceptance (squeeze) test
                if us >= 0.07 && vsample <= v_r
                    k[i, j] = ks
                end
                if ks < 0 || ks > n[i, j]
                    continue
                end
                # full acceptance test, using the Stirling tail for the log-density ratio
                v2 = log(vsample * alpha / (a / (us * us) + b))
                ub = (m + 0.5) * log((m + 1) / (r * (n[i, j] - m + 1))) +
                     (n[i, j] + 1) * log((n[i, j] - m + 1) / (n[i, j] - ks + 1)) +
                     (ks + 0.5) * log(r * (n[i, j] - ks + 1) / (ks + 1)) +
                     stirling_approx_tail(m) + stirling_approx_tail(n[i, j] - m) -
                     stirling_approx_tail(ks) - stirling_approx_tail(n[i, j] - ks)
                if v2 <= ub
                    k[i, j] = ks
                end
            end
            k[i, j] = max(k[i, j], 0)   # clamp the sentinel / negative values (relu)
            # if pp = 1 - p[i] was used, do k <- n - k
            if p[i] > 0.5
                k[i, j] = n[i, j] - k[i, j]
            end
        end
    end
    return
end
function rand_binomial_BTRD(n, p)
    m1, m2 = size(n)
    k = CUDA.zeros(m1, m2)
    numblocks1 = ceil(Int, m1 / 16)
    numblocks2 = ceil(Int, m2 / 16)
    r = default_rng(CuArray{Float32, 2})
    @cuda threads=(16, 16) blocks=(numblocks1, numblocks2) kernel_BTRD!(n, k, p, r.state)
    return k
end
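And a usage sketch for the comparison (same assumed inputs as in the naive timing above):

using CUDA, BenchmarkTools

n = CUDA.fill(128, 1024, 1024)
p = CUDA.rand(Float32, 1024)
k = rand_binomial_BTRD(n, p)                   # 1024×1024 matrix of binomial draws
@btime CUDA.@sync rand_binomial_BTRD($n, $p)   # about 30% faster than the naive kernel for me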