Yes! I got a working version just now.

There were some changes to `gpu_call` and `global_rng`.

```
"""
    cu_alias_sample!(a::CuArray, wv::AbstractVector, x::CuArray)

Fill `x` in place with samples drawn from the labels `a` according to the
weights `wv`, using the alias method (Walker/Vose) on the GPU. The alias
table is built on the CPU, copied to the device, and then each GPU thread
draws one sample using two uniform random numbers. Returns `x`.

Throws `DimensionMismatch` if `a` and `wv` differ in length.
"""
function cu_alias_sample!(a::CuArray{Ta}, wv::AbstractVector{Tw}, x::CuArray{Ta}) where {Tw<:Number, Ta}
    length(a) == length(wv) || throw(DimensionMismatch("weight vector must have the same length with label vector"))
    n = length(wv)
    # Build the alias table on the host (make_alias_table! — presumably
    # StatsBase's; verify it is in scope at the call site).
    ap = Vector{Tw}(undef, n)
    alias = Vector{Int64}(undef, n)
    make_alias_table!(wv, sum(wv), ap, alias)
    # Upload the finished table to the device.
    alias = CuArray{Int64}(alias)
    ap = CuArray{Tw}(ap)
    function kernel(state, _, (alias, ap, x, a, randstate))
        r1 = GPUArrays.gpu_rand(Float32, state, randstate)
        r2 = GPUArrays.gpu_rand(Float32, state, randstate)
        # gpu_rand can return exactly 1; remap to 0 so the computed bin index
        # stays in 1:n. Use Float32 literals (1f0/0f0) — the original 1.0/0.0
        # made r1/r2 a Union{Float32,Float64}, forcing slow Float64 math in
        # the kernel.
        r1 = r1 == 1f0 ? 0f0 : r1
        r2 = r2 == 1f0 ? 0f0 : r2
        i = linear_index(state)
        if i <= length(x)
            # Pick a bin uniformly, then accept its label or its alias.
            j = floor(Int, r1 * n) + 1
            @inbounds x[i] = r2 < ap[j] ? a[j] : a[alias[j]]
        end
        return
    end
    gpu_call(kernel, x, (alias, ap, x, a, GPUArrays.default_rng(typeof(x)).state))
    return x
end
```

If I can find the time I would like to put together a PR that adds this functionality to CUDA.jl, but I am not sure how to contact the author of that blog.

My speedups aren’t quite as significant as the author’s, but I have a very fast CPU and an old GPU.

```
26.680 μs (75 allocations: 3.73 KiB)
1.345 ms (4 allocations: 640 bytes)
```

(top is CUDA implementation)