So I am trying to write an extreme value/ order statistics package for GPUs. The idea is, that getting the maximum of a large number of random variables multiple times in order to get a max distribution should lend itself to parallelization.

But to allow for arbitrary distributions I want my function to accept a pseudoinverse. So I attempted this

```
import CUDA
import CUDA: @cuda
function sample_extreme_values(sampleSize, superSampleSize, pseudoInverse)::Float32
numblocks = ceil(Int, superSampleSize/256)
gpu_res = CUDA.CuArray{Float32}(undef, superSampleSize)
cpu_res = Array{Float32}(undef, superSampleSize)
@cuda threads=256 blocks=numblocks gpu_parallel!(gpu_res, pseudoInverse, sampleSize)
copyto!(cpu_res, gpu_res)
return cpu_res
end
function gpu_parallel!(results, pseudoinverse, sampleSize)
index = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
stride = CUDA.blockDim().x * CUDA.gridDim().x
for thread = index:stride:length(results)
generator = RVGenerator(pseudoinverse, sampleSize)
results[thread] = largest(generator)
end
end
function largest(generator)
largest = Float32(0)
for rv = generator
largest = max(largest, rv)
end
return largest
end
# Implementation of Widynski, Bernard (2020).
# "Squares: A Fast Counter-Based RNG". arXiv:2004.06278v2
# key (seed) taken from keys.h (line 2193) in the distribution of Squares
# see https://squaresrng.wixsite.com/rand
# this distribution also includes a generator for these keys - eventually
# this hardcoded key should be replaced with such a generator
# (one key generates 2^64 random numbers)
key = 0x86d47f132b79acfd
@inline function squares_rng(counter::UInt64, seed::UInt64)::UInt32
yy = counter * seed
z = yy + seed
xx = yy * (yy+1)
# >> arithmetic rightshift, >>> logical rightshift
# (most C Impl.: >> arithm on signed, logical on unsigned)
# << logical/arithmetic leftshift
xx = (xx >>> 32) | (xx << 32)
xx = xx*xx + z
xx = (xx >>> 32) | (xx << 32)
return UInt32((xx*xx + yy) >> 32)
end
struct RVGenerator
pseudoInverse
stop::UInt64
end
function Base.iterate(rvg::RVGenerator, state::UInt64=UInt64(0))
if rvg.stop >= state
return (rvg.pseudoInverse(Float32(squares_rng(state, key))/typemax(UInt32)), state+1)
else
return nothing
end
end
```

which I called like this:

```
sample_extreme_values(100,1000, x->x)
```

But this results in a giant â€śdynamic function invocationâ€ť stacktrace

```
ERROR: InvalidIRError: compiling kernel gpu_parallel!(CUDA.CuDeviceArray{Float32,1,CUDA.AS.Global}, var"#31#32", Int64) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to squares_rng)
Stacktrace:
[1] iterate at D:\Google Drive\CodingPlayground\julia\OrderStatistics\src\rngSquares.jl:45
[2] multiple call sites at unknown:0
Reason: unsupported dynamic function invocation
Stacktrace:
[1] iterate at D:\Google Drive\CodingPlayground\julia\OrderStatistics\src\rngSquares.jl:45
[2] multiple call sites at unknown:0
Reason: unsupported call to the Julia runtime (call to jl_f_tuple)
Stacktrace:
[1] iterate at D:\Google Drive\CodingPlayground\julia\OrderStatistics\src\rngSquares.jl:45
[2] multiple call sites at unknown:0
Reason: unsupported call to the Julia runtime (call to jl_f_getfield)
Stacktrace:
[1] largest at D:\Google Drive\CodingPlayground\julia\OrderStatistics\src\OrderStatistics.jl:28
[2] gpu_parallel! at D:\Google Drive\CodingPlayground\julia\OrderStatistics\src\OrderStatistics.jl:13
Reason: unsupported dynamic function invocation (call to max)
Stacktrace:
[1] largest at D:\Google Drive\CodingPlayground\julia\OrderStatistics\src\OrderStatistics.jl:29
[2] gpu_parallel! at D:\Google Drive\CodingPlayground\julia\OrderStatistics\src\OrderStatistics.jl:13
Reason: unsupported dynamic function invocation (call to convert)
Stacktrace:
[1] setindex! at C:\Users\felix\.julia\packages\CUDA\dZvbp\src\device\array.jl:101
[2] gpu_parallel! at D:\Google Drive\CodingPlayground\julia\OrderStatistics\src\OrderStatistics.jl:13
```

I am a bit suprised about this, as I thought that julia would compile julia functions into `__device__`

functions which can then be called within the kernel.

Is that wrong? If yes, what CAN julia do? What can I expect to work? What can I work with?