Hi everyone,

I am trying to implement block-level reduction operations in a CUDA Julia kernel.

Does Julia have “cub”-style device-level functions…?

How can I use the cub library's device intrinsics in the Julia code below…?

For example, consider the following program that sums an array of 64 elements in CUDA C using cub. What would the corresponding code look like in CUDA Julia (second snippet below)…?

```
#include <cub/cub.cuh>
#include <cuda.h>
#include <vector>
#include <iostream>
#include <numeric>
#include <cub/cub.cuh>
using namespace std;
using namespace cub;
const unsigned int N=64;
// Block-wide sum of the N ints in `array`, written to *solution.
// Launch configuration: <<<1, N/2>>> — one block of N/2 threads, each thread
// loading ITEMS_PER_THREAD = 2 elements so the whole array is covered.
__global__ void kernel(int *array, int *solution)
{
// Derive the tile shape from N instead of hard-coding 32, so the kernel
// stays consistent if N changes.
const int THREADS = N / 2;
const int ITEMS_PER_THREAD = 2; // N / THREADS
// Specialize BlockReduce for our thread block of THREADS threads
typedef BlockReduce<int, THREADS, BLOCK_REDUCE_RAKING> BlockReduceT;
// Shared memory required by the block reduction
__shared__ typename BlockReduceT::TempStorage temp_storage;
// Per-thread tile data, loaded striped so adjacent threads read adjacent
// global addresses (coalesced loads)
int data[ITEMS_PER_THREAD];
LoadDirectStriped<THREADS>(threadIdx.x, array, data);
// Compute the block-wide sum; only thread 0 receives a valid aggregate
int aggregate = BlockReduceT(temp_storage).Sum(data);
// Store the aggregate
if (threadIdx.x == 0)
{
*solution = aggregate;
}
}
int main()
{
// Host input: 0, 1, ..., N-1, echoed to stdout for reference.
vector<int> input(N);
iota(std::begin(input), std::end(input), 0);
for (auto v : input)
std::cout << '\t' << v;
std::cout << '\n';

int *d_input = nullptr;
int *sol = nullptr;
cudaError_t err;

// Plain device buffer for the input; managed scalar for the result so the
// host can read it directly after synchronizing.
err = cudaMalloc((void**)&d_input, N * sizeof(int));
if (err != cudaSuccess) { std::cout << "cudaMalloc: " << cudaGetErrorString(err) << '\n'; return 1; }
err = cudaMallocManaged((void**)&sol, sizeof(int));
if (err != cudaSuccess) { std::cout << "cudaMallocManaged: " << cudaGetErrorString(err) << '\n'; cudaFree(d_input); return 1; }

err = cudaMemcpy(d_input, input.data(), N * sizeof(int), cudaMemcpyHostToDevice);
if (err != cudaSuccess) { std::cout << "cudaMemcpy: " << cudaGetErrorString(err) << '\n'; cudaFree(d_input); cudaFree(sol); return 1; }

kernel <<<1, N/2>>>(d_input, sol);
// Launch-configuration errors surface via cudaGetLastError(); execution
// errors surface at the synchronize.
err = cudaGetLastError();
if (err == cudaSuccess)
err = cudaDeviceSynchronize();
if (err != cudaSuccess) { std::cout << "kernel: " << cudaGetErrorString(err) << '\n'; cudaFree(d_input); cudaFree(sol); return 1; }

cout << "solution\t" << *sol << "\n";

// Release device and managed allocations (was leaked in the original).
cudaFree(d_input);
cudaFree(sol);
return 0;
}
```

```
using CUDA
N = 64
# Block-level sum of `input` (length N) — the CUDA.jl analogue of the CUB
# BlockReduce example. Launch with blocks=1, threads=N÷2 (= 32, i.e. one warp),
# so the whole reduction can be done with warp shuffles and no shared memory.
function kernel(input, sol)
    tid = threadIdx().x
    # Each thread loads two elements, strided by the block size
    # (mirrors LoadDirectStriped<32> in the CUB version).
    val = input[tid] + input[tid + blockDim().x]
    # Warp-level tree reduction via shuffle-down; after the loop, lane 1
    # holds the full sum. Requires all 32 lanes active (full mask).
    offset = 16
    while offset > 0
        val += CUDA.shfl_down_sync(0xffffffff, val, offset)
        offset >>= 1
    end
    if tid == 1          # Julia thread indices are 1-based
        sol[1] = val
    end
    return
end

function main()
    input = CuArray(collect(0:N-1))
    println(input)
    # The result must live in device memory — a host scalar (`sol = 0`)
    # cannot be written to by the kernel.
    sol = CUDA.zeros(Int, 1)
    @cuda blocks=1 threads=N÷2 kernel(input, sol)
    synchronize()
    println(Array(sol)[1])   # expect sum(0:63) == 2016
end

main()
```