I’m trying to build a tensor `A`

of size `3 x ℓ x m x n`

where `A[:, i, j, k]`

is a 3D coordinate at (i, j, k) in a uniform grid for [0, 1]^3.

On the CPU this code looks like this:

```
function mesh(n)
r = range(0.0f0, 1.0f0, length=n)
grid_tuples = [(x, y, z) for x in r, y in r, z in r]
grid = reshape(reinterpret(Float32, grid_tuples), 3, n, n, n)
return collect(grid)
end
```

I want to construct it on the GPU however to avoid copying gigabytes of data from CPU -> GPU. Is there a simple way to do this?

side note: Apparently this is not slow because of copying things to the GPU, but rather because of reinterpreted arrays being very slow.

I ended up writing the following in case anybody needs a snippet

```
"""
Constructs a tensor A of size 3 × ℓ × m × n on the GPU
where A[:, i, j, k] is the 3D coordinate of grid point
(i, j, k) of the uniform grid of [0, 1]^3 with n grid
points per dimension.
"""
function grid_coords(n)
data = CuArray{Float32}(undef, 3, n, n, n)
function kernel(data::CuDeviceArray)
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
x_div, x_idx = divrem(i - 1, n)
y_div, y_idx = divrem(x_div, n)
z_div, z_idx = divrem(y_div, n)
delta = 1.0f0 / Float32(n)
@inbounds begin
data[1, x_idx + 1, y_idx + 1, z_idx + 1] = (.5f0 + x_idx) * delta
data[2, x_idx + 1, y_idx + 1, z_idx + 1] = (.5f0 + y_idx) * delta
data[3, x_idx + 1, y_idx + 1, z_idx + 1] = (.5f0 + z_idx) * delta
end
return
end
function configurator(kernel)
config = CUDAnative.launch_configuration(kernel.fun)
threads = min(n * n * n, config.threads)
blocks = cld(n * n * n, threads)
return (threads=threads, blocks=blocks)
end
@cuda name="grid_coords" config=configurator kernel(data)
return data
end
```