Could you post a full MWE?
One issue I found is that @Const doesn’t work if data is an OffsetArray.
using KernelAbstractions
using AMDGPU
using ROCKernels
using OffsetArrays
# One explicit 5-point-stencil update step, staged through workgroup-local memory.
#
# Each work-item copies its own cell of `data` into a local tile, work-items on
# the edge of the workgroup additionally copy the neighbouring halo cell, and
# after a workgroup barrier every work-item computes
#     out[i, j] = d + a * dt * (d2x / dx^2 + d2y / dy^2)
# from the local tile, where d2x and d2y are the second differences along each axis.
#
# Arguments:
#   out            - destination; written at every global index (i, j).
#   data           - source; read at (i, j) and at (i±1, j), (i, j±1), so it must
#                    be indexable one cell beyond the ndrange on every side
#                    (e.g. an OffsetArray carrying a halo).
#   a, dt, dx, dy  - scalars used in the update formula above (presumably the
#                    diffusion coefficient, time step, and grid spacings).
#
# NOTE(review): the edge tests compare against @groupsize(), so this presumably
# expects ndrange to be a multiple of the workgroup size in each dimension - confirm.
@kernel function diffuse_kabs_lmem!(out, data, a, dt, dx, dy)
    i, j = @index(Global, NTuple)
    li, lj = @index(Local, NTuple)

    # Local tile: one entry per work-item plus a one-cell halo on every side.
    lmem = @localmem eltype(data) (@groupsize()[1] + 2, @groupsize()[2] + 2)
    # Re-index the tile so that local index (li, lj) maps directly and the halo
    # sits at index 0 and groupsize + 1. The second axis must follow the second
    # workgroup dimension, otherwise non-square workgroups get mismatched axes.
    @uniform ldata = OffsetArray(
        lmem, 0:(@groupsize()[1] + 1), 0:(@groupsize()[2] + 1)
    )

    @inbounds begin
        ldata[li, lj] = data[i, j]
        # Work-items on the workgroup boundary also fetch the adjacent halo cell.
        # Corner halo cells are never loaded; the 5-point stencil does not read them.
        if li == 1
            ldata[li - 1, lj] = data[i - 1, j]
        end
        if li == @groupsize()[1]
            ldata[li + 1, lj] = data[i + 1, j]
        end
        if lj == 1
            ldata[li, lj - 1] = data[i, j - 1]
        end
        if lj == @groupsize()[2]
            ldata[li, lj + 1] = data[i, j + 1]
        end
    end

    # All loads into the local tile must complete before any neighbour is read.
    @synchronize()

    @inbounds begin
        dij = ldata[li, lj]
        dim1j = ldata[li - 1, lj]
        dijm1 = ldata[li, lj - 1]
        dip1j = ldata[li + 1, lj]
        dijp1 = ldata[li, lj + 1]
        dij += a * dt * (
            (dim1j - 2 * dij + dip1j) / dx^2 +
            (dijm1 - 2 * dij + dijp1) / dy^2
        )
        out[i, j] = dij
    end
end
# Number of interior cells along each dimension.
N = 64

# Instantiate the kernel for the ROCm device with a (16, 16) workgroup size.
diffusion_kernel_lmem = diffuse_kabs_lmem!(ROCDevice(), (16, 16))

# `out` covers the interior only; `domain` adds one extra cell on every side and
# is re-indexed to 0:(N + 1) so that interior cells sit at indices 1:N.
out = AMDGPU.zeros(N, N)
domain = OffsetArray(AMDGPU.zeros(N + 2, N + 2), 0:(N + 1), 0:(N + 1))

# Launch over the interior with a = dt = dx = dy = 0.01 and wait on the value
# returned by the launch.
diffusion_kernel_lmem(out, domain, 0.01, 0.01, 0.01, 0.01; ndrange=size(out)) |> wait
Works for me using the AMD backend.