# Convert a non-allocating distance transform algorithm to GPU using KernelAbstractions.jl

Hello.

I have been hacking away at this for a few weeks now and still can’t figure out what is going on. I have a non-allocating 1D distance transform (Felzenswalb algorithm) that is easy to parallelize, as you can see below:

``````function transform!(f, output, v,  z)
z = -Inf
z = Inf

k = 1
@inbounds for q in 2:length(f)
s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
while s ≤ z[k]
k -= 1
s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
end
k += 1
v[k] = q
z[k] = s
z[k + 1] = Inf
end

k = 1
@inbounds for q in 1:length(f)
while z[k + 1] < q
k += 1
end
output[q] = (q - v[k])^2 + f[v[k]]
end
end

function transform!(img::AbstractMatrix, output, v, z; threaded = true)
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

@views transform!(
output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
)
end
else
for i in CartesianIndices(@view(img[:, 1]))
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

for j in CartesianIndices(@view(img[1, :]))
@views transform!(
output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
)
end
end
end
``````

I now want to take this 1D transform and parallelize it on the GPU via KernelAbstractions.jl. It seems like this should be very straightforward and I am assuming the 1D transform can be used directly in the 2D kernel, like:

``````@kernel function transform_kernel_matrix_naive(img, output, v, z)
i, j = @index(Global, NTuple)
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])

@synchronize

@views transform!(output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end

function transform_naive!(img::AbstractGPUMatrix, output, v, z)
backend = get_backend(img)
kernel = transform_kernel_matrix_naive(backend)
kernel(img, output, v, z, ndrange = size(img))
KernelAbstractions.synchronize(backend)
end
``````

Unfortunately, this is not actually working. The GPU version doesn’t produce correct results for arrays larger than `size ~ (20, 20)`. I am guessing this has something to do with blocks and threads on the GPU or potentially synchronization issues, but I don’t really know what I am doing, so I am not sure where to go from here. Below is a full MWE

``````using ImageMorphology: distance_transform, feature_transform # to test against
using GPUArraysCore: AbstractGPUVector, AbstractGPUMatrix, AbstractGPUArray

using KernelAbstractions
using Metal, CUDA, AMDGPU

if CUDA.functional()
@info "Using CUDA"
CUDA.versioninfo()
dev = CuArray
elseif AMDGPU.functional()
@info "Using AMD"
AMDGPU.versioninfo()
dev = ROCArray
elseif Metal.functional()
@info "Using Metal"
Metal.versioninfo()
dev = MtlArray
else
@info "No GPU is available. Using CPU."
end

test_transform(img) = distance_transform(feature_transform(Bool.(img)))
test_transform(img::BitArray) = distance_transform(feature_transform(img))

boolean_indicator(f) = @. ifelse(f == 0, 1f10, 0f0)

# 1D CPU
function transform!(f, output, v,  z)
z = -Inf
z = Inf

k = 1
@inbounds for q in 2:length(f)
s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
while s ≤ z[k]
k -= 1
s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
end
k += 1
v[k] = q
z[k] = s
z[k + 1] = Inf
end

k = 1
@inbounds for q in 1:length(f)
while z[k + 1] < q
k += 1
end
output[q] = (q - v[k])^2 + f[v[k]]
end
end

function transform!(img::AbstractMatrix, output, v, z; threaded = true)
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

@views transform!(
output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
)
end
else
for i in CartesianIndices(@view(img[:, 1]))
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

for j in CartesianIndices(@view(img[1, :]))
@views transform!(
output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
)
end
end
end

# 2D GPU

@kernel function boolean_indicator_kernel(f, output)
i = @index(Global)
output[i] = ifelse(f[i] == 0, 1f10, 0f0)
end

function boolean_indicator(f::AbstractGPUArray)
backend = get_backend(f)
output = similar(f, Float32)
kernel = boolean_indicator_kernel(backend)
kernel(f, output, ndrange=size(f))
KernelAbstractions.synchronize(backend)
return output
end

@kernel function transform_kernel_matrix_naive(img, output, v, z)
i, j = @index(Global, NTuple)
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])

@synchronize

@views transform!(output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end

function transform_naive!(img::AbstractGPUMatrix, output, v, z)
backend = get_backend(img)
kernel = transform_kernel_matrix_naive(backend)
kernel(img, output, v, z, ndrange = size(img), workgroupsize = (16, 16))
KernelAbstractions.synchronize(backend)
end

# Test

## CPU Test 2D - Correct
let
n = 100
img = rand([0f0, 1f0], n, n)

img_bool = boolean_indicator(img)
output = similar(img, Float32)
v = ones(Int32, size(img))
z = ones(Float32, size(img) .+ 1)

transform!(img_bool, output, v, z)
img_test = test_transform(img).^2

@info Array(output) ≈ img_test
output, img_test
end

## GPU Test 2D - Incorrect
let
n = 100
img = rand([0f0, 1f0], n, n)

img_bool = dev(boolean_indicator(img))
output = dev(similar(img, Float32))
v = dev(ones(Int32, size(img)))
z = dev(ones(Float32, size(img) .+ 1))

transform_naive!(img_bool, output, v, z)
img_test = test_transform(img).^2

@info Array(output) ≈ img_test
output, img_test
end
``````

So, (1) am I correct in thinking that the 1D function can be directly utilized inside a kernel for 2D GPU arrays since it is non-allocating and easily parallelizable? If so, (2) what can I do to start debugging this issue?

1 Like

I see that you are aware of the risk of memory races since you split your `@threads` code into two separate for loops with `@threads`.

Doing that you gurantuee that `transform!` ran for all indices before you performed the reads in the second loop.

In your kernel you seem to use `@synchronize` to achieve the same effect.

`@synchronize` has one important caveat! It only synchronizes memory read and writes with respect to a single workgroup! That means you have the potential for memory operations to be miss-ordered.

If you can restructure your code to operate over tiles, then you may have to split the kernel into two seperate kernels.

2 Likes

Does CUDA.jl or KernelAbstractions.jl have any docs where I can read up on this? The idea of tiles is still mysterious to me.

Also, thanks for the reply and your help already! I love all the work you and others have done on the GPU side of things.

A workgroup (or block in CUDA parlance) is a set of workgroupitems/threads being mapped onto a single compute unit.

This means that communication within a workgroup is faster and that they have access to localmem/shared memory.

Synchronization is only with respect to the workgroup, since many workgroups may be scheduled concurrently and doing a synchronization across the entire device is really costly.

1 Like

Okay, I think I made some progress. Is this the direction you were pointing towards? If so, could you help me diagnose what I am missing? Because currently, this works on my computer for matrices less than or equal to `size = (1024, 1024)`. After sizes larger than that I get this error `ArgumentError: Number of threads in group (1025) should not exceed 1024`

``````@kernel function transform_kernel_cols(img, output, v, z)
i = @index(Global)
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

@kernel function transform_kernel_rows(img, output, v, z)
j = @index(Global)
@views transform!(img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end

@kernel function copy_kernel!(A, @Const(B))
I = @index(Global)
@inbounds A[I] = B[I]
end

function mycopy!(A, B)
backend = get_backend(A)
@assert size(A) == size(B)
@assert get_backend(B) == backend

kernel = copy_kernel!(backend)
kernel(A, B, ndrange=length(A))
end

function transform!(img::AbstractGPUMatrix, output, v, z)
backend = get_backend(img)
kernel = transform_kernel_cols(backend)
kernel(img, output, v, z, ndrange = size(img, 1), workgroupsize = size(img, 1))

B = similar(output)
mycopy!(B, output)

kernel2 = transform_kernel_rows(backend)
kernel2(B, output, v, z, ndrange = size(img, 2), workgroupsize = size(img, 2))
KernelAbstractions.synchronize(backend)
end

# Test
let
ns = [10, 64, 100, 128, 1000, 1025]
for n in ns
img = rand([0f0, 1f0], n, n)

img_bool = dev(boolean_indicator(img))
output = dev(similar(img, Float32))
v = dev(ones(Int32, size(img)))
z = dev(ones(Float32, size(img) .+ 1))

transform!(img_bool, output, v, z)
img_test = test_transform(img).^2

@info Array(output) ≈ img_test
output, img_test
end
end
``````

Never mind, this is the fix and this works!

``````function transform!(img::AbstractGPUMatrix, output, v, z; maxworkgroup = 1024)
backend = get_backend(img)
kernel = transform_kernel_cols(backend)
kernel(img, output, v, z, ndrange = size(img, 1), workgroupsize = min(maxworkgroup, size(img, 1)))

B = similar(output)
mycopy!(B, output)

kernel2 = transform_kernel_rows(backend)
kernel2(B, output, v, z, ndrange = size(img, 2), workgroupsize = min(maxworkgroup, size(img, 2)))
KernelAbstractions.synchronize(backend)
end
``````

You could also not provide a workgroupsize and just let KernelAbstractions choose one for you.

Also note that KA has a copyto! function

Wow, that is beautiful. The code looks so similar to the original CPU version now

``````@kernel function transform_kernel_cols(img, output, v, z)
i = @index(Global)
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

@kernel function transform_kernel_rows(img, output, v, z)
j = @index(Global)
@views transform!(img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end

function transform!(img::AbstractGPUMatrix, output, v, z)
backend = get_backend(img)
kernel_cols = transform_kernel_cols(backend)
kernel_cols(img, output, v, z, ndrange = size(img, 1))

B = similar(output)
copyto!(B, output)

kernel_rows = transform_kernel_rows(backend)
kernel_rows(B, output, v, z, ndrange = size(img, 2))
KernelAbstractions.synchronize(backend)
end
``````
2 Likes