Convert a non-allocating distance transform algorithm to GPU using KernelAbstractions.jl

Dale_James_Black · November 2, 2023, 9:04pm

Hello.

I have been hacking away at this for a few weeks now and still can’t figure out what is going on. I have a non-allocating 1D distance transform (Felzenswalb algorithm) that is easy to parallelize, as you can see below:

function transform!(f, output, v,  z)
    z[1] = -Inf
    z[2] = Inf

    k = 1
    @inbounds for q in 2:length(f)
        s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
        while s ≤ z[k]
            k -= 1
            s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
        end
        k += 1
        v[k] = q
        z[k] = s
        z[k + 1] = Inf
    end

    k = 1
    @inbounds for q in 1:length(f)
        while z[k + 1] < q
            k += 1
        end
        output[q] = (q - v[k])^2 + f[v[k]]
    end
end

function transform!(img::AbstractMatrix, output, v, z; threaded = true)
	if threaded
		Threads.@threads for i in CartesianIndices(@view(img[:, 1]))
		    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
		end
		
		Threads.@threads for j in CartesianIndices(@view(img[1, :]))
		    @views transform!(
				output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
			)
		end
	else
		for i in CartesianIndices(@view(img[:, 1]))
		    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
		end
		
		for j in CartesianIndices(@view(img[1, :]))
		    @views transform!(
				output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
			)
		end
	end
end

I now want to take this 1D transform and parallelize it on the GPU via KernelAbstractions.jl. It seems like this should be very straightforward and I am assuming the 1D transform can be used directly in the 2D kernel, like:

@kernel function transform_kernel_matrix_naive(img, output, v, z)
	i, j = @index(Global, NTuple)
	@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])

	@synchronize

	@views transform!(output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end

function transform_naive!(img::AbstractGPUMatrix, output, v, z)
    backend = get_backend(img)
    kernel = transform_kernel_matrix_naive(backend)
    kernel(img, output, v, z, ndrange = size(img))
    KernelAbstractions.synchronize(backend)
end

Unfortunately, this is not actually working. The GPU version doesn’t produce correct results for arrays larger than size ~ (20, 20). I am guessing this has something to do with blocks and threads on the GPU or potentially synchronization issues, but I don’t really know what I am doing, so I am not sure where to go from here. Below is a full MWE

using ImageMorphology: distance_transform, feature_transform # to test against
using GPUArraysCore: AbstractGPUVector, AbstractGPUMatrix, AbstractGPUArray

using KernelAbstractions
using Metal, CUDA, AMDGPU

if CUDA.functional()
	@info "Using CUDA"
	CUDA.versioninfo()
	dev = CuArray
elseif AMDGPU.functional()
	@info "Using AMD"
	AMDGPU.versioninfo()
	dev = ROCArray
elseif Metal.functional()
	@info "Using Metal"
	Metal.versioninfo()
	dev = MtlArray
else
    @info "No GPU is available. Using CPU."
end


test_transform(img) = distance_transform(feature_transform(Bool.(img)))
test_transform(img::BitArray) = distance_transform(feature_transform(img))

boolean_indicator(f) = @. ifelse(f == 0, 1f10, 0f0)

# 1D CPU
function transform!(f, output, v,  z)
    z[1] = -Inf
    z[2] = Inf

    k = 1
    @inbounds for q in 2:length(f)
        s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
        while s ≤ z[k]
            k -= 1
            s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
        end
        k += 1
        v[k] = q
        z[k] = s
        z[k + 1] = Inf
    end

    k = 1
    @inbounds for q in 1:length(f)
        while z[k + 1] < q
            k += 1
        end
        output[q] = (q - v[k])^2 + f[v[k]]
    end
end

# 2D CPU (threaded and non-threaded)
function transform!(img::AbstractMatrix, output, v, z; threaded = true)
	if threaded
		Threads.@threads for i in CartesianIndices(@view(img[:, 1]))
		    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
		end
		
		Threads.@threads for j in CartesianIndices(@view(img[1, :]))
		    @views transform!(
				output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
			)
		end
	else
		for i in CartesianIndices(@view(img[:, 1]))
		    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
		end
		
		for j in CartesianIndices(@view(img[1, :]))
		    @views transform!(
				output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
			)
		end
	end
end

# 2D GPU

@kernel function boolean_indicator_kernel(f, output)
    i = @index(Global)
    output[i] = ifelse(f[i] == 0, 1f10, 0f0)
end

function boolean_indicator(f::AbstractGPUArray)
    backend = get_backend(f)
    output = similar(f, Float32)
    kernel = boolean_indicator_kernel(backend)
    kernel(f, output, ndrange=size(f))
    KernelAbstractions.synchronize(backend)
    return output
end

@kernel function transform_kernel_matrix_naive(img, output, v, z)
	i, j = @index(Global, NTuple)
	@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])

	@synchronize

	@views transform!(output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end

function transform_naive!(img::AbstractGPUMatrix, output, v, z)
    backend = get_backend(img)
    kernel = transform_kernel_matrix_naive(backend)
    kernel(img, output, v, z, ndrange = size(img), workgroupsize = (16, 16))
    KernelAbstractions.synchronize(backend)
end

# Test

## CPU Test 2D - Correct
let
    n = 100
    img = rand([0f0, 1f0], n, n)
	
    img_bool = boolean_indicator(img)
	output = similar(img, Float32)
	v = ones(Int32, size(img))
	z = ones(Float32, size(img) .+ 1)
	
	transform!(img_bool, output, v, z)
    img_test = test_transform(img).^2
	
    @info Array(output) ≈ img_test
	output, img_test
end

## GPU Test 2D - Incorrect
let
    n = 100
    img = rand([0f0, 1f0], n, n)
	
    img_bool = dev(boolean_indicator(img))
	output = dev(similar(img, Float32))
	v = dev(ones(Int32, size(img)))
	z = dev(ones(Float32, size(img) .+ 1))
	
	transform_naive!(img_bool, output, v, z)
    img_test = test_transform(img).^2
	
    @info Array(output) ≈ img_test
	output, img_test
end

So, (1) am I correct in thinking that the 1D function can be directly utilized inside a kernel for 2D GPU arrays since it is non-allocating and easily parallelizable? If so, (2) what can I do to start debugging this issue?

vchuravy · November 3, 2023, 4:00pm

I see that you are aware of the risk of memory races since you split your @threads code into two separate for loops with @threads.

Doing that you gurantuee that transform! ran for all indices before you performed the reads in the second loop.

In your kernel you seem to use @synchronize to achieve the same effect.

@synchronize has one important caveat! It only synchronizes memory read and writes with respect to a single workgroup! That means you have the potential for memory operations to be miss-ordered.

If you can restructure your code to operate over tiles, then you may have to split the kernel into two seperate kernels.

Dale_James_Black · November 3, 2023, 5:00pm

Does CUDA.jl or KernelAbstractions.jl have any docs where I can read up on this? The idea of tiles is still mysterious to me.

Also, thanks for the reply and your help already! I love all the work you and others have done on the GPU side of things.

vchuravy · November 3, 2023, 6:15pm

A workgroup (or block in CUDA parlance) is a set of workgroupitems/threads being mapped onto a single compute unit.

This means that communication within a workgroup is faster and that they have access to localmem/shared memory.

Synchronization is only with respect to the workgroup, since many workgroups may be scheduled concurrently and doing a synchronization across the entire device is really costly.

Dale_James_Black · November 7, 2023, 6:35pm

Okay, I think I made some progress. Is this the direction you were pointing towards? If so, could you help me diagnose what I am missing? Because currently, this works on my computer for matrices less than or equal to size = (1024, 1024). After sizes larger than that I get this error ArgumentError: Number of threads in group (1025) should not exceed 1024

@kernel function transform_kernel_cols(img, output, v, z)
	i = @index(Global)
	@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

@kernel function transform_kernel_rows(img, output, v, z)
	j = @index(Global)
	@views transform!(img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end

@kernel function copy_kernel!(A, @Const(B))
    I = @index(Global)
    @inbounds A[I] = B[I]
end

function mycopy!(A, B)
    backend = get_backend(A)
    @assert size(A) == size(B)
    @assert get_backend(B) == backend

    kernel = copy_kernel!(backend)
    kernel(A, B, ndrange=length(A))
end

function transform!(img::AbstractGPUMatrix, output, v, z)
    backend = get_backend(img)
    kernel = transform_kernel_cols(backend)
    kernel(img, output, v, z, ndrange = size(img, 1), workgroupsize = size(img, 1))
	
    B = similar(output)
    mycopy!(B, output)
    
    kernel2 = transform_kernel_rows(backend)
    kernel2(B, output, v, z, ndrange = size(img, 2), workgroupsize = size(img, 2))
    KernelAbstractions.synchronize(backend)
end

# Test
let
    ns = [10, 64, 100, 128, 1000, 1025]
	for n in ns
	    img = rand([0f0, 1f0], n, n)
		
	    img_bool = dev(boolean_indicator(img))
		output = dev(similar(img, Float32))
		v = dev(ones(Int32, size(img)))
		z = dev(ones(Float32, size(img) .+ 1))
		
		transform!(img_bool, output, v, z)
	    img_test = test_transform(img).^2
		
	    @info Array(output) ≈ img_test
		output, img_test
	end
end

Dale_James_Black · November 7, 2023, 6:41pm

Never mind, this is the fix and this works!

function transform!(img::AbstractGPUMatrix, output, v, z; maxworkgroup = 1024)
    backend = get_backend(img)
    kernel = transform_kernel_cols(backend)
    kernel(img, output, v, z, ndrange = size(img, 1), workgroupsize = min(maxworkgroup, size(img, 1)))
	
    B = similar(output)
    mycopy!(B, output)
    
    kernel2 = transform_kernel_rows(backend)
    kernel2(B, output, v, z, ndrange = size(img, 2), workgroupsize = min(maxworkgroup, size(img, 2)))
    KernelAbstractions.synchronize(backend)
end

vchuravy · November 7, 2023, 7:13pm

You could also not provide a workgroupsize and just let KernelAbstractions choose one for you.

Also note that KA has a copyto! function

Dale_James_Black · November 7, 2023, 7:21pm

Wow, that is beautiful. The code looks so similar to the original CPU version now

@kernel function transform_kernel_cols(img, output, v, z)
	i = @index(Global)
	@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

@kernel function transform_kernel_rows(img, output, v, z)
	j = @index(Global)
	@views transform!(img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end

function transform!(img::AbstractGPUMatrix, output, v, z)
    backend = get_backend(img)
    kernel_cols = transform_kernel_cols(backend)
    kernel_cols(img, output, v, z, ndrange = size(img, 1))
	
    B = similar(output)
    copyto!(B, output)
    
    kernel_rows = transform_kernel_rows(backend)
    kernel_rows(B, output, v, z, ndrange = size(img, 2))
    KernelAbstractions.synchronize(backend)
end

Topic		Replies	Views
GPU kernel that is ~20x slower than corresponding CPU version GPU	9	727	November 10, 2023
How should I approach GPU programming? GPU	27	2800	April 17, 2020
Converting an image rotation demo from JuliaCon 2021 (AMD -> NVIDIA) GPU cuda	3	393	August 29, 2022
Writing simple kernel functions (EM algorithm) GPU first-steps	7	1726	November 28, 2019
How to use distance_transform in Images.jl Visualization	2	1007	December 21, 2018

Convert a non-allocating distance transform algorithm to GPU using KernelAbstractions.jl

Related topics