Hello.
I have been hacking away at this for a few weeks now and still can’t figure out what is going on. I have a non-allocating 1D distance transform (Felzenswalb algorithm) that is easy to parallelize, as you can see below:
function transform!(f, output, v, z)
z[1] = -Inf
z[2] = Inf
k = 1
@inbounds for q in 2:length(f)
s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
while s ≤ z[k]
k -= 1
s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
end
k += 1
v[k] = q
z[k] = s
z[k + 1] = Inf
end
k = 1
@inbounds for q in 1:length(f)
while z[k + 1] < q
k += 1
end
output[q] = (q - v[k])^2 + f[v[k]]
end
end
function transform!(img::AbstractMatrix, output, v, z; threaded = true)
if threaded
Threads.@threads for i in CartesianIndices(@view(img[:, 1]))
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end
Threads.@threads for j in CartesianIndices(@view(img[1, :]))
@views transform!(
output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
)
end
else
for i in CartesianIndices(@view(img[:, 1]))
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end
for j in CartesianIndices(@view(img[1, :]))
@views transform!(
output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
)
end
end
end
I now want to take this 1D transform and parallelize it on the GPU via KernelAbstractions.jl. It seems like this should be very straightforward and I am assuming the 1D transform can be used directly in the 2D kernel, like:
@kernel function transform_kernel_matrix_naive(img, output, v, z)
i, j = @index(Global, NTuple)
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
@synchronize
@views transform!(output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end
function transform_naive!(img::AbstractGPUMatrix, output, v, z)
backend = get_backend(img)
kernel = transform_kernel_matrix_naive(backend)
kernel(img, output, v, z, ndrange = size(img))
KernelAbstractions.synchronize(backend)
end
Unfortunately, this is not actually working. The GPU version doesn’t produce correct results for arrays larger than size ~ (20, 20)
. I am guessing this has something to do with blocks and threads on the GPU or potentially synchronization issues, but I don’t really know what I am doing, so I am not sure where to go from here. Below is a full MWE
using ImageMorphology: distance_transform, feature_transform # to test against
using GPUArraysCore: AbstractGPUVector, AbstractGPUMatrix, AbstractGPUArray
using KernelAbstractions
using Metal, CUDA, AMDGPU
if CUDA.functional()
@info "Using CUDA"
CUDA.versioninfo()
dev = CuArray
elseif AMDGPU.functional()
@info "Using AMD"
AMDGPU.versioninfo()
dev = ROCArray
elseif Metal.functional()
@info "Using Metal"
Metal.versioninfo()
dev = MtlArray
else
@info "No GPU is available. Using CPU."
end
test_transform(img) = distance_transform(feature_transform(Bool.(img)))
test_transform(img::BitArray) = distance_transform(feature_transform(img))
boolean_indicator(f) = @. ifelse(f == 0, 1f10, 0f0)
# 1D CPU
function transform!(f, output, v, z)
z[1] = -Inf
z[2] = Inf
k = 1
@inbounds for q in 2:length(f)
s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
while s ≤ z[k]
k -= 1
s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
end
k += 1
v[k] = q
z[k] = s
z[k + 1] = Inf
end
k = 1
@inbounds for q in 1:length(f)
while z[k + 1] < q
k += 1
end
output[q] = (q - v[k])^2 + f[v[k]]
end
end
# 2D CPU (threaded and non-threaded)
function transform!(img::AbstractMatrix, output, v, z; threaded = true)
if threaded
Threads.@threads for i in CartesianIndices(@view(img[:, 1]))
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end
Threads.@threads for j in CartesianIndices(@view(img[1, :]))
@views transform!(
output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
)
end
else
for i in CartesianIndices(@view(img[:, 1]))
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end
for j in CartesianIndices(@view(img[1, :]))
@views transform!(
output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
)
end
end
end
# 2D GPU
@kernel function boolean_indicator_kernel(f, output)
i = @index(Global)
output[i] = ifelse(f[i] == 0, 1f10, 0f0)
end
function boolean_indicator(f::AbstractGPUArray)
backend = get_backend(f)
output = similar(f, Float32)
kernel = boolean_indicator_kernel(backend)
kernel(f, output, ndrange=size(f))
KernelAbstractions.synchronize(backend)
return output
end
@kernel function transform_kernel_matrix_naive(img, output, v, z)
i, j = @index(Global, NTuple)
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
@synchronize
@views transform!(output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end
function transform_naive!(img::AbstractGPUMatrix, output, v, z)
backend = get_backend(img)
kernel = transform_kernel_matrix_naive(backend)
kernel(img, output, v, z, ndrange = size(img), workgroupsize = (16, 16))
KernelAbstractions.synchronize(backend)
end
# Test
## CPU Test 2D - Correct
let
n = 100
img = rand([0f0, 1f0], n, n)
img_bool = boolean_indicator(img)
output = similar(img, Float32)
v = ones(Int32, size(img))
z = ones(Float32, size(img) .+ 1)
transform!(img_bool, output, v, z)
img_test = test_transform(img).^2
@info Array(output) ≈ img_test
output, img_test
end
## GPU Test 2D - Incorrect
let
n = 100
img = rand([0f0, 1f0], n, n)
img_bool = dev(boolean_indicator(img))
output = dev(similar(img, Float32))
v = dev(ones(Int32, size(img)))
z = dev(ones(Float32, size(img) .+ 1))
transform_naive!(img_bool, output, v, z)
img_test = test_transform(img).^2
@info Array(output) ≈ img_test
output, img_test
end
So, (1) am I correct in thinking that the 1D function can be directly utilized inside a kernel for 2D GPU arrays since it is non-allocating and easily parallelizable? If so, (2) what can I do to start debugging this issue?