I have a 2D distance transform algorithm on the GPU that is around ~20x slower than its corresponding CPU version. I don't know if this is an issue with the way the kernel was written or whether it is to be expected on Metal.jl. Any hints would be helpful, especially if you see something about the kernel itself, since I don't know much about profiling.
1D non-allocating base function
"""
    transform!(f::AbstractVector, output, v, z)

Non-allocating 1D squared-Euclidean distance transform of the sampled
function `f` (Felzenszwalb & Huttenlocher lower-envelope algorithm),
writing the result into `output`.

- `v` — scratch buffer of parabola centers; **must satisfy `v[1] == 1` on
  entry** (the caller is expected to pre-fill it with ones; it is read at
  `f[v[k]]` before ever being written).
- `z` — scratch buffer of interval boundaries; needs `length(f) + 1` slots.

Returns `output`.
"""
function transform!(f::AbstractVector, output, v, z)
    # z[k]..z[k+1] delimits the interval where parabola v[k] is lowest.
    z[1] = -Inf32
    z[2] = Inf32
    k = 1  # index of the rightmost parabola currently in the lower envelope
    @inbounds for q in 2:length(f)
        # Horizontal intersection of the parabola rooted at q with the
        # rightmost parabola of the envelope.
        s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2 * q - 2 * v[k])
        while s ≤ z[k]
            # The new parabola dominates the previous one over its whole
            # interval; pop it and retry against the next one down.
            k -= 1
            s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2 * q - 2 * v[k])
        end
        k += 1
        v[k] = q
        z[k] = s
        z[k + 1] = Inf32  # sentinel so the query loop below always terminates
    end
    # Query pass: walk the envelope left-to-right and evaluate it at each q.
    k = 1
    @inbounds for q in 1:length(f)
        while z[k + 1] < q
            k += 1
        end
        output[q] = (q - v[k])^2 + f[v[k]]
    end
    return output
end
2D GPU Kernel
# First GPU pass: one workitem per row, 1D transform along each row of `img`.
# BUG FIX: reset the scratch slices like the rows kernel does. The 1D
# transform assumes v[1] == 1, and both passes mutate `v`/`z`, so without the
# fill! every call after the first (e.g. each @benchmark iteration) reads
# stale scratch state and produces wrong results.
@kernel function transform_kernel_cols!(img, output, v, z)
    i = @index(Global)
    @views transform!(img[i, :], output[i, :], fill!(v[i, :], 1), fill!(z[i, :], 1))
end
# Second GPU pass: one workitem per column, 1D transform along each column.
# The scratch slices are reset to ones first because the 1D transform
# expects v[1] == 1 and the previous pass has mutated the buffers.
@kernel function transform_kernel_rows!(img, output, v, z)
    col = @index(Global)
    vcol = fill!(view(v, :, col), 1)
    zcol = fill!(view(z, :, col), 1)
    transform!(view(img, :, col), view(output, :, col), vcol, zcol)
end
"""
    transform!(img::AbstractGPUMatrix, output, v, z; scratch = similar(output))

Two-pass 2D squared-Euclidean distance transform on the GPU: a 1D transform
along every row, then along every column of the intermediate result.

`scratch` holds the intermediate between the two passes; pass a preallocated
buffer to avoid the per-call `similar` allocation (the original always
allocated it, which shows up in the benchmark's alloc count).
"""
function transform!(img::AbstractGPUMatrix, output, v, z; scratch = similar(output))
    backend = get_backend(img)
    # Pass 1: rows (one workitem per row).
    kernel_cols = transform_kernel_cols!(backend)
    kernel_cols(img, output, v, z, ndrange = size(img, 1))
    # Copy the intermediate so pass 2 never reads and writes the same array.
    # NOTE(review): this assumes kernel launches and copyto! are stream-ordered
    # on the backend, as in the original — TODO confirm for Metal.jl.
    copyto!(scratch, output)
    # Pass 2: columns (one workitem per column).
    kernel_rows = transform_kernel_rows!(backend)
    kernel_rows(scratch, output, v, z, ndrange = size(img, 2))
    KernelAbstractions.synchronize(backend)
    return output
end
2D CPU version
"""
    transform!(img::AbstractMatrix, output, v, z; threaded = true)

Two-pass 2D squared-Euclidean distance transform on the CPU: a 1D transform
along every row of `img`, then in place along every column of `output`.

`v` (`size(img)`, integer) and `z` (`size(img) .+ 1`, float) are scratch
buffers. Both passes reset their scratch slices with `fill!` — the 1D
transform assumes `v[1] == 1`, so without the reset in the first pass
(missing in the original) repeated calls with the same buffers read stale
state left by the previous column pass.
"""
function transform!(img::AbstractMatrix, output, v, z; threaded = true)
    if threaded
        Threads.@threads for i in axes(img, 1)
            @views transform!(img[i, :], output[i, :], fill!(v[i, :], 1), fill!(z[i, :], 1))
        end
        Threads.@threads for j in axes(img, 2)
            # Second pass runs in place: output is both source and destination.
            @views transform!(
                output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
            )
        end
    else
        for i in axes(img, 1)
            @views transform!(img[i, :], output[i, :], fill!(v[i, :], 1), fill!(z[i, :], 1))
        end
        for j in axes(img, 2)
            @views transform!(
                output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
            )
        end
    end
    return output
end
Timing results
# CPU
# CPU benchmark: n×n random binary image with preallocated scratch buffers.
let
    n = 100
    raw = rand([0f0, 1f0], n, n)
    marked = boolean_indicator(raw)
    out = similar(raw, eltype(raw))
    vbuf = ones(Int32, size(raw))
    zbuf = ones(eltype(raw), size(raw) .+ 1)
    tfm = @benchmark transform!($marked, $out, $vbuf, $zbuf)
end
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  43.417 μs … 3.865 ms  ┊ GC (min … max): 0.00% … 94.38%
 Time  (median):     64.750 μs             ┊ GC (median):    0.00%
 Time  (mean ± σ):   70.336 μs ± 45.136 μs ┊ GC (mean ± σ):  0.52% ± 0.94%
 [histogram bars garbled in transit]
  43.4 μs        Histogram: frequency by time         199 μs <
 Memory estimate: 4.59 KiB, allocs estimate: 47.
# GPU
# GPU benchmark.
# BUG FIX: the original allocated a hard-coded 10×10 image here even though
# n = 100, so the GPU timing was measured on a different (tiny) problem than
# the 100×100 CPU benchmark and the two numbers were not comparable.
let
    n = 100
    img = rand!(KernelAbstractions.allocate(backend, Float32, n, n)) .> 0.5f0
    img_bool = boolean_indicator(img)
    output = similar(img, Float32)
    v = KernelAbstractions.ones(backend, Int32, size(img))
    z = KernelAbstractions.ones(backend, Float32, size(img) .+ 1)
    tfm = @benchmark transform!($img_bool, $output, $v, $z)
end
BenchmarkTools.Trial: 4069 samples with 1 evaluation.
 Range (min … max):  644.625 μs … 26.387 ms  ┊ GC (min … max): 0.00% … 62.32%
 Time  (median):       1.145 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):     1.224 ms ± 595.827 μs ┊ GC (mean ± σ):  0.33% ± 0.98%
 [histogram bars garbled in transit]
  645 μs          Histogram: frequency by time          2.56 ms <
 Memory estimate: 18.14 KiB, allocs estimate: 763.
MWE: a complete runnable example is in the collapsed "Julia Code" section below.