GPU kernel that is ~20x slower than corresponding CPU version

I have a 2D distance transform algorithm on the GPU that is around ~20x slower than its corresponding CPU version. I don’t know whether this is an issue with how the kernel was written or whether it is to be expected with Metal.jl. Any hints would be helpful, especially about the kernel itself, since I don’t know much about profiling.

1D non-allocating base function

function transform!(f::AbstractVector, output, v,  z)
    z[1] = -Inf32
    z[2] = Inf32

    k = 1
    @inbounds for q in 2:length(f)
        s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
        while s ≀ z[k]
            k -= 1
            s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
        end
        k += 1
        v[k] = q
        z[k] = s
        z[k + 1] = Inf32
    end

    k = 1
    @inbounds for q in 1:length(f)
        while z[k + 1] < q
            k += 1
        end
        output[q] = (q - v[k])^2 + f[v[k]]
    end
end

2D GPU Kernel

@kernel function transform_kernel_cols!(img, output, v, z)
	i = @index(Global)
	@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

@kernel function transform_kernel_rows!(img, output, v, z)
	j = @index(Global)
	@views transform!(img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end

function transform!(img::AbstractGPUMatrix, output, v, z)
	backend = get_backend(img)
	kernel_cols = transform_kernel_cols!(backend)
	kernel_cols(img, output, v, z, ndrange = size(img, 1))
	
	B = similar(output)
	copyto!(B, output)
	
	kernel_rows = transform_kernel_rows!(backend)
	kernel_rows(B, output, v, z, ndrange = size(img, 2))
	KernelAbstractions.synchronize(backend)
end

2D CPU version

function transform!(img::AbstractMatrix, output, v, z; threaded = true)
	if threaded
		Threads.@threads for i in CartesianIndices(@view(img[:, 1]))
		    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
		end
		
		Threads.@threads for j in CartesianIndices(@view(img[1, :]))
		    @views transform!(
				output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
			)
		end
	else
		for i in CartesianIndices(@view(img[:, 1]))
		    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
		end
		
		for j in CartesianIndices(@view(img[1, :]))
		    @views transform!(
				output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
			)
		end
	end
end

Timing results

# CPU
let
	n = 100
	img = rand([0f0, 1f0], n, n)
	img_bool = boolean_indicator(img)
	output = similar(img, eltype(img))
	v = ones(Int32, size(img))
	z = ones(eltype(img), size(img) .+ 1)
	tfm = @benchmark transform!($img_bool, $output, $v, $z)
end

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  43.417 ΞΌs …  3.865 ms  β”Š GC (min … max): 0.00% … 94.38%
 Time  (median):     64.750 ΞΌs              β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   70.336 ΞΌs Β± 45.136 ΞΌs  β”Š GC (mean Β± Οƒ):  0.52% Β±  0.94%

      β–„β–ˆβ–…β–„β–‚β–‚β–ƒ                                                  
  β–β–‚β–‚β–„β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–†β–„β–ƒβ–ƒβ–‚β–‚β–‚β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β– β–‚
  43.4 ΞΌs         Histogram: frequency by time         199 ΞΌs <

 Memory estimate: 4.59 KiB, allocs estimate: 47.

# GPU
let
	n = 100
	img = rand!(KernelAbstractions.allocate(backend, Float32, 10, 10)) .> 0.5f0
	img_bool = boolean_indicator(img)
	output = similar(img, Float32)
	v = KernelAbstractions.ones(backend, Int32, size(img))
	z = KernelAbstractions.ones(backend, Float32, size(img) .+ 1)
	
	tfm = @benchmark transform!($img_bool, $output, $v, $z)
end

BenchmarkTools.Trial: 4069 samples with 1 evaluation.
 Range (min … max):  644.625 ΞΌs …  26.387 ms  β”Š GC (min … max): 0.00% … 62.32%
 Time  (median):       1.145 ms               β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):     1.224 ms Β± 595.827 ΞΌs  β”Š GC (mean Β± Οƒ):  0.33% Β±  0.98%

       β–β–‡β–‡β–ˆβ–‡β–„β–…β–„β–‚β–‚β–‚β–„β–„β–„β–ƒβ–‚β–‚β–                                        
  β–‚β–β–‚β–…β–†β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–…β–…β–…β–„β–„β–…β–…β–…β–†β–…β–„β–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–‚β–ƒβ–‚β–‚β–ƒβ–‚β–‚β–β–‚β–‚β–‚β–‚β–‚β–β–‚β–‚ β–„
  645 ΞΌs           Histogram: frequency by time         2.56 ms <

 Memory estimate: 18.14 KiB, allocs estimate: 763.

MWE

using GPUArraysCore: AbstractGPUVector, AbstractGPUMatrix, AbstractGPUArray
using KernelAbstractions
using Metal, CUDA, AMDGPU, oneAPI
using BenchmarkTools

if CUDA.functional()
	@info "Using CUDA"
	CUDA.versioninfo()
	backend = CUDABackend()
elseif AMDGPU.functional()
	@info "Using AMD"
	AMDGPU.versioninfo()
	backend = ROCBackend()
elseif oneAPI.functional()
	@info "Using oneAPI"
	oneAPI.versioninfo()
	backend = oneBackend()
elseif Metal.functional()
	@info "Using Metal"
	Metal.versioninfo()
	backend = MetalBackend()
else
    @info "No GPU is available. Using CPU."
	backend = CPU()
end

boolean_indicator(f) = @. ifelse(f == 0, 1f10, 0f0)

@kernel function boolean_indicator_kernel(f, output)
    i = @index(Global)
    output[i] = ifelse(f[i] == 0, 1f10, 0f0)
end

function boolean_indicator(f::AbstractGPUArray)
    backend = get_backend(f)
    output = similar(f, Float32)
    kernel = boolean_indicator_kernel(backend)
    kernel(f, output, ndrange=size(f))
    KernelAbstractions.synchronize(backend)
    return output
end

# 1D
function transform!(f::AbstractVector, output, v,  z)
    z[1] = -Inf32
    z[2] = Inf32

    k = 1
    @inbounds for q in 2:length(f)
        s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
        while s ≀ z[k]
            k -= 1
            s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
        end
        k += 1
        v[k] = q
        z[k] = s
        z[k + 1] = Inf32
    end

    k = 1
    @inbounds for q in 1:length(f)
        while z[k + 1] < q
            k += 1
        end
        output[q] = (q - v[k])^2 + f[v[k]]
    end
end

# 2D CPU
function transform!(img::AbstractMatrix, output, v, z; threaded = true)
	if threaded
		Threads.@threads for i in CartesianIndices(@view(img[:, 1]))
		    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
		end
		
		Threads.@threads for j in CartesianIndices(@view(img[1, :]))
		    @views transform!(
				output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
			)
		end
	else
		for i in CartesianIndices(@view(img[:, 1]))
		    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
		end
		
		for j in CartesianIndices(@view(img[1, :]))
		    @views transform!(
				output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
			)
		end
	end
end

# 2D GPU
@kernel function transform_kernel_cols!(img, output, v, z)
	i = @index(Global)
	@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

@kernel function transform_kernel_rows!(img, output, v, z)
	j = @index(Global)
	@views transform!(img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end

function transform!(img::AbstractGPUMatrix, output, v, z)
    backend = get_backend(img)
    kernel_cols = transform_kernel_cols!(backend)
    kernel_cols(img, output, v, z, ndrange = size(img, 1))
	
    B = similar(output)
	copyto!(B, output)
    
    kernel_rows = transform_kernel_rows!(backend)
    kernel_rows(B, output, v, z, ndrange = size(img, 2))
    KernelAbstractions.synchronize(backend)
end

# Timing
let
        # CPU
	n = 100
	img = rand([0f0, 1f0], n, n)
	img_bool = boolean_indicator(img)
	output = similar(img, eltype(img))
	v = ones(Int32, size(img))
	z = ones(eltype(img), size(img) .+ 1)
	tfm = @benchmark transform!($img_bool, $output, $v, $z)
end

let
        # GPU
	n = 100
	img = rand!(KernelAbstractions.allocate(backend, Float32, 10, 10)) .> 0.5f0
	img_bool = boolean_indicator(img)
	output = similar(img, Float32)
	v = KernelAbstractions.ones(backend, Int32, size(img))
	z = KernelAbstractions.ones(backend, Float32, size(img) .+ 1)
	
	tfm = @benchmark transform!($img_bool, $output, $v, $z)
end

I’m working on a CUDA machine (i9-13900K + RTX A5000), and I simply modified the GPU function to:

function transform_gpu!(img::AbstractGPUMatrix, output, v, z)
    s1, s2 = size(img)
    backend = get_backend(img)

    # instantiate the kernels defined above for this backend
    kernel_cols = transform_kernel_cols!(backend)
    kernel_rows = transform_kernel_rows!(backend)

    kernel_cols(img, output, v, z, ndrange = s1)

    copyto!(img, output)

    kernel_rows(img, output, v, z, ndrange = s2)
    # KernelAbstractions.synchronize(backend)
end

And I added a copyto! to the CPU version to maintain correctness:

function transform_wenbo!(img::AbstractMatrix, output, v, z; threaded = true)
	if threaded
		Threads.@threads for i in CartesianIndices(@view(img[:, 1]))
		    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
		end

		copyto!(img, output)

		Threads.@threads for j in CartesianIndices(@view(img[1, :]))
		    @views transform!(
				img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
			)
		end
	else
		for i in CartesianIndices(@view(img[:, 1]))
		    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
		end

		copyto!(img, output)
		
		for j in CartesianIndices(@view(img[1, :]))
		    @views transform!(
				img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
			)
		end
	end
end

Now the benchmark results gave me:

n = 100
img = rand([0f0, 1f0], n, n);

# CPU
img_cpu = boolean_indicator(img)
output_cpu = similar(img, eltype(img_cpu))
v_cpu = ones(Int32, size(img_cpu))
z_cpu = ones(eltype(img), size(img_cpu) .+ 1)
tfm = @benchmark transform_wenbo!($img_cpu, $output_cpu, $v_cpu, $z_cpu)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):   59.600 ΞΌs …   4.475 ms  β”Š GC (min … max): 0.00% … 89.63%
 Time  (median):      81.100 ΞΌs               β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   116.181 ΞΌs Β± 163.679 ΞΌs  β”Š GC (mean Β± Οƒ):  2.27% Β±  2.65%

  β–‚β–†β–ˆβ–‡β–†β–‚β–   ▁▁▁    ▁  ▁▂▃▂▁                                     β–‚
  β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–‡β–ˆβ–ˆβ–ˆβ–‡β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–ˆβ–‡β–‡β–†β–‡β–†β–†β–†β–‡β–‡β–ˆβ–ˆβ–‡β–†β–…β–…β–†β–…β–‡β–…β–…β–„β–ƒβ–ƒβ–‚β–ƒβ–…β–…β–‚β–…β–…β–„β–…β–„β–„ β–ˆ
  59.6 ΞΌs       Histogram: log(frequency) by time        475 ΞΌs <

 Memory estimate: 42.03 KiB, allocs estimate: 351.

# GPU
begin
	img_gpu = CuArray(boolean_indicator(img))
	output_gpu = similar(img_gpu)
	v_gpu = KernelAbstractions.ones(backend, Int32, size(img_gpu))
	z_gpu = KernelAbstractions.ones(backend, Float32, size(img_gpu) .+ 1)
	
	tfm = @benchmark transform_gpu!($img_gpu, $output_gpu, $v_gpu, $z_gpu)
end

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):    8.800 ΞΌs … 15.898 ms  β”Š GC (min … max): 0.00% … 0.00%
 Time  (median):      12.800 ΞΌs              β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   212.305 ΞΌs Β±  1.645 ms  β”Š GC (mean Β± Οƒ):  0.00% Β± 0.00%

  β–ˆ                                                             
  β–ˆβ–β–β–β–β–‚β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–‚β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–‚β–‚β–‚β–‚β–‚ β–‚
  8.8 ΞΌs          Histogram: frequency by time         13.7 ms <

 Memory estimate: 5.69 KiB, allocs estimate: 120.

Now, I switched to my MacBook with an M2 chip and ran the same code:

n = 1000
img = rand([0f0, 1f0], n, n);

# CPU
img_cpu = boolean_indicator(img)
output_cpu = similar(img, eltype(img_cpu))
v_cpu = ones(Int32, size(img_cpu))
z_cpu = ones(eltype(img), size(img_cpu) .+ 1)
tfm = @benchmark transform_wenbo!($img_cpu, $output_cpu, $v_cpu, $z_cpu)

BenchmarkTools.Trial: 989 samples with 1 evaluation.
 Range (min … max):  4.801 ms …  13.822 ms  β”Š GC (min … max): 0.00% … 0.00%
 Time  (median):     4.873 ms               β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   5.051 ms Β± 577.966 ΞΌs  β”Š GC (mean Β± Οƒ):  0.00% Β± 0.00%

  β–ˆβ–ˆβ–…β–„β–„β–‚β–‚                                                      
  β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–…β–ˆβ–…β–‡β–†β–…β–„β–‡β–‡β–†β–…β–…β–…β–…β–„β–„β–†β–…β–‡β–…β–†β–…β–…β–β–„β–…β–…β–β–…β–„β–†β–β–…β–…β–β–β–„β–„β–β–β–…β–β–β–β–β–β–β–β–„ β–ˆ
  4.8 ms       Histogram: log(frequency) by time       7.5 ms <

 Memory estimate: 4.44 KiB, allocs estimate: 51.

# GPU
begin
	img_gpu = MtlArray(boolean_indicator(img))
	output_gpu = similar(img_gpu)
	v_gpu = KernelAbstractions.ones(backend, Int32, size(img_gpu))
	z_gpu = KernelAbstractions.ones(backend, Float32, size(img_gpu) .+ 1)
	
	tfm = @benchmark transform_gpu!($img_gpu, $output_gpu, $v_gpu, $z_gpu)
end

BenchmarkTools.Trial: 259 samples with 1 evaluation.
 Range (min … max):  16.567 ms … 27.834 ms  β”Š GC (min … max): 0.00% … 0.00%
 Time  (median):     18.644 ms              β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   19.323 ms Β±  1.168 ms  β”Š GC (mean Β± Οƒ):  0.00% Β± 0.00%

                      β–ƒβ–ˆβ–‡                                      
  β–‚β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–‚β–„β–ˆβ–ˆβ–ˆβ–†β–„β–ƒβ–ƒβ–ƒβ–‚β–ƒβ–‚β–‚β–ƒβ–ƒβ–‚β–β–„β–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–‚β–ƒβ–„β–…β–…β–ƒβ–„β–ƒβ–ƒβ–ƒβ–ƒβ–β–β–β–β–β–ƒ β–ƒ
  16.6 ms         Histogram: frequency by time        21.9 ms <

 Memory estimate: 22.28 KiB, allocs estimate: 1082.

I have no idea why Metal shows such a difference in benchmark timing compared to CUDA.

Metal.jl hasn’t been optimized yet, so it’s likely that there’s some very low-hanging fruit with respect to performance.

Okay, that makes sense. It looks like the CUDA version is still about 2x slower by mean time, but the histogram looks very uneven, which seems odd. I’ll try to get a CUDA machine soon.

Do you have any idea why there is such a large discrepancy between the median and mean times on the CUDA machine?

That’s invalid; by removing the synchronization, you’re benchmarking the time it takes to submit the kernels, not the time to execute them. That said, you generally shouldn’t put synchronizations inside your algorithms; synchronize only around the code you benchmark.
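
For example, something along these lines (untested; reusing the img_gpu, output_gpu, v_gpu, z_gpu, and backend objects from your post above), so the wait happens in the benchmark expression rather than in the algorithm:

tfm = @benchmark begin
    transform_gpu!($img_gpu, $output_gpu, $v_gpu, $z_gpu)
    # wait for the GPU so the full execution time is measured
    KernelAbstractions.synchronize($backend)
end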

Try using CUDA.@profile, or the new CUDA.@bprofile (currently only on CUDA.jl#master).
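
For example, something like (again with the arrays from your post):

CUDA.@profile trace=true transform_gpu!(img_gpu, output_gpu, v_gpu, z_gpu)
# or, on CUDA.jl#master, run the expression repeatedly and summarize device activity:
CUDA.@bprofile transform_gpu!(img_gpu, output_gpu, v_gpu, z_gpu)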

In any case, I couldn’t get your MWE to work: no definition of boolean_indicator, n is not used in the GPU benchmark, etc. Please make it easier for people to help you by providing a simple and functional MWE. I’m currently using the following:

using CUDA, KernelAbstractions, BenchmarkTools, Random
using CUDA.CUDAKernels

function transform!(f::AbstractVector, output, v,  z)
    z[1] = -Inf32
    z[2] = Inf32

    k = 1
    @inbounds for q in 2:length(f)
        s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
        while s ≀ z[k]
            k -= 1
            s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
        end
        k += 1
        v[k] = q
        z[k] = s
        z[k + 1] = Inf32
    end

    k = 1
    @inbounds for q in 1:length(f)
        while z[k + 1] < q
            k += 1
        end
        output[q] = (q - v[k])^2 + f[v[k]]
    end
end


@kernel function transform_kernel_cols!(img, output, v, z)
	i = @index(Global)
	@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

@kernel function transform_kernel_rows!(img, output, v, z)
	j = @index(Global)
	@views transform!(img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end

function transform!(img::AbstractMatrix, output, v, z)
	backend = get_backend(img)
	kernel_cols = transform_kernel_cols!(backend)
	kernel_cols(img, output, v, z, ndrange = size(img, 1))

	B = similar(output)
	copyto!(B, output)

	kernel_rows = transform_kernel_rows!(backend)
	kernel_rows(B, output, v, z, ndrange = size(img, 2))
	KernelAbstractions.synchronize(backend)
end


function main(n = 1000)
    backend = CUDABackend()

    # cpu
	img = rand([0f0, 1f0], n, n)
    img_bool = similar(img, Bool)
	output = similar(img, eltype(img))
	v = ones(Int32, size(img))
	z = ones(eltype(img), size(img) .+ 1)
	tfm = @benchmark transform!($img_bool, $output, $v, $z)
    display(tfm)

    # gpu
	img = rand!(KernelAbstractions.allocate(backend, Float32, n, n)) .> 0.5f0
    img_bool = similar(img, Bool)
	output = similar(img, Float32)
	v = KernelAbstractions.ones(backend, Int32, size(img))
	z = KernelAbstractions.ones(backend, Float32, size(img) .+ 1)
	tfm = @benchmark transform!($img_bool, $output, $v, $z)
    display(tfm)
end

… which indeed doesn’t yield a speed-up using the GPU:

julia> main()
BenchmarkTools.Trial: 408 samples with 1 evaluation.
 Range (min … max):  12.164 ms …  13.377 ms  β”Š GC (min … max): 0.00% … 0.00%
 Time  (median):     12.194 ms               β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   12.263 ms Β± 184.607 ΞΌs  β”Š GC (mean Β± Οƒ):  0.14% Β± 0.67%

  β–…β–ˆβ–…β–‚β–β–                          ▁▁
  β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–†β–„β–ˆβ–†β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–„β–†β–ˆβ–ˆβ–†β–†β–β–…β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–„ β–†
  12.2 ms       Histogram: log(frequency) by time      13.1 ms <

 Memory estimate: 3.82 MiB, allocs estimate: 22.
BenchmarkTools.Trial: 528 samples with 1 evaluation.
 Range (min … max):  9.377 ms …  9.874 ms  β”Š GC (min … max): 0.00% … 0.00%
 Time  (median):     9.478 ms              β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   9.476 ms Β± 73.175 ΞΌs  β”Š GC (mean Β± Οƒ):  0.00% Β± 0.00%

      β–‚β–‡β–†β–„β–ˆβ–†β–  β–β–‚β–‡β–†β–„β–ˆβ–„                             ▁
  β–†β–†β–‡β–†β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–β–β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–„β–„β–β–β–„β–β–β–β–β–β–β–β–β–β–β–β–β–„β–†β–„β–†β–β–β–†β–„β–„β–„β–„β–β–ˆβ–β–„β–β–β–„β–β–„β–† β–ˆ
  9.38 ms      Histogram: log(frequency) by time     9.78 ms <

 Memory estimate: 5.48 KiB, allocs estimate: 128.

Running under the CUDA profiler reveals that very few threads are launched:

julia> CUDA.@profile trace=true transform!(img_bool, output, v, z)

Device-side activity: GPU was busy for 8.0 ms (98.74% of the trace)
β”Œβ”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ ID β”‚    Start β”‚    Time β”‚ Threads β”‚ Blocks β”‚ Regs β”‚      Size β”‚    Throughput β”‚ Name                                                                                                                               β‹―
β”œβ”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚  4 β”‚ 67.95 Β΅s β”‚ 3.29 ms β”‚     896 β”‚      2 β”‚   70 β”‚         - β”‚             - β”‚ _Z26gpu_transform_kernel_cols_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRang β‹―
β”‚ 17 β”‚  3.35 ms β”‚ 4.77 Β΅s β”‚       - β”‚      - β”‚    - β”‚ 3.815 MiB β”‚ 781.250 GiB/s β”‚ [copy device to device memory]                                                                                                     β‹―
β”‚ 21 β”‚  3.36 ms β”‚ 4.71 ms β”‚     640 β”‚      2 β”‚   48 β”‚         - β”‚             - β”‚ _Z26gpu_transform_kernel_rows_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRang β‹―
└────┴──────────┴─────────┴─────────┴────────┴──────┴───────────┴───────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

Only 2 blocks of 896 threads is way too few to use the GPU efficiently. You should simplify the work that’s done by each thread, and launch more of them.

Ahh, thank you for looking into this, and sorry the MWE didn’t work. I’ll try to modify it once I get back to my computer.

So just to clarify, you’re saying this is likely an issue with the underlying 1D transform! algorithm being too complex, and not something that can be fixed by setting the workgroup size or something like that?

Yeah, tweaking the workgroup size can’t go beyond the amount of parallelism that’s exposed, which is limited given how you launch with ndrange = size(img, 2). You’ll probably have to split up that function and have multiple threads perform the work that’s now done by a for loop on a single thread.
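
As a rough, untested sketch of what I mean (only the column pass is shown, and the kernel names are just illustrative): keep the sequential envelope construction at one thread per row, but give the final lookup pass one thread per output element:

# builds the lower envelope (v, z) for each row; still one thread per row
@kernel function envelope_cols_kernel!(img, v, z)
    i = @index(Global)
    f  = @view img[i, :]
    vi = @view v[i, :]
    zi = @view z[i, :]
    vi[1] = 1           # instead of relying on fill!(v, 1) beforehand
    zi[1] = -Inf32
    zi[2] = Inf32
    k = 1
    @inbounds for q in 2:length(f)
        s = ((f[q] + q^2) - (f[vi[k]] + vi[k]^2)) / (2*q - 2*vi[k])
        while s ≤ zi[k]
            k -= 1
            s = ((f[q] + q^2) - (f[vi[k]] + vi[k]^2)) / (2*q - 2*vi[k])
        end
        k += 1
        vi[k] = q
        zi[k] = s
        zi[k + 1] = Inf32
    end
end

# evaluates the envelope; one thread per output element
@kernel function lookup_cols_kernel!(img, output, v, z)
    i, q = @index(Global, NTuple)
    k = 1
    @inbounds while z[i, k + 1] < q
        k += 1
    end
    @inbounds output[i, q] = (q - v[i, k])^2 + img[i, v[i, k]]
end

# launched roughly as:
# envelope_cols_kernel!(backend)(img, v, z, ndrange = size(img, 1))
# lookup_cols_kernel!(backend)(img, output, v, z, ndrange = size(img))

The lookup pass does redundant work (each thread rescans z from k = 1), and the divergence of the while loops is still there, but it exposes size(img) threads instead of size(img, 1), which should keep the GPU a lot busier.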

Another possible performance issue is the data-dependent control flow from the inner while loop, which may introduce thread divergence. To identify that, you can use the NVIDIA profilers, specifically Nsight Compute.
