GPU kernel that is ~20x slower than corresponding CPU version

I have a 2D distance transform algorithm on the GPU that is around ~20x slower than its corresponding CPU version. I don’t know whether this is an issue with how the kernel was written or whether it is to be expected with Metal.jl. Any hints would be helpful, especially about the kernel itself, since I don’t know much about profiling.

1D non-allocating base function

function transform!(f::AbstractVector, output, v,  z)
    z[1] = -Inf32
    z[2] = Inf32

    k = 1
    @inbounds for q in 2:length(f)
        s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
        while s ≀ z[k]
            k -= 1
            s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
        end
        k += 1
        v[k] = q
        z[k] = s
        z[k + 1] = Inf32
    end

    k = 1
    @inbounds for q in 1:length(f)
        while z[k + 1] < q
            k += 1
        end
        output[q] = (q - v[k])^2 + f[v[k]]
    end
end

2D GPU Kernel

@kernel function transform_kernel_cols!(img, output, v, z)
	i = @index(Global)
	@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

@kernel function transform_kernel_rows!(img, output, v, z)
	j = @index(Global)
	@views transform!(img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end

function transform!(img::AbstractGPUMatrix, output, v, z)
	backend = get_backend(img)
	kernel_cols = transform_kernel_cols!(backend)
	kernel_cols(img, output, v, z, ndrange = size(img, 1))
	
	B = similar(output)
	copyto!(B, output)
	
	kernel_rows = transform_kernel_rows!(backend)
	kernel_rows(B, output, v, z, ndrange = size(img, 2))
	KernelAbstractions.synchronize(backend)
end

2D CPU version

function transform!(img::AbstractMatrix, output, v, z; threaded = true)
	if threaded
		Threads.@threads for i in CartesianIndices(@view(img[:, 1]))
		    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
		end
		
		Threads.@threads for j in CartesianIndices(@view(img[1, :]))
		    @views transform!(
				output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
			)
		end
	else
		for i in CartesianIndices(@view(img[:, 1]))
		    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
		end
		
		for j in CartesianIndices(@view(img[1, :]))
		    @views transform!(
				output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
			)
		end
	end
end

Timing results

# CPU
let
	n = 100
	img = rand([0f0, 1f0], n, n)
	img_bool = boolean_indicator(img)
	output = similar(img, eltype(img))
	v = ones(Int32, size(img))
	z = ones(eltype(img), size(img) .+ 1)
	tfm = @benchmark transform!($img_bool, $output, $v, $z)
end

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  43.417 ΞΌs …  3.865 ms  β”Š GC (min … max): 0.00% … 94.38%
 Time  (median):     64.750 ΞΌs              β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   70.336 ΞΌs Β± 45.136 ΞΌs  β”Š GC (mean Β± Οƒ):  0.52% Β±  0.94%

      β–„β–ˆβ–…β–„β–‚β–‚β–ƒ                                                  
  β–β–‚β–‚β–„β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–†β–„β–ƒβ–ƒβ–‚β–‚β–‚β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β– β–‚
  43.4 ΞΌs         Histogram: frequency by time         199 ΞΌs <

 Memory estimate: 4.59 KiB, allocs estimate: 47.

# GPU
let
	n = 100
	img = rand!(KernelAbstractions.allocate(backend, Float32, 10, 10)) .> 0.5f0
	img_bool = boolean_indicator(img)
	output = similar(img, Float32)
	v = KernelAbstractions.ones(backend, Int32, size(img))
	z = KernelAbstractions.ones(backend, Float32, size(img) .+ 1)
	
	tfm = @benchmark transform!($img_bool, $output, $v, $z)
end

BenchmarkTools.Trial: 4069 samples with 1 evaluation.
 Range (min … max):  644.625 ΞΌs …  26.387 ms  β”Š GC (min … max): 0.00% … 62.32%
 Time  (median):       1.145 ms               β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):     1.224 ms Β± 595.827 ΞΌs  β”Š GC (mean Β± Οƒ):  0.33% Β±  0.98%

       β–β–‡β–‡β–ˆβ–‡β–„β–…β–„β–‚β–‚β–‚β–„β–„β–„β–ƒβ–‚β–‚β–                                        
  β–‚β–β–‚β–…β–†β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–…β–…β–…β–„β–„β–…β–…β–…β–†β–…β–„β–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–‚β–ƒβ–‚β–‚β–ƒβ–‚β–‚β–β–‚β–‚β–‚β–‚β–‚β–β–‚β–‚ β–„
  645 ΞΌs           Histogram: frequency by time         2.56 ms <

 Memory estimate: 18.14 KiB, allocs estimate: 763.

MWE

using GPUArraysCore: AbstractGPUVector, AbstractGPUMatrix, AbstractGPUArray
using KernelAbstractions
using Metal, CUDA, AMDGPU, oneAPI
using BenchmarkTools

if CUDA.functional()
	@info "Using CUDA"
	CUDA.versioninfo()
	backend = CUDABackend()
elseif AMDGPU.functional()
	@info "Using AMD"
	AMDGPU.versioninfo()
	backend = ROCBackend()
elseif oneAPI.functional()
	@info "Using oneAPI"
	oneAPI.versioninfo()
	backend = oneBackend()
elseif Metal.functional()
	@info "Using Metal"
	Metal.versioninfo()
	backend = MetalBackend()
else
    @info "No GPU is available. Using CPU."
	backend = CPU()
end

boolean_indicator(f) = @. ifelse(f == 0, 1f10, 0f0)

@kernel function boolean_indicator_kernel(f, output)
    i = @index(Global)
    output[i] = ifelse(f[i] == 0, 1f10, 0f0)
end

function boolean_indicator(f::AbstractGPUArray)
    backend = get_backend(f)
    output = similar(f, Float32)
    kernel = boolean_indicator_kernel(backend)
    kernel(f, output, ndrange=size(f))
    KernelAbstractions.synchronize(backend)
    return output
end

# 1D
function transform!(f::AbstractVector, output, v,  z)
    z[1] = -Inf32
    z[2] = Inf32

    k = 1
    @inbounds for q in 2:length(f)
        s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
        while s ≀ z[k]
            k -= 1
            s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
        end
        k += 1
        v[k] = q
        z[k] = s
        z[k + 1] = Inf32
    end

    k = 1
    @inbounds for q in 1:length(f)
        while z[k + 1] < q
            k += 1
        end
        output[q] = (q - v[k])^2 + f[v[k]]
    end
end

# 2D CPU
function transform!(img::AbstractMatrix, output, v, z; threaded = true)
	if threaded
		Threads.@threads for i in CartesianIndices(@view(img[:, 1]))
		    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
		end
		
		Threads.@threads for j in CartesianIndices(@view(img[1, :]))
		    @views transform!(
				output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
			)
		end
	else
		for i in CartesianIndices(@view(img[:, 1]))
		    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
		end
		
		for j in CartesianIndices(@view(img[1, :]))
		    @views transform!(
				output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
			)
		end
	end
end

# 2D GPU
@kernel function transform_kernel_cols!(img, output, v, z)
	i = @index(Global)
	@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

@kernel function transform_kernel_rows!(img, output, v, z)
	j = @index(Global)
	@views transform!(img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end

function transform!(img::AbstractGPUMatrix, output, v, z)
    backend = get_backend(img)
    kernel_cols = transform_kernel_cols!(backend)
    kernel_cols(img, output, v, z, ndrange = size(img, 1))
	
    B = similar(output)
	copyto!(B, output)
    
    kernel_rows = transform_kernel_rows!(backend)
    kernel_rows(B, output, v, z, ndrange = size(img, 2))
    KernelAbstractions.synchronize(backend)
end

# Timing
let
        # CPU
	n = 100
	img = rand([0f0, 1f0], n, n)
	img_bool = boolean_indicator(img)
	output = similar(img, eltype(img))
	v = ones(Int32, size(img))
	z = ones(eltype(img), size(img) .+ 1)
	tfm = @benchmark transform!($img_bool, $output, $v, $z)
end

let
        # GPU
	n = 100
	img = rand!(KernelAbstractions.allocate(backend, Float32, 10, 10)) .> 0.5f0
	img_bool = boolean_indicator(img)
	output = similar(img, Float32)
	v = KernelAbstractions.ones(backend, Int32, size(img))
	z = KernelAbstractions.ones(backend, Float32, size(img) .+ 1)
	
	tfm = @benchmark transform!($img_bool, $output, $v, $z)
end

I’m working on a CUDA machine (i9-13900K + RTX A5000), and I simply modified the GPU function to:

function transform_gpu!(img::AbstractGPUMatrix, output, v, z)
    s1, s2 = size(img)
    backend = get_backend(img)

    # instantiate the kernels defined above for this backend
    kernel_cols = transform_kernel_cols!(backend)
    kernel_rows = transform_kernel_rows!(backend)

    kernel_cols(img, output, v, z, ndrange = s1)

    copyto!(img, output)

    kernel_rows(img, output, v, z, ndrange = s2)
    # KernelAbstractions.synchronize(backend)
end

And I added a copyto! to the CPU version to maintain correctness:

function transform_wenbo!(img::AbstractMatrix, output, v, z; threaded = true)
	if threaded
		Threads.@threads for i in CartesianIndices(@view(img[:, 1]))
		    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
		end

		copyto!(img, output)

		Threads.@threads for j in CartesianIndices(@view(img[1, :]))
		    @views transform!(
				img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
			)
		end
	else
		for i in CartesianIndices(@view(img[:, 1]))
		    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
		end

		copyto!(img, output)
		
		for j in CartesianIndices(@view(img[1, :]))
		    @views transform!(
				img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
			)
		end
	end
end

Now the benchmark results gave me:

n = 100
img = rand([0f0, 1f0], n, n);

# CPU
img_cpu = boolean_indicator(img)
output_cpu = similar(img, eltype(img_cpu))
v_cpu = ones(Int32, size(img_cpu))
z_cpu = ones(eltype(img), size(img_cpu) .+ 1)
tfm = @benchmark transform_wenbo!($img_cpu, $output_cpu, $v_cpu, $z_cpu)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):   59.600 ΞΌs …   4.475 ms  β”Š GC (min … max): 0.00% … 89.63%
 Time  (median):      81.100 ΞΌs               β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   116.181 ΞΌs Β± 163.679 ΞΌs  β”Š GC (mean Β± Οƒ):  2.27% Β±  2.65%

  β–‚β–†β–ˆβ–‡β–†β–‚β–   ▁▁▁    ▁  ▁▂▃▂▁                                     β–‚
  β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–‡β–ˆβ–ˆβ–ˆβ–‡β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–ˆβ–‡β–‡β–†β–‡β–†β–†β–†β–‡β–‡β–ˆβ–ˆβ–‡β–†β–…β–…β–†β–…β–‡β–…β–…β–„β–ƒβ–ƒβ–‚β–ƒβ–…β–…β–‚β–…β–…β–„β–…β–„β–„ β–ˆ
  59.6 ΞΌs       Histogram: log(frequency) by time        475 ΞΌs <

 Memory estimate: 42.03 KiB, allocs estimate: 351.

# GPU
begin
	img_gpu = CuArray(boolean_indicator(img))
	output_gpu = similar(img_gpu)
	v_gpu = KernelAbstractions.ones(backend, Int32, size(img_gpu))
	z_gpu = KernelAbstractions.ones(backend, Float32, size(img_gpu) .+ 1)
	
	tfm = @benchmark transform_gpu!($img_gpu, $output_gpu, $v_gpu, $z_gpu)
end

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):    8.800 ΞΌs … 15.898 ms  β”Š GC (min … max): 0.00% … 0.00%
 Time  (median):      12.800 ΞΌs              β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   212.305 ΞΌs Β±  1.645 ms  β”Š GC (mean Β± Οƒ):  0.00% Β± 0.00%

  β–ˆ                                                             
  β–ˆβ–β–β–β–β–‚β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–‚β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–‚β–‚β–‚β–‚β–‚ β–‚
  8.8 ΞΌs          Histogram: frequency by time         13.7 ms <

 Memory estimate: 5.69 KiB, allocs estimate: 120.

Now, I switched to my MacBook with an M2 chip and ran the same code:

n = 1000
img = rand([0f0, 1f0], n, n);

# CPU
img_cpu = boolean_indicator(img)
output_cpu = similar(img, eltype(img_cpu))
v_cpu = ones(Int32, size(img_cpu))
z_cpu = ones(eltype(img), size(img_cpu) .+ 1)
tfm = @benchmark transform_wenbo!($img_cpu, $output_cpu, $v_cpu, $z_cpu)

BenchmarkTools.Trial: 989 samples with 1 evaluation.
 Range (min … max):  4.801 ms …  13.822 ms  β”Š GC (min … max): 0.00% … 0.00%
 Time  (median):     4.873 ms               β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   5.051 ms Β± 577.966 ΞΌs  β”Š GC (mean Β± Οƒ):  0.00% Β± 0.00%

  β–ˆβ–ˆβ–…β–„β–„β–‚β–‚                                                      
  β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–…β–ˆβ–…β–‡β–†β–…β–„β–‡β–‡β–†β–…β–…β–…β–…β–„β–„β–†β–…β–‡β–…β–†β–…β–…β–β–„β–…β–…β–β–…β–„β–†β–β–…β–…β–β–β–„β–„β–β–β–…β–β–β–β–β–β–β–β–„ β–ˆ
  4.8 ms       Histogram: log(frequency) by time       7.5 ms <

 Memory estimate: 4.44 KiB, allocs estimate: 51.

# GPU
begin
	img_gpu = MtlArray(boolean_indicator(img))
	output_gpu = similar(img_gpu)
	v_gpu = KernelAbstractions.ones(backend, Int32, size(img_gpu))
	z_gpu = KernelAbstractions.ones(backend, Float32, size(img_gpu) .+ 1)
	
	tfm = @benchmark transform_gpu!($img_gpu, $output_gpu, $v_gpu, $z_gpu)
end

BenchmarkTools.Trial: 259 samples with 1 evaluation.
 Range (min … max):  16.567 ms … 27.834 ms  β”Š GC (min … max): 0.00% … 0.00%
 Time  (median):     18.644 ms              β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   19.323 ms Β±  1.168 ms  β”Š GC (mean Β± Οƒ):  0.00% Β± 0.00%

                      β–ƒβ–ˆβ–‡                                      
  β–‚β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–‚β–„β–ˆβ–ˆβ–ˆβ–†β–„β–ƒβ–ƒβ–ƒβ–‚β–ƒβ–‚β–‚β–ƒβ–ƒβ–‚β–β–„β–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–‚β–ƒβ–„β–…β–…β–ƒβ–„β–ƒβ–ƒβ–ƒβ–ƒβ–β–β–β–β–β–ƒ β–ƒ
  16.6 ms         Histogram: frequency by time        21.9 ms <

 Memory estimate: 22.28 KiB, allocs estimate: 1082.

I have no idea why Metal shows such a difference in benchmark timing compared to CUDA.

Metal.jl hasn’t been optimized yet, so it’s likely that there’s some very low-hanging fruit with respect to performance.

Okay, that makes sense. It looks like the CUDA version is still about 2x slower by mean time, but the histogram looks very uneven, which seems odd. I’ll try to get a CUDA machine soon.

Do you have any idea why there is such a large discrepancy between the median and mean times on the CUDA machine?

That’s invalid; by removing the synchronization, you’re benchmarking the time it takes to submit the kernels, not the time to execute them. That said, you generally shouldn’t put synchronizations inside your algorithms; synchronize only around the code you benchmark.
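
For example, something along these lines (untested; reusing the img_gpu, output_gpu, v_gpu, z_gpu, and backend objects from your post above), so the wait happens in the benchmark expression rather than in the algorithm:

tfm = @benchmark begin
    transform_gpu!($img_gpu, $output_gpu, $v_gpu, $z_gpu)
    # wait for the GPU so the full execution time is measured
    KernelAbstractions.synchronize($backend)
end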

Try using CUDA.@profile, or the new CUDA.@bprofile (currently only on CUDA.jl#master).
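
For example, something like (again with the arrays from your post):

CUDA.@profile trace=true transform_gpu!(img_gpu, output_gpu, v_gpu, z_gpu)
# or, on CUDA.jl#master, run the expression repeatedly and summarize device activity:
CUDA.@bprofile transform_gpu!(img_gpu, output_gpu, v_gpu, z_gpu)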

In any case, I couldn’t get your MWE to work: no definition of boolean_indicator, n is not used in the GPU benchmark, etc. Please make it easier for people to help you by providing a simple and functional MWE. I’m currently using the following:

using CUDA, KernelAbstractions, BenchmarkTools, Random
using CUDA.CUDAKernels

function transform!(f::AbstractVector, output, v,  z)
    z[1] = -Inf32
    z[2] = Inf32

    k = 1
    @inbounds for q in 2:length(f)
        s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
        while s ≀ z[k]
            k -= 1
            s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2*q - 2*v[k])
        end
        k += 1
        v[k] = q
        z[k] = s
        z[k + 1] = Inf32
    end

    k = 1
    @inbounds for q in 1:length(f)
        while z[k + 1] < q
            k += 1
        end
        output[q] = (q - v[k])^2 + f[v[k]]
    end
end


@kernel function transform_kernel_cols!(img, output, v, z)
	i = @index(Global)
	@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

@kernel function transform_kernel_rows!(img, output, v, z)
	j = @index(Global)
	@views transform!(img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end

function transform!(img::AbstractMatrix, output, v, z)
	backend = get_backend(img)
	kernel_cols = transform_kernel_cols!(backend)
	kernel_cols(img, output, v, z, ndrange = size(img, 1))

	B = similar(output)
	copyto!(B, output)

	kernel_rows = transform_kernel_rows!(backend)
	kernel_rows(B, output, v, z, ndrange = size(img, 2))
	KernelAbstractions.synchronize(backend)
end


function main(n = 1000)
    backend = CUDABackend()

    # cpu
	img = rand([0f0, 1f0], n, n)
    img_bool = similar(img, Bool)
	output = similar(img, eltype(img))
	v = ones(Int32, size(img))
	z = ones(eltype(img), size(img) .+ 1)
	tfm = @benchmark transform!($img_bool, $output, $v, $z)
    display(tfm)

    # gpu
	img = rand!(KernelAbstractions.allocate(backend, Float32, n, n)) .> 0.5f0
    img_bool = similar(img, Bool)
	output = similar(img, Float32)
	v = KernelAbstractions.ones(backend, Int32, size(img))
	z = KernelAbstractions.ones(backend, Float32, size(img) .+ 1)
	tfm = @benchmark transform!($img_bool, $output, $v, $z)
    display(tfm)
end

… which indeed doesn’t yield a speed-up using the GPU:

julia> main()
BenchmarkTools.Trial: 408 samples with 1 evaluation.
 Range (min … max):  12.164 ms …  13.377 ms  β”Š GC (min … max): 0.00% … 0.00%
 Time  (median):     12.194 ms               β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   12.263 ms Β± 184.607 ΞΌs  β”Š GC (mean Β± Οƒ):  0.14% Β± 0.67%

  β–…β–ˆβ–…β–‚β–β–                          ▁▁
  β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–†β–„β–ˆβ–†β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–„β–†β–ˆβ–ˆβ–†β–†β–β–…β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–„ β–†
  12.2 ms       Histogram: log(frequency) by time      13.1 ms <

 Memory estimate: 3.82 MiB, allocs estimate: 22.
BenchmarkTools.Trial: 528 samples with 1 evaluation.
 Range (min … max):  9.377 ms …  9.874 ms  β”Š GC (min … max): 0.00% … 0.00%
 Time  (median):     9.478 ms              β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   9.476 ms Β± 73.175 ΞΌs  β”Š GC (mean Β± Οƒ):  0.00% Β± 0.00%

      β–‚β–‡β–†β–„β–ˆβ–†β–  β–β–‚β–‡β–†β–„β–ˆβ–„                             ▁
  β–†β–†β–‡β–†β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–β–β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–„β–„β–β–β–„β–β–β–β–β–β–β–β–β–β–β–β–β–„β–†β–„β–†β–β–β–†β–„β–„β–„β–„β–β–ˆβ–β–„β–β–β–„β–β–„β–† β–ˆ
  9.38 ms      Histogram: log(frequency) by time     9.78 ms <

 Memory estimate: 5.48 KiB, allocs estimate: 128.

Running under the CUDA profiler reveals that very few threads are launched:

julia> CUDA.@profile trace=true transform!(img_bool, output, v, z)

Device-side activity: GPU was busy for 8.0 ms (98.74% of the trace)
β”Œβ”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ ID β”‚    Start β”‚    Time β”‚ Threads β”‚ Blocks β”‚ Regs β”‚      Size β”‚    Throughput β”‚ Name                                                                                                                               β‹―
β”œβ”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚  4 β”‚ 67.95 Β΅s β”‚ 3.29 ms β”‚     896 β”‚      2 β”‚   70 β”‚         - β”‚             - β”‚ _Z26gpu_transform_kernel_cols_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRang β‹―
β”‚ 17 β”‚  3.35 ms β”‚ 4.77 Β΅s β”‚       - β”‚      - β”‚    - β”‚ 3.815 MiB β”‚ 781.250 GiB/s β”‚ [copy device to device memory]                                                                                                     β‹―
β”‚ 21 β”‚  3.36 ms β”‚ 4.71 ms β”‚     640 β”‚      2 β”‚   48 β”‚         - β”‚             - β”‚ _Z26gpu_transform_kernel_rows_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRang β‹―
└────┴──────────┴─────────┴─────────┴────────┴──────┴───────────┴───────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

Only 2 blocks of 896 threads is way too few to use the GPU efficiently. You should simplify the work that’s done by each thread, and launch more of them.

Ahh, thank you for looking into this, and sorry the MWE didn’t work. I’ll try to modify it once I get back to my computer.

So just to clarify, you’re saying this is likely an issue with the underlying 1D transform! algorithm being too complex, and not something that can be fixed by setting the workgroup size or something like that?

Yeah, tweaking the workgroup size can’t go beyond the amount of parallelism that’s exposed, which is limited given how you launch with ndrange = size(img, 2). You’ll probably have to split up that function and have multiple threads perform the work that’s now done by a for loop on a single thread.
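
As a rough, untested sketch of what I mean (only the column pass is shown, and the kernel names are just illustrative): keep the sequential envelope construction at one thread per row, but give the final lookup pass one thread per output element:

# builds the lower envelope (v, z) for each row; still one thread per row
@kernel function envelope_cols_kernel!(img, v, z)
    i = @index(Global)
    f  = @view img[i, :]
    vi = @view v[i, :]
    zi = @view z[i, :]
    vi[1] = 1           # instead of relying on fill!(v, 1) beforehand
    zi[1] = -Inf32
    zi[2] = Inf32
    k = 1
    @inbounds for q in 2:length(f)
        s = ((f[q] + q^2) - (f[vi[k]] + vi[k]^2)) / (2*q - 2*vi[k])
        while s ≤ zi[k]
            k -= 1
            s = ((f[q] + q^2) - (f[vi[k]] + vi[k]^2)) / (2*q - 2*vi[k])
        end
        k += 1
        vi[k] = q
        zi[k] = s
        zi[k + 1] = Inf32
    end
end

# evaluates the envelope; one thread per output element
@kernel function lookup_cols_kernel!(img, output, v, z)
    i, q = @index(Global, NTuple)
    k = 1
    @inbounds while z[i, k + 1] < q
        k += 1
    end
    @inbounds output[i, q] = (q - v[i, k])^2 + img[i, v[i, k]]
end

# launched roughly as:
# envelope_cols_kernel!(backend)(img, v, z, ndrange = size(img, 1))
# lookup_cols_kernel!(backend)(img, output, v, z, ndrange = size(img))

The lookup pass does redundant work (each thread rescans z from k = 1), and the divergence of the while loops is still there, but it exposes size(img) threads instead of size(img, 1), which should keep the GPU a lot busier.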

Another possible performance issue is the data-dependent control flow from the inner while loop, which may introduce thread divergence. To identify that, you can use the NVIDIA profilers, specifically Nsight Compute.
