One example from `GPU programming in Julia | Workshop | JuliaCon 2021`

DennisFang · April 5, 2022, 4:21pm

When I run the following example from the Workshop, the result is false in my computer.

function my_sum(a::AbstractArray{T}) where {T}
    b = CUDA.zeros(T, 1)

    kernel = @cuda launch=false reduce_grid_atomic_shmem(+, a, b)

    config = launch_configuration(kernel.fun)
    threads = min(config.threads, length(a))
    blocks = cld(length(a), threads*2)

    @cuda threads=threads blocks=blocks reduce_grid_atomic_shmem(+, a, b)

    CUDA.@allowscalar b[]
end

a = CUDA.rand(1024, 1024)
my_sum(a) ≈ sum(a)

Then, I try to use Random.seed!(123) to check whether the answer is related to creation in random numbers.

using Random, CUDA

function reduce_grid_atomic_shmem(op, a::AbstractArray{T}, b) where {T}
    elements = blockDim().x * 2
    thread = threadIdx().x
    block = blockIdx().x
    offset = (block - 1) * elements

    # shared mem to buffer memory loads
    shared = @cuStaticSharedMem(T, (2048,))
    @inbounds shared[thread] = a[offset+thread]
    @inbounds shared[thread+blockDim().x] = a[offset+thread+blockDim().x]

    # parallel reduction of values in a block
    d = 1
    while d < elements
        sync_threads()
        index = 2 * d * (thread - 1) + 1
        @inbounds if index <= elements && offset + index + d <= length(a)
            shared[index] = op(shared[index], shared[index+d])
        end
        d *= 2
    end

    # atomic reduction
    if thread == 1
        CUDA.@atomic b[] = op(b[], shared[1])
    end

    return
end

function my_sum(a::AbstractArray{T}) where {T}
    b = CUDA.zeros(T, 1)

    kernel = @cuda launch=false reduce_grid_atomic_shmem(+, a, b)

    config = launch_configuration(kernel.fun)
    threads = min(config.threads, length(a))
    blocks = cld(length(a), threads*2)

    @cuda threads=threads blocks=blocks reduce_grid_atomic_shmem(+, a, b)

    CUDA.@allowscalar b[]
end

Random.seed!(123); a = CUDA.rand(1024, 1024)
@show my_sum(a) 
@show sum(a)
my_sum(a) ≈ sum(a)

Then, the result is as follows.

my_sum(a) = 586088.9f0
sum(a) = 523880.62f0
my_sum(a) ≈ sum(a) = false
false

How can I fix this problem?

Thanks.

Topic		Replies	Views
Create a simple CUDA.sum kernel GPU	3	1963	January 3, 2021
Different calculation results when using CPU vs. GPU with CUDA.jl General Usage question , gpu , cuda , atomic	2	1189	March 2, 2022
Accessing array elements too slow? GPU	10	593	April 23, 2021
Examples with atomic operations using CUDA.jl GPU	6	2895	September 15, 2020
Simulation running well on K620 but instable on A100? GPU gpu , cuda	15	1172	March 25, 2022

One example from `GPU programming in Julia | Workshop | JuliaCon 2021`

Related topics