GPU: Scalar indexing in kernel programming

Hi all,

I have a vector x of dimension m*n and want to parallelize an operation over each slice of length m. For example, the slice x_{1:m} corresponds to the value -log(x[1]^2 - sum_{i=2}^m x[i]^2), as computed in compute_barrier!() (GPU) and serial_compute_barrier!() (CPU, multithreaded) in the code below.
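In plain Julia, the value for a single slice x of length m would just be (the name `barrier` is only for illustration):

```julia
# Per-slice value: -log(x[1]^2 - sum_{i=2}^m x[i]^2), for one slice x of length m
barrier(x) = -log(x[1]^2 - sum(abs2, @view x[2:end]))
```

The full GPU and CPU versions: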

```julia
using LinearAlgebra, SparseArrays
using CUDA
using BenchmarkTools
using Revise

abstract type AbstractStruct{T <: AbstractArray, Td <: AbstractFloat} end 
#struct for atomic operations
struct MyStruct{T,Td} <: AbstractStruct{T, Td}
    data::AbstractArray{Td,1}
    views::AbstractArray{Int,2}

    function MyStruct{T,Td}(m::Int, n::Int) where {T,Td}
        data = T(zeros(Td, m * n))
        views = T(Array{Int,2}(undef, n,2))
        
        for i in 1:n
            start_index = (i - 1) * m + 1
            end_index = i * m
            views[i,1] = start_index
            views[i,2] = end_index
        end
        
        return new(data, views)
    end
end


# value initialization
function set_value!(    
    conicvec_data::AbstractArray{T},
    conicvec_views::AbstractArray{Int}
) where {T}
    i = (blockIdx().x-1)*blockDim().x+threadIdx().x   # global thread index: one thread per slice
    
    if i <= size(conicvec_views, 1)
        start_index = conicvec_views[i, 1]
        end_index = conicvec_views[i, 2]

        @inbounds for j = start_index+1:end_index
            conicvec_data[j] = one(T)
        end

        conicvec_data[start_index] = T(end_index - start_index + 1)
    end
    
    return nothing
end

function serial_set_value!(    
    conicvec_data::AbstractArray{T},
    conicvec_views::AbstractArray{Int}
) where {T}

    @inbounds Threads.@threads for i = 1:size(conicvec_views, 1)
        start_index = conicvec_views[i, 1]
        end_index = conicvec_views[i, 2]

        @inbounds for j = start_index+1:end_index
            conicvec_data[j] = one(T)
        end

        conicvec_data[start_index] = T(end_index - start_index + 1)
    end
    
    return nothing
end


#log barrier function for SOCs
function compute_barrier!(
    value_vec::AbstractArray{T},
    conicvec_data::AbstractArray{T},
    conicvec_views::AbstractArray{Int}
) where {T}
    i = (blockIdx().x-1)*blockDim().x+threadIdx().x   # global thread index: one thread per slice

    if i <= size(conicvec_views, 1)
        start_index = conicvec_views[i, 1]
        end_index = conicvec_views[i, 2]
        @assert (end_index - start_index) > 0   #length>=2

        tmp = conicvec_data[start_index]^2
        @inbounds for j = start_index+1:end_index
            tmp -= conicvec_data[j]^2
        end
        value_vec[i] = -log(tmp)

        # value_vec[i] = log(conicvec_data[start_index]^2 - norm(conicvec_data[start_index+1:end_index]))
    end
    
    return nothing
end

function serial_compute_barrier!(
    value_vec::AbstractArray{T},
    conicvec_data::AbstractArray{T},
    conicvec_views::AbstractArray{Int}
) where {T}

    @inbounds Threads.@threads for i = 1:size(conicvec_views, 1)
        start_index = conicvec_views[i, 1]
        end_index = conicvec_views[i, 2]
        @assert (end_index - start_index) > 0   #length>=2

        tmp = conicvec_data[start_index]^2
        @inbounds for j = start_index+1:end_index
            tmp -= conicvec_data[j]^2
        end
        value_vec[i] = -log(tmp)

    end
    
    return nothing
end

m = 4
n = 32*1024
T = CuArray     # Set T to either CuArray (GPU version) or Array (CPU version)
Td = Float64
conicvec = MyStruct{T,Td}(m, n)
value_vec = T(zeros(Td,n))

if T == CuArray
    @benchmark begin
        CUDA.@sync @cuda threads=1024 blocks=cld(size(conicvec.views, 1), 1024) set_value!(conicvec.data, conicvec.views)
        CUDA.@sync @cuda threads=1024 blocks=cld(size(conicvec.views, 1), 1024) compute_barrier!(value_vec, conicvec.data, conicvec.views)
    end
else
    @benchmark begin
        serial_set_value!(conicvec.data,conicvec.views)
        serial_compute_barrier!(value_vec,conicvec.data,conicvec.views)        
    end
end
```

When I run the code above, it returns a warning:
```
┌ Warning: Performing scalar indexing on task Task (runnable) @0x000000000a150010.
│ Invocation of setindex! resulted in scalar indexing of a GPU array.
│ This is typically caused by calling an iterating implementation of a method.
│ Such implementations do not execute on the GPU, but very slowly on the CPU,
│ and therefore are only permitted from the REPL for prototyping purposes.
│ If you did intend to index this array, annotate the caller with @allowscalar.
└ @ GPUArraysCore C:\Users\ddt00\.julia\packages\GPUArraysCore\HaQcr\src\GPUArraysCore.jl:106
```
I'm wondering whether this warning is actually a problem when I write my code as GPU kernels like the above?

The mean time of the GPU version is about 60 µs, while the multithreaded one (CPU with 12 threads) is about 140 µs. Although the GPU version seems about 2x faster than the CPU multithreaded one, how can I verify the computation is actually executed on the GPU rather than on the CPU?

In addition, is there an efficient way to add @allowscalar when I use scalar indexing in many places across my code? The approach in the tutorial seems very inefficient for my case.

Scalar indexing inside kernels isn't what triggers this warning; it fires when you access individual elements of a GPU array from the host. Your kernels are fine; the problem is most likely your data initialization (the element-wise writes to views in the MyStruct constructor run on the host when T == CuArray). It's generally better to initialize on the CPU and copy the initialized data over to the GPU.
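For example, here is a minimal sketch of that fix, keeping your struct layout but filling `views` on the host first (the name `MyStructFixed` is just for illustration; it reuses the AbstractStruct defined above):

```julia
# Sketch: build `views` on the CPU, then convert it once to the target array
# type T, so no scalar setindex! ever happens on a CuArray.
struct MyStructFixed{T,Td} <: AbstractStruct{T,Td}
    data::AbstractArray{Td,1}
    views::AbstractArray{Int,2}

    function MyStructFixed{T,Td}(m::Int, n::Int) where {T,Td}
        data = T(zeros(Td, m * n))
        views_cpu = Array{Int,2}(undef, n, 2)
        @inbounds for i in 1:n
            views_cpu[i, 1] = (i - 1) * m + 1   # start index of slice i
            views_cpu[i, 2] = i * m             # end index of slice i
        end
        return new(data, T(views_cpu))          # single host-to-device copy
    end
end
```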

You’re using @cuda, so this is executing on the GPU.
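If you want to see it explicitly, one option (assuming a reasonably recent CUDA.jl) is to wrap the launches in CUDA.@profile, which reports the kernels that actually ran on the device:

```julia
# Assumes CUDA.jl with the integrated profiler (v5 or newer); prints a summary
# of host-side API calls and the kernels executed on the device, with timings.
CUDA.@profile begin
    CUDA.@sync @cuda threads=1024 blocks=cld(size(conicvec.views, 1), 1024) set_value!(conicvec.data, conicvec.views)
    CUDA.@sync @cuda threads=1024 blocks=cld(size(conicvec.views, 1), 1024) compute_barrier!(value_vec, conicvec.data, conicvec.views)
end
```

Seeing set_value! and compute_barrier! listed among the device-side kernels confirms they ran on the GPU.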

Oh, I didn’t realize that. Thank you so much!