Hi all,
I have a vector x of dim m*n and want to parallel an operation over each slice of length m. For example, x_{1:m} corresponds to a value -log(x[1]^2 - sum_{i=2}^m x[i]^2) as shown in the function compute_barrier!() (GPU) and serial_compute_barrier!() (CPU multithreads) below:
using LinearAlgebra, SparseArrays
using CUDA
using BenchmarkTools
using Revise
abstract type AbstractStruct{T <: AbstractArray, Td <: AbstractFloat} end
#struct for atomic operations
struct MyStruct{T,Td} <: AbstractStruct{T, Td}
data::AbstractArray{Td,1}
views::AbstractArray{Int,2}
function MyStruct{T,Td}(m::Int, n::Int) where {T,Td}
data = T(zeros(Td, m * n))
views = T(Array{Int,2}(undef, n,2))
for i in 1:n
start_index = (i - 1) * m + 1
end_index = i * m
views[i,1] = start_index
views[i,2] = end_index
end
return new(data, views)
end
end
# value initialization
function set_value!(
conicvec_data::AbstractArray{T},
conicvec_views::AbstractArray{Int}
) where {T}
i = (blockIdx().x-1)*blockDim().x+threadIdx().x
if i <= size(conicvec_views, 1)
start_index = conicvec_views[i, 1]
end_index = conicvec_views[i, 2]
@inbounds for j = start_index+1:end_index
conicvec_data[j] = one(T)
end
conicvec_data[start_index] = T(end_index - start_index + 1)
end
return nothing
end
function serial_set_value!(
conicvec_data::AbstractArray{T},
conicvec_views::AbstractArray{Int}
) where {T}
@inbounds Threads.@threads for i = 1:size(conicvec_views, 1)
start_index = conicvec_views[i, 1]
end_index = conicvec_views[i, 2]
@inbounds for j = start_index+1:end_index
conicvec_data[j] = one(T)
end
conicvec_data[start_index] = T(end_index - start_index + 1)
end
return nothing
end
#log barrier function for SOCs
function compute_barrier!(
value_vec::AbstractArray{T},
conicvec_data::AbstractArray{T},
conicvec_views::AbstractArray{Int}
) where {T}
i = (blockIdx().x-1)*blockDim().x+threadIdx().x
if i <= size(conicvec_views, 1)
start_index = conicvec_views[i, 1]
end_index = conicvec_views[i, 2]
@assert (end_index - start_index) > 0 #length>=2
tmp = conicvec_data[start_index]^2
@inbounds for j = start_index+1:end_index
tmp -= conicvec_data[j]^2
end
value_vec[i] = -log(tmp)
# value_vec[i] = log(conicvec_data[start_index]^2 - norm(conicvec_data[start_index+1:end_index]))
end
return nothing
end
function serial_compute_barrier!(
value_vec::AbstractArray{T},
conicvec_data::AbstractArray{T},
conicvec_views::AbstractArray{Int}
) where {T}
@inbounds Threads.@threads for i = 1:size(conicvec_views, 1)
start_index = conicvec_views[i, 1]
end_index = conicvec_views[i, 2]
@assert (end_index - start_index) > 0 #length>=2
tmp = conicvec_data[start_index]^2
@inbounds for j = start_index+1:end_index
tmp -= conicvec_data[j]^2
end
value_vec[i] = -log(tmp)
end
return nothing
end
m = 4
n = 32*1024
T = CuArray # Set T to either CuArray (GPU version) or Array (CPU version)
Td = Float64
conicvec = MyStruct{T,Td}(m, n)
value_vec = T(zeros(Td,n))
if T == CuArray
@benchmark kernel = begin
CUDA.@sync @cuda threads=1024 blocks=cld(length(conicvec.views),1024) set_value!(conicvec.data,conicvec.views)
CUDA.@sync @cuda threads=1024 blocks=cld(length(conicvec.views),1024) compute_barrier!(value_vec,conicvec.data,conicvec.views)
end
else
@benchmark begin
serial_set_value!(conicvec.data,conicvec.views)
serial_compute_barrier!(value_vec,conicvec.data,conicvec.views)
end
end
When I was running the code above, it returns a warning:
‘’’
┌ Warning: Performing scalar indexing on task Task (runnable) @0x000000000a150010.
│ Invocation of setindex! resulted in scalar indexing of a GPU array.
│ This is typically caused by calling an iterating implementation of a method.
│ Such implementations do not execute on the GPU, but very slowly on the CPU,
│ and therefore are only permitted from the REPL for prototyping purposes.
│ If you did intend to index this array, annotate the caller with @allowscalar.
└ @ GPUArraysCore C:\Users\ddt00.julia\packages\GPUArraysCore\HaQcr\src\GPUArraysCore.jl:106
‘’’
I’m wondering whether this is an issue if I write my code like kernel programming above?
The mean of GPU one is about 60 us while the multithread one (CPU with 12 threads) is about 140 us. Though the GPU version seems 2x faster than the CPU-multithread, how can I verify the computation is implemented on a GPU rather than on a CPU?
In addition, is there any efficient way to add @allowscalar if I use the scalar indexing multiple times across my code? The tutorial seems to be very inefficient for my case.