Dear all,
I have a problem about the performance of CUDA kernel function which loop on a large struct array. Loop on the struct array is neccesaary for me. The MWE is as follows:
using CUDA
using Adapt
using BenchmarkTools
const RealType = Float64
Base.@kwdef struct GPUElement
type::UInt32
order::UInt32
len0::RealType
end
Base.@kwdef struct Accelerator
line::Vector{GPUElement}
end
Base.@kwdef struct GPUAccelerator
line::CuDeviceVector{GPUElement,1}
end
Adapt.adapt_structure(to, s::Accelerator) = GPUAccelerator(
adapt(to, CuArray{GPUElement, 1}([adapt(to, e) for e in s.line]))
)
@inline invsqrt(v::T) where T = inv(sqrt(v))
@inline function _exact_drift!(elem::GPUElement, rin::AbstractArray{T,1}) where T
@inbounds begin
x = rin[1]
px = rin[2]
y = rin[3]
py = rin[4]
z = rin[5]
dp = rin[6]
end
p0=oneunit(dp)+dp
len0 = elem.len0
pnorm =invsqrt( p0^2-px^2-py^2 )
normL=len0*pnorm
@inbounds begin
rin[1] = x+ normL*px
rin[3] = y+ normL*py
rin[5] = z- normL*p0 + len0
end
nothing
end
@inline function track1!(acc::A, a::AbstractArray{T,1} ) where {A,T}
line = acc.line
for i in eachindex(line)
@inbounds _exact_drift!(line[i], a)
end
nothing
end
@inline function track2!(acc::A, a::AbstractArray{T,1} ) where {A,T}
line = acc.line
@inbounds elem::GPUElement = line[1]
for i in eachindex(line)
@inbounds _exact_drift!(elem, a)
end
nothing
end
function kernel1!(acc::GPUAccelerator, a::T ) where T
index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
stride = gridDim().x * blockDim().x
len::Int32 = Int32(size(a,2))
i = index
while i <= len
@inbounds track1!(acc, @view a[:,i])
i += stride
end
return nothing
end
function kernel2!(acc::GPUAccelerator, a::T ) where T
index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
stride = gridDim().x * blockDim().x
len::Int32 = Int32(size(a,2))
i = index
while i <= len
@inbounds track2!(acc, @view a[:,i])
i += stride
end
return nothing
end
function bench_gpu1!(elem,a)
kernel = @cuda launch=false kernel1!(elem,a)
config = launch_configuration(kernel.fun)
threads = 1024
# min(size(a,2), config.threads)
blocks = cld(size(a,2), threads)
CUDA.@sync kernel(elem, a; threads, blocks)
end
function bench_gpu2!(elem,a)
kernel = @cuda launch=false kernel2!(elem,a)
config = launch_configuration(kernel.fun)
threads = 1024
# min(size(a,2), config.threads)
blocks = cld(size(a,2), threads)
CUDA.@sync kernel(elem, a; threads, blocks)
end
b = zeros(Float64, (7,10240000))
b[2,:] .= 0.1;
a = CUDA.CuArray( b)
elem = GPUElement(1,1,10 )
line = Accelerator( fill(elem, 1000) )
CUDA.@profile trace=true bench_gpu1!(line, a)
CUDA.@profile trace=true bench_gpu2!(line, a)
tarck1! and kernel1! is what I need. track2 and kernel2! is just a test. The kernel1! (~800 ms) is much slower than kernel2! (~10 ms). The julia, CUDA and GPU version are as follows:
julia> versioninfo()
Julia Version 1.11.1
Commit 8f5b7ca12ad (2024-10-16 10:53 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 96 × Intel(R) Xeon(R) Platinum 8255C CPU @ 2.50GHz
WORD_SIZE: 64
LLVM: libLLVM-16.0.6 (ORCJIT, cascadelake)
Threads: 1 default, 0 interactive, 1 GC (on 96 virtual cores)
Environment:
LD_LIBRARY_PATH = /usr/local/nvidia/lib:/usr/local/nvidia/lib64
julia> CUDA.versioninfo()
CUDA runtime 12.6, artifact installation
CUDA driver 12.6
NVIDIA driver 550.90.7
CUDA libraries:
- CUBLAS: 12.6.3
- CURAND: 10.3.7
- CUFFT: 11.3.0
- CUSOLVER: 11.7.1
- CUSPARSE: 12.5.4
- CUPTI: 2024.3.2 (API 24.0.0)
- NVML: 12.0.0+550.90.7
Julia packages:
- CUDA: 5.5.2
- CUDA_Driver_jll: 0.10.3+0
- CUDA_Runtime_jll: 0.15.4+0
Toolchain:
- Julia: 1.11.1
- LLVM: 16.0.6
1 device:
0: Tesla V100-SXM2-32GB (sm_70, 30.819 GiB / 32.000 GiB available)
Does It seem that the performance is limited by the speed of access memory? What can I do to improve the performance.
Thanks a lot!
Tao