I compared an array-of-structs layout with a struct-of-arrays layout, and the struct-of-arrays version seems to be faster.
"""
    MyStr{T}

Immutable record of six fields that all share the element type `T`.

Used as the element type of the array-of-structs buffer
(`CuArray{MyStr{Float64}}`) in the layout comparison.
"""
struct MyStr{T}
    # NOTE(review): presumably x/y components of a position difference — confirm.
    Δxˣ::T
    Δxʸ::T
    # NOTE(review): presumably x/y components of a velocity difference — confirm.
    Δvˣ::T
    Δvʸ::T
    # NOTE(review): presumably per-particle values for the two members of a pair — confirm.
    ρᵢ::T
    ρⱼ::T
end
"""
    pairs_calk!(buff, pairs; minthreads::Int = 1024)

Compile `kernel_pairs_calk!` for `buff` and `pairs`, launch it with at least one
GPU thread per element of `pairs`, and wait for completion via `CUDA.@sync`.

# Keywords
- `minthreads`: despite its name, this acts as an upper bound on the threads per
  block. The block size used is the smallest of `minthreads`, the thread count
  reported by `launch_configuration`, and `length(pairs)`.

When `pairs` is empty there is nothing to launch and the function returns
`nothing` immediately.
"""
function pairs_calk!(buff, pairs; minthreads::Int = 1024)
    Nx = length(pairs)
    # With no elements the block size would be 0 and `cld(Nx, Tx)` would throw
    # a `DivideError`, so bail out before compiling or launching anything.
    Nx == 0 && return nothing

    gpukernel = @cuda launch=false kernel_pairs_calk!(buff, pairs)
    config = launch_configuration(gpukernel.fun)

    # Threads per block, capped by the request, the device limit and the work size.
    Tx = min(minthreads, config.threads, Nx)
    # Enough blocks to cover every element (the last block may be partly idle).
    Bx = cld(Nx, Tx)

    return CUDA.@sync gpukernel(buff, pairs; threads = Tx, blocks = Bx)
end
"""
    kernel_pairs_calk!(buff, pairs)

GPU kernel for the struct-of-arrays layout. Every thread whose global 1-based
index lies within `length(pairs)` stores fixed constants at that index in each
of the four arrays `buff[1]` … `buff[4]`. The contents of `pairs` are not read;
only its length bounds the writes.
"""
function kernel_pairs_calk!(buff, pairs)
    # Global 1-based thread index along the x dimension of the grid.
    index = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x

    # Threads past the end of `pairs` have nothing to write.
    index > length(pairs) && return nothing

    buff[1][index] = (1.2, 2.3)
    buff[2][index] = (4.5, 6.7)
    buff[3][index] = 0.1
    buff[4][index] = 0.7
    return nothing
end
"""
    pairs_calk2!(buff, pairs; minthreads::Int = 1024)

Compile `kernel_pairs_calk2!` (with `maxregs=64`) for `buff` and `pairs`, launch
it with at least one GPU thread per element of `pairs`, and wait for completion
via `CUDA.@sync`.

# Keywords
- `minthreads`: despite its name, this acts as an upper bound on the threads per
  block. The block size used is the smallest of `minthreads`, the thread count
  reported by `launch_configuration`, and `length(pairs)`.

When `pairs` is empty there is nothing to launch and the function returns
`nothing` immediately.
"""
function pairs_calk2!(buff, pairs; minthreads::Int = 1024)
    # NOTE(review): the launch is sized by `length(pairs)`, while the kernel
    # bounds-checks against `length(buff)`; confirm both are meant to have the
    # same length, otherwise the tail of `buff` is left unwritten.
    Nx = length(pairs)
    # With no elements the block size would be 0 and `cld(Nx, Tx)` would throw
    # a `DivideError`, so bail out before compiling or launching anything.
    Nx == 0 && return nothing

    gpukernel = @cuda launch=false maxregs=64 kernel_pairs_calk2!(buff, pairs)
    config = launch_configuration(gpukernel.fun)

    # Threads per block, capped by the request, the device limit and the work size.
    Tx = min(minthreads, config.threads, Nx)
    # Enough blocks to cover every element (the last block may be partly idle).
    Bx = cld(Nx, Tx)

    return CUDA.@sync gpukernel(buff, pairs; threads = Tx, blocks = Bx)
end
"""
    kernel_pairs_calk2!(buff, pairs)

GPU kernel for the array-of-structs layout. Every thread whose global 1-based
index lies within `length(buff)` stores one fixed `MyStr{Float64}` value at that
index of `buff`. The `pairs` argument is accepted but not used in the body.
"""
function kernel_pairs_calk2!(buff, pairs)
    # Global 1-based thread index along the x dimension of the grid.
    index = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x

    # Threads past the end of `buff` have nothing to write.
    if index > length(buff)
        return nothing
    end

    buff[index] = MyStr{Float64}(0.1, 0.2, 0.3, 0.4, 0.9, 1.2)
    return nothing
end
# Number of elements used by both benchmarks.
PN = 10000

# Struct-of-arrays layout: two arrays of Float64 pairs plus two Float64 arrays.
buff = (
    CUDA.fill((0.0, 0.0), PN),
    CUDA.fill((0.0, 0.0), PN),
    CUDA.zeros(Float64, PN),
    CUDA.zeros(Float64, PN),
)
# Only the length of `pairs` is used to size the launches below.
pairs = CUDA.zeros(PN)

@benchmark pairs_calk!($buff, $pairs; minthreads = 1024)
# ~ 20.639 μs

# Array-of-structs layout: a single array whose elements are `MyStr{Float64}`.
buff2 = CuArray{MyStr{Float64}}(undef, PN)

@benchmark pairs_calk2!($buff2, $pairs; minthreads = 1024)
# ~ 25.068 μs
Is this the expected behavior?