Hi,
When the CPU performs compute-intensive work it never yields, so Threads.@spawn can be used to assign the CPU and GPU operations to different threads, overlapping them and shortening the overall running time. However, can the CPU-side function itself be a multi-process function?
I ran the tests below and found that when the CPU function is a multi-process function, the overlap is poor — only part of the time is actually overlapped.
-
My question is whether overlapping GPU and CPU operations with Threads.@spawn only gives good results when the CPU function is serial.
-
If the CPU function is a multi-process parallel function, do I need to perform additional optimizations to achieve good overlap?
Thanks!
using CUDA
using BenchmarkTools
using Distributed
addprocs(2)                          # start two worker processes for the Distributed CPU path
ngpu = 10000                         # GPU problem size: ngpu x ngpu matrix
ncpu = 3000                          # CPU problem size: ncpu x ncpu matrix
Acpu = rand(Float64,ncpu,ncpu)       # CPU input matrix (host memory)
Bcpu = rand(Float64,ncpu)            # CPU input vector
Agpu = CUDA.rand(Float64,ngpu,ngpu)  # GPU input matrix (device memory)
Bgpu = CUDA.rand(Float64,ngpu)       # GPU input vector
Cgpu1 = CUDA.zeros(Float64,ngpu)     # GPU output vector; NOTE(review): never reset between kernel launches, so repeated launches keep accumulating into it
#GPU kernel: one thread per output row; accumulates Agpu*Bgpu into Cgpu.
#Cgpu is not zeroed here, so each launch adds another Agpu*Bgpu on top of it.
function MatrixVectorMul!(Agpu, Bgpu, Cgpu)
    it = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    nrow = size(Agpu, 1)
    # BUG FIX: the reduction runs over the columns of Agpu. The original used
    # size(Agpu, 1) for the loop bound, which is only correct for square matrices.
    ncol = size(Agpu, 2)
    if it > nrow
        return
    end
    # Accumulate the dot product in a register instead of re-reading and
    # re-writing global memory Cgpu[it] on every loop iteration. (Float
    # rounding order changes slightly; the mathematical result is the same.)
    acc = zero(eltype(Cgpu))
    for i = 1:ncol
        acc += Agpu[it, i] * Bgpu[i]
    end
    Cgpu[it] += acc
    return
end
#CPU multi-process function: computes Acpu*Bcpu by splitting the columns of
#Acpu (and the matching entries of Bcpu) across all available workers and
#summing the partial products.
function MatrixVectorMulcpumultiprocess(Acpu, Bcpu)
    w = workers()
    chunknum = length(w)
    # BUG FIX: the chunks slice the SECOND (column) dimension below, so the
    # chunk length must come from size(Acpu, 2). The original used
    # size(Acpu, 1), which only happened to work for square matrices.
    ncol = size(Acpu, 2)
    chunklen = cld(ncol, chunknum)
    # Generalized from exactly two hard-coded workers (w[1], w[2]) to any
    # number of workers. Each worker receives a copy of its column block and
    # the matching slice of Bcpu (copies serialize cleanly to the worker).
    futures = map(enumerate(w)) do (k, pid)
        lo = (k - 1) * chunklen + 1
        hi = min(k * chunklen, ncol)
        @spawnat pid MatrixVectorMulcpumultiprocesssub(Acpu[:, lo:hi], Bcpu[lo:hi])
    end
    # Each worker returns a length-size(Acpu, 1) partial result; sum them.
    return reduce(+, fetch.(futures))
end
#Per-worker kernel: dense matrix-vector product, returns Acpu*Bcpu.
#Defined with @everywhere so it is available on every worker process.
@everywhere function MatrixVectorMulcpumultiprocesssub(Acpu, Bcpu)
    ncol = size(Acpu, 2)
    nrow = size(Acpu, 1)
    # Generalize the result element type instead of hard-coding Float64.
    Ccpu = zeros(promote_type(eltype(Acpu), eltype(Bcpu)), nrow)
    # Column-major friendly: the inner loop walks down one column of Acpu.
    for i = 1:ncol
        b = Bcpu[i]  # hoist the loop-invariant load out of the inner loop
        for j = 1:nrow
            Ccpu[j] += Acpu[j, i] * b
        end
    end
    return Ccpu
end
#the overlapping operation: launch the GPU loop and the multi-process CPU loop
#as two tasks under @sync, so @btime measures the time until BOTH finish.
#NOTE(review): the globals (Agpu, Acpu, ...) are not $-interpolated into
#@btime, so the measurement includes untyped-global access overhead.
@btime @sync begin
# GPU part running time: 4.132s
# CUDA.@sync blocks only this task (it yields to the scheduler), which is
# what lets the CPU task make progress in the meantime.
Threads.@spawn begin
for i = 1:1000
CUDA.@sync @cuda(
threads = 256,
blocks = cld(size(Agpu,1),256),
MatrixVectorMul!(Agpu,Bgpu,Cgpu1)
)
end
end
# CPU part running time:3.022s
# NOTE(review): each call slices Acpu (an allocation/copy on this task) and
# presumably serializes the chunks to the worker processes before they can
# start — that setup work is serial and may be what limits the overlap.
Threads.@spawn begin
for i = 1:30
Ccpu = MatrixVectorMulcpumultiprocess(Acpu,Bcpu)  # result is discarded each iteration
end
end
end
#overall running time: 6.138s
#overlapping time: (4.132s+3.022s) - 6.138s = 1.016s
#the overlapping effect is not good
#CUDA: 4.4.0
#julia: 1.8.5