Thank you very much for the explanations and the code examples. I am a beginner in Julia and not yet familiar with many of its acceleration techniques, so in the end I wrote the kernel in CUDA C, implementing the norm computation and the simple vector (BLAS level-1) operations myself; these vector operations are not difficult. I then compiled the kernel to PTX and called it from Julia. The results match the CPU version to within rounding error, and the GPU is faster once the problem dimension and the number of blocks are large enough. Here is the CUDA source:
```cpp
#include <thrust/device_vector.h>
#include <thrust/fill.h>

#define positive_zero 1e-20
#define negative_zero -1e-20
#define proj_rel_tol 1e-14
#define proj_abs_tol 1e-16

// Each cone is stored as sol = (t, x): n is the total length of the vector,
// including the leading element t, and len = n - 1 is the length of the tail x.

// BLAS-style helper functions
__device__ double nrm2(long *n, double *x)
{
    // Euclidean norm of the vector x
    double norm = 0.0;
    for (long j = 0; j < *n; ++j)
    {
        norm += x[j] * x[j];
    }
    return sqrt(norm);
}
// END OF BLAS-STYLE HELPER FUNCTIONS

// Project sol = (t, x) onto the second-order cone {(t, x) : ||x|| <= t}.
__device__ void soc_proj(double *sol, long *n)
{
    long len = *n - 1;
    double norm = nrm2(&len, &sol[1]);
    double t = sol[0];
    if (norm <= -t)
    {
        // Inside the polar cone: the projection is the origin.
        for (long j = 0; j < *n; ++j)
        {
            sol[j] = 0.0;
        }
    }
    else if (norm <= t)
    {
        // Already inside the cone: leave sol unchanged.
    }
    else
    {
        // Otherwise scale (t, x) onto the cone boundary.
        double c = (1.0 + t / norm) / 2.0;
        sol[0] = norm * c;
        for (long j = 1; j < *n; ++j)
        {
            sol[j] = sol[j] * c;
        }
    }
}

extern "C" __global__ void
soc_proj_kernel(double *arr, const long *gpu_head_start, const long *ns, int blkNum)
{
    // One thread per cone projection.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < blkNum)
    {
        long n = ns[i];
        double *sol = arr + gpu_head_start[i];
        soc_proj(sol, &n);
    }
}
```
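Both the kernel above and the Julia function below implement the standard Euclidean projection onto the second-order cone $\mathcal{K} = \{(t, x) : \|x\|_2 \le t\}$:

$$
\Pi_{\mathcal{K}}(t, x) =
\begin{cases}
(0, 0) & \text{if } \|x\|_2 \le -t,\\
(t, x) & \text{if } \|x\|_2 \le t,\\
\dfrac{\|x\|_2 + t}{2}\left(1, \dfrac{x}{\|x\|_2}\right) & \text{otherwise,}
\end{cases}
$$

which matches the scaling in the code, since $c = (1 + t/\|x\|_2)/2$ gives $\|x\|_2\,c = (\|x\|_2 + t)/2$.

The kernel is compiled to PTX before the Julia script runs. A minimal build step, assuming the source above is saved as `soc_proj.cu` (the file names are placeholders for my setup):

```julia
# Hypothetical build step: compile the CUDA source to PTX with nvcc.
# Add `-arch=sm_XX` for a specific target architecture if needed.
run(`nvcc -ptx soc_proj.cu -o soc_proj.ptx`)
```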
The test script, CPU reference first and then the PTX kernel:

```julia
using Pkg
Pkg.activate("test_cpu_env")
using Random
using CUDA
using LinearAlgebra

const rpdhg_float = Float64

# Reference CPU implementation of the second-order cone projection.
function soc_proj!(sol::AbstractVector{rpdhg_float})
    t = sol[1]
    x = @view sol[2:end]
    nrm_x = LinearAlgebra.norm(x)
    if nrm_x <= -t
        sol .= 0.0
    elseif nrm_x <= t
        return
    else
        c = (1.0 + t/nrm_x)/2.0
        sol[1] = nrm_x * c
        x .*= c   # in place; avoids the temporary allocated by x * c
    end
end
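# Hypothetical sanity check (a small example of my own): for t = 1, x = (3, 4),
# the projection is ((‖x‖ + t)/2) * (1, x/‖x‖) = (3.0, 1.8, 2.4).
demo = [1.0, 3.0, 4.0]
soc_proj!(demo)
@assert demo ≈ [3.0, 1.8, 2.4]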
# random seed
for seed in 1:2
Random.seed!(seed)
## cpu version
blkNum = 10000
nEveryBlk = 10000
n = nEveryBlk * blkNum
vec_cpu_before_proj = rand(n)
vec_cpu_before_proj_copy = deepcopy(vec_cpu_before_proj)
len_blk = fill(nEveryBlk, blkNum)
# 0-based offset of each cone in the flat vector,
# e.g. len_blk = [3, 2, 4] gives offsets [0, 3, 5].
cum_sum_len_blk = cumsum(len_blk) .- len_blk
@assert sum(len_blk) == n
# One view per cone into the CPU vector.
sub_vec_list = [@view vec_cpu_before_proj[cum_sum_len_blk[i]+1:cum_sum_len_blk[i]+len_blk[i]] for i in 1:blkNum]
time_start = time()
for sv in sub_vec_list
    soc_proj!(sv)
end
time_end = time()
println("cpu time: ", time_end - time_start)
ptx_path = "./code/nvcc_test/answer/soc_proj.ptx"
func_name = "soc_proj_kernel"
cumod = CuModule(read(ptx_path))   # avoid naming this `mod`, which shadows Base.mod
kernel = CuFunction(cumod, func_name)
nThread = 256
nBlock = cld(blkNum, nThread)   # one thread per cone, so blkNum threads suffice
adevice = CuArray(vec_cpu_before_proj_copy)
gpu_head_start = CuArray(cum_sum_len_blk)
gpu_ns = CuArray(len_blk)
time_start = time()
CUDA.cudacall(
    kernel,
    (CuPtr{Float64}, CuPtr{Int64}, CuPtr{Int64}, Cint),   # the kernel declares `int blkNum`
    adevice, gpu_head_start, gpu_ns, Cint(blkNum);
    blocks = nBlock, threads = nThread
)
# sync
CUDA.synchronize()
time_end = time()
println("gpu time: ", time_end - time_start)
result = Array(adevice)
println("result ≈ vec_cpu_before_proj in norm: ", norm(result - vec_cpu_before_proj, Inf))
@assert norm(result - vec_cpu_before_proj, Inf) < 1e-9
end
```
One sample run:

```text
cpu time: 0.19637703895568848
gpu time: 0.01563096046447754
result ≈ vec_cpu_before_proj in norm: 1.4921397450962104e-13
```
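For completeness, the same projection can also be written directly as a CUDA.jl kernel, which skips the nvcc/PTX round-trip entirely. This is only a sketch under my assumptions; the kernel name `soc_proj_kernel!` and the launch below are mine, not part of the PTX version above:

```julia
using CUDA

# Sketch: the second-order cone projection as a native CUDA.jl kernel.
function soc_proj_kernel!(arr, head_start, ns, blkNum)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= blkNum
        off = head_start[i]          # 0-based offset of this cone
        n = ns[i]
        s = 0.0
        for j in 2:n                 # squared norm of the tail x
            s += arr[off + j]^2
        end
        nrm = sqrt(s)
        t = arr[off + 1]
        if nrm <= -t                 # polar-cone case: project to the origin
            for j in 1:n
                arr[off + j] = 0.0
            end
        elseif nrm > t               # scale (t, x) onto the cone boundary
            c = (1.0 + t / nrm) / 2.0
            arr[off + 1] = nrm * c
            for j in 2:n
                arr[off + j] *= c
            end
        end
    end
    return
end

# Launch with one thread per cone, mirroring the PTX version:
d_arr = CuArray(vec_cpu_before_proj_copy)
d_head = CuArray(cum_sum_len_blk)
d_ns = CuArray(len_blk)
@cuda threads=256 blocks=cld(blkNum, 256) soc_proj_kernel!(d_arr, d_head, d_ns, blkNum)
```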