The equivalent code in Julia would be very similar:
function nrm2(x)
# calculate the norm of the vector x
sq_norm = 0.0f0
for j in eachindex(x) # or j = 1:length(x)
@inbounds sq_norm += x[j] * x[j]
end
return sqrt(sq_norm)
end
function soc_proj(sol)
@inbounds norm = nrm2(@view(sol[2:end]))
@inbounds t = sol[1]
if norm <= -t
for j in eachindex(sol)
@inbounds sol[j] = 0.0f0;
end
elseif norm <= t
# Do nothing, continue with the next iteration
else
c = (1.0f0 + t / norm) / 2.0f0;
@inbounds sol[1] = norm * c;
for j = 2:length(sol)
@inbounds sol[j] = sol[j] * c;
end
end
end
function soc_proj_kernel(arr, gpu_head_start, ns, blkNum)
i = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x;
# one thread per cone projection
if (i <= blkNum)
@inbounds n = ns[i]
@inbounds @views sol = (arr[gpu_head_start[i] : gpu_head_start[i] + n - 1i32])
soc_proj(sol)
end
return
end
function soc_proj_const_scale_all!(data, slice_lengths, slice_starts)
# data::CuVector{Float32}, slice_lengths::CuVector{Int32}, slice_starts::CuVector{Int32}
nb_threads = 256
@cuda blocks=cld(length(data), nb_threads) threads=nb_threads soc_proj_kernel(data, slice_starts, slice_lengths, length(slice_starts))
end
Basically, use 1-indexing and replace the pointer arithmetic with @views
. (I’ve also switched to Float32
, but maybe your beefy GPU handles double precision well.) The @inbounds
for removing bound checks are not necessary, but they can increase performance. (Alternatively, run Julia with --check-bounds=no
.)
This is not the best way to get an accurate execution time measurement, as you only take a single sample (which might be affected by random ‘noise’, and also by compilation time). Instead, prefer to use BenchmarkTools.jl, Chairmarks.jl, or CUDA.jl specific methods such as CUDA.@bprofile
.
If I compare all methods we discussed, your last approach turns out to be fastest. (I’m guessing my previous suggestion allocates too much.) Here are my benchmark results:
GPU Device Name: NVIDIA GeForce RTX 3070
GPU Compute Capability: 8.6.0
UNIFORM DATA: 10_000 slices of length 10_000
--------------------------------------------
GPU v1 (@threads, kernel launch per slice)
BenchmarkTools.Trial: 2 samples with 1 evaluation.
Range (min … max): 2.468 s … 2.477 s ┊ GC (min … max): 0.00% … 0.00%
Time (median): 2.473 s ┊ GC (median): 0.00%
Time (mean ± σ): 2.473 s ± 6.297 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
█ █
█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
2.47 s Histogram: frequency by time 2.48 s <
Memory estimate: 16.74 MiB, allocs estimate: 681105.
GPU v2 (stack, vectorised code)
BenchmarkTools.Trial: 14 samples with 1 evaluation.
Range (min … max): 14.537 ms … 83.696 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 29.550 ms ┊ GC (median): 0.00%
Time (mean ± σ): 36.323 ms ± 19.195 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
▄█ ▁
▆▁▁▁▁▁▁▁▁▁▁██▆▆▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▆▁▁▁▁▁▆ ▁
14.5 ms Histogram: frequency by time 83.7 ms <
Memory estimate: 189.73 KiB, allocs estimate: 3685.
GPU v3 (GPU threads corresponding to data elements)
BenchmarkTools.Trial: 35 samples with 1 evaluation.
Range (min … max): 25.561 ms … 29.311 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 26.590 ms ┊ GC (median): 0.00%
Time (mean ± σ): 26.780 ms ± 688.798 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▁█ ▄▁█▁▄▁ ▁
▆▁▁▁▁▆▁▁▁▁▁██▆██████▆▆▆▆▁▁▆▁▁▁▁▁▁▆▁▆▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▆ ▁
25.6 ms Histogram: frequency by time 29.3 ms <
Memory estimate: 46.73 KiB, allocs estimate: 1733.
GPU v4 (GPU threads corresponding to slices)
BenchmarkTools.Trial: 39 samples with 1 evaluation.
Range (min … max): 10.042 ms … 11.880 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 10.833 ms ┊ GC (median): 0.00%
Time (mean ± σ): 10.842 ms ± 616.806 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
██ ▁ ▁ ▁ ▁ █
██▆▆▆▁▁▁▁▁▁▆▆▁▁▆▆▁▁▆█▆▁▁▁▆▆▁▁▆▆▁▁▆▁█▁▁▁▁█▁▁▆▁▁▆▁█▁▁█▁▁▁▆▁▆▁▆ ▁
10 ms Histogram: frequency by time 11.9 ms <
Memory estimate: 912 bytes, allocs estimate: 39.
CPU v1 (@threads, CPU threads corresponding to slices)
BenchmarkTools.Trial: 31 samples with 1 evaluation.
Range (min … max): 25.889 ms … 29.299 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 26.575 ms ┊ GC (median): 0.00%
Time (mean ± σ): 26.801 ms ± 897.141 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▃ ▃██ ▃ ▃▃▃
█▁▇███▁█▇▁▁▇███▁▇▇▁▁▁▇▁▁▁▁▇▁▁▇▁▁▁▁▁▇▁▇▁▁▁▁▁▇▁▁▁▁▁▁▁▁▁▁▁▇▁▁▁▇ ▁
25.9 ms Histogram: frequency by time 29.3 ms <
Memory estimate: 4.86 KiB, allocs estimate: 41.
NON-UNIFORM DATA: 10_000 slices of length between 1 and 20_001
--------------------------------------------------------------
GPU v1 (@threads, kernel launch per slice)
BenchmarkTools.Trial: 2 samples with 1 evaluation.
Range (min … max): 2.412 s … 2.500 s ┊ GC (min … max): 0.00% … 0.00%
Time (median): 2.456 s ┊ GC (median): 0.00%
Time (mean ± σ): 2.456 s ± 62.486 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
█ █
█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
2.41 s Histogram: frequency by time 2.5 s <
Memory estimate: 16.74 MiB, allocs estimate: 681059.
GPU v3 (GPU threads corresponding to data elements)
BenchmarkTools.Trial: 35 samples with 1 evaluation.
Range (min … max): 25.111 ms … 27.938 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 25.557 ms ┊ GC (median): 0.00%
Time (mean ± σ): 25.969 ms ± 791.694 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▁ █ █ ▄ ▁ ▁ ▁
█▆█▁▆█▆█▆▆▆▆▁▁▆▁▁▁▁▁▁▁▁▁▁▁▁▆▆▁█▁▆█▁█▆▁▆▁▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▆▁▁▁▆ ▁
25.1 ms Histogram: frequency by time 27.9 ms <
Memory estimate: 46.73 KiB, allocs estimate: 1733.
GPU v4 (GPU threads corresponding to slices)
BenchmarkTools.Trial: 39 samples with 1 evaluation.
Range (min … max): 9.728 ms … 12.006 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 10.555 ms ┊ GC (median): 0.00%
Time (mean ± σ): 10.434 ms ± 448.279 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▂ ▂▂█ ▅
█▅▅▅▅▅▅▅▁▁▁█▁▁▅▅▁▅▁▁████▅█▅▁▅▅▅▁▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅ ▁
9.73 ms Histogram: frequency by time 12 ms <
Memory estimate: 912 bytes, allocs estimate: 39.
CPU v1 (@threads, CPU threads corresponding to slices)
BenchmarkTools.Trial: 31 samples with 1 evaluation.
Range (min … max): 25.818 ms … 29.608 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 26.733 ms ┊ GC (median): 0.00%
Time (mean ± σ): 27.064 ms ± 979.965 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▁ ▁ █ ▁ ▁▁ ▁
▆▁▆▁▆▁█▆█▁▁▆█▆█▆▆▁▁██▁▆▁▁▁▁▆█▆▁▁▁▁▁▁▁▁▁▁▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▆▁▁▆▆ ▁
25.8 ms Histogram: frequency by time 29.6 ms <
Memory estimate: 4.86 KiB, allocs estimate: 41.
The CPU is an i7-7700K.
Full code
using CUDA
using CUDA: i32
using LinearAlgebra
using Base.Threads
using BenchmarkTools
function generate_data(nb_slices=1000, min_slice_length=500, max_slice_length=1500)
slice_lengths_cpu = [rand(min_slice_length:max_slice_length) for _ in 1:nb_slices]
slice_lengths = CuArray(slice_lengths_cpu)
slice_lengths_cs_cpu = cumsum(slice_lengths_cpu)
data = CUDA.rand(slice_lengths_cs_cpu[end]) # (or sum(slice_lengths_cpu) )
data_cpu = Array(data)
slices = [i == 1 ? @view(data[1:slice_lengths_cs_cpu[1]]) : @view(data[slice_lengths_cs_cpu[i - 1] + 1 : slice_lengths_cs_cpu[i]]) for i in eachindex(slice_lengths_cpu)]
slices_cpu = [i == 1 ? @view(data_cpu[1:slice_lengths_cs_cpu[1]]) : @view(data_cpu[slice_lengths_cs_cpu[i - 1] + 1 : slice_lengths_cs_cpu[i]]) for i in eachindex(slice_lengths_cpu)]
slice_starts_cpu = Array{Int32}(undef, length(slices_cpu))
slice_starts_cpu[1] = 1i32
slice_starts_cpu[2:end] .= @view(slice_lengths_cs_cpu[1:end-1]) .+ 1i32
slice_starts = CuArray(slice_starts_cpu)
return data, slices, slice_lengths, slice_starts, data_cpu, slices_cpu, slice_lengths_cpu, slice_starts_cpu
end
function soc_proj_const_scale!(sol::AbstractVector)
x2end = @view sol[2:end]
t = sol[1]
nrm_x = norm(x2end)
if nrm_x <= -t
fill!(sol, 0.0)
return
elseif nrm_x <= t
return
else
c = (1.0 + t/nrm_x)/2.0
sol[1] = nrm_x * c
x2end .*= c
return
end
end
function soc_proj_const_scale!(sol::CuVector)
x2end = @view sol[2:end]
CUDA.@allowscalar t = sol[1]
nrm_x = CUDA.norm(x2end)
if nrm_x <= -t
CUDA.fill!(sol, 0.0)
return
elseif nrm_x <= t
return
else
c = (1.0 + t/nrm_x)/2.0
CUDA.@allowscalar sol[1] = nrm_x * c
x2end .*= c
return
end
end
function soc_proj_const_scale_all!(slices)
@threads for s in slices
soc_proj_const_scale!(s)
end
end
function soc_proj_const_scale_all!(solm::CuMatrix)
x2ends = @view solm[2:end, :]
ts = @view solm[1, :]
nrm_xs = sqrt.(mapreduce(t -> t^2, +, x2ends, dims=1)) |> vec
solm[:, nrm_xs .<= -ts] .= 0
b = nrm_xs .> ts
ts_b = ts[b] # No view as cs_b line below will error
nrm_xs_b = @view nrm_xs[b]
cs_b = (1.f0 .+ ts_b ./ nrm_xs_b) ./ 2
# Note that on the GPU you want to use 32-bit floats.
# (If solm uses Float64, 1.f0 will be promoted to Float64 anyway.)
solm[1, b] .= nrm_xs_b .* cs_b
solm[2:end, b] .*= cs_b' # e.g. 999 x 123 .* 1 x 123
end
function soc_proj_const_scale_all!(data, slice_lengths)
# data::CuVector{Float32}, slice_lengths::CuVector{Int32}
data_sq = data .^ 2
cs_data_sq = accumulate(+, data_sq)
slice_ends = accumulate(+, slice_lengths) # slices[i][end] == data[slice_ends[i]]
slice_starts = CuVector{Int32}(undef, length(slice_lengths)) # slices[i][begin] == data[slice_starts[i]]
CUDA.@allowscalar slice_starts[1] = 1i32
@inbounds slice_starts[2:end] .= @view(slice_ends[1:end-1]) .+ 1i32
norms = CuVector{Float32}(undef, length(slice_lengths)) # When we're done: norms[i] == norm(slices[i][2:end]) (possibly some degree of numerical errors though)
@inbounds CUDA.@allowscalar norms[1] = sqrt.(cs_data_sq[slice_ends[1]] - data_sq[1])
@inbounds @views norms[2:end] .= sqrt.(cs_data_sq[slice_ends[2:end]] .- cs_data_sq[slice_ends[1:end-1]] .- data_sq[slice_starts[2:end]])
data_idx_is_slice_start = CUDA.zeros(Int32, length(data))
@inbounds data_idx_is_slice_start[slice_starts] .= 1i32
data_slice_indices = accumulate(+, data_idx_is_slice_start) # If data index i belongs to slice j, we have data_slice_indices[i] == j
@inbounds ts = @view data[slice_starts]
# Find optimal number of threads per block
kernel = @cuda launch=false soc_proj_const_scale_kernel!(data, ts, norms, data_idx_is_slice_start, data_slice_indices)
config = launch_configuration(kernel.fun)
@cuda blocks=max(cld(length(data), config.threads), config.blocks) threads=config.threads soc_proj_const_scale_kernel!(data, ts, norms, data_idx_is_slice_start, data_slice_indices)
# The garbage collector is a bit too lazy in our tight benchmark loop, so clean up ourselves.
CUDA.unsafe_free!(data_sq)
CUDA.unsafe_free!(cs_data_sq)
CUDA.unsafe_free!(slice_ends)
CUDA.unsafe_free!(slice_starts)
CUDA.unsafe_free!(norms)
CUDA.unsafe_free!(data_idx_is_slice_start)
CUDA.unsafe_free!(data_slice_indices)
end
function soc_proj_const_scale_kernel!(data, ts, norms, data_idx_is_slice_start, data_slice_indices)
data_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
while data_idx <= length(data)
@inbounds slice_idx = data_slice_indices[data_idx]
nrm = norms[slice_idx]
inv_nrm = 1.f0 / nrm
@inbounds t = ts[slice_idx]
if nrm <= -t
@inbounds data[data_idx] = 0
elseif nrm > t
c = (1.f0 + t * inv_nrm) * 0.5f0
if data_idx_is_slice_start[data_idx] == 1i32
@inbounds data[data_idx] = nrm * c
else
@inbounds data[data_idx] *= c
end
end
data_idx += blockDim().x * gridDim().x
end
return
end
function soc_proj_const_scale_all!(data, slice_lengths, slice_starts)
# data::CuVector{Float32}, slice_lengths::CuVector{Int32}, slice_starts::CuVector{Int32}
nb_threads = 256
@cuda blocks=cld(length(data), nb_threads) threads=nb_threads soc_proj_kernel(data, slice_starts, slice_lengths, length(slice_starts))
end
function nrm2(x)
# calculate the norm of the vector x
sq_norm = 0.0f0
for j in eachindex(x) # or j = 1:length(x)
@inbounds sq_norm += x[j] * x[j]
end
return sqrt(sq_norm)
end
function soc_proj(sol)
@inbounds norm = nrm2(@view(sol[2:end]))
@inbounds t = sol[1]
if norm <= -t
for j in eachindex(sol)
@inbounds sol[j] = 0.0f0;
end
elseif norm <= t
# Do nothing, continue with the next iteration
else
c = (1.0f0 + t / norm) / 2.0f0;
@inbounds sol[1] = norm * c;
for j = 2:length(sol)
@inbounds sol[j] = sol[j] * c;
end
end
end
function soc_proj_kernel(arr, gpu_head_start, ns, blkNum)
i = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x;
# one thread per cone projection
if (i <= blkNum)
@inbounds n = ns[i]
@inbounds @views sol = (arr[gpu_head_start[i] : gpu_head_start[i] + n - 1i32])
soc_proj(sol)
end
return
end
device = CUDA.device()
println("GPU Device Name: ", CUDA.name(device))
println("GPU Compute Capability: ", CUDA.capability(device))
println("\n")
println("UNIFORM DATA: 10_000 slices of length 10_000")
println("-" ^ 44)
println("GPU v1 (@threads, kernel launch per slice)")
display(@benchmark (CUDA.@sync soc_proj_const_scale_all!(slices)) setup=(gen_data_tuple=generate_data(10_000, 10_000, 10_000); slices=gen_data_tuple[2]) teardown=(CUDA.unsafe_free!.(gen_data_tuple[i] for i in (1, 3, 4))))
println("\n\nGPU v2 (stack, vectorised code)")
display(@benchmark (CUDA.@sync soc_proj_const_scale_all!(stacked)) setup=(gen_data_tuple=generate_data(10_000, 10_000, 10_000); slices=gen_data_tuple[2]; stacked=stack(slices)) teardown=(CUDA.unsafe_free!.(gen_data_tuple[i] for i in (1, 3, 4)); CUDA.unsafe_free!(stacked)))
println("\n\nGPU v3 (GPU threads corresponding to data elements)")
display(@benchmark (CUDA.@sync soc_proj_const_scale_all!(data, slice_lengths)) setup=(gen_data_tuple=generate_data(10_000, 10_000, 10_000); data=gen_data_tuple[1]; slice_lengths=gen_data_tuple[3]) teardown=(CUDA.unsafe_free!.(gen_data_tuple[i] for i in (1, 3, 4))))
println("\n\nGPU v4 (GPU threads corresponding to slices)")
display(@benchmark (CUDA.@sync soc_proj_const_scale_all!(data, slice_lengths, slice_starts)) setup=(gen_data_tuple=generate_data(10_000, 10_000, 10_000); data=gen_data_tuple[1]; slice_lengths=gen_data_tuple[3]; slice_starts=gen_data_tuple[4]) teardown=(CUDA.unsafe_free!.(gen_data_tuple[i] for i in (1, 3, 4))))
println("\n\nCPU v1 (@threads, CPU threads corresponding to slices)")
display(@benchmark (CUDA.@sync soc_proj_const_scale_all!(slices)) setup=(gen_data_tuple=generate_data(10_000, 10_000, 10_000); slices=gen_data_tuple[6]) teardown=(CUDA.unsafe_free!.(gen_data_tuple[i] for i in (1, 3, 4))))
println("\n\n")
println("NON-UNIFORM DATA: 10_000 slices of length between 1 and 20_001")
println("-" ^ 62)
println("GPU v1 (@threads, kernel launch per slice)")
display(@benchmark (CUDA.@sync soc_proj_const_scale_all!(slices)) setup=(gen_data_tuple=generate_data(10_000, 1, 20_001); slices=gen_data_tuple[2]) teardown=(CUDA.unsafe_free!.(gen_data_tuple[i] for i in (1, 3, 4))))
println("\n\nGPU v3 (GPU threads corresponding to data elements)")
display(@benchmark (CUDA.@sync soc_proj_const_scale_all!(data, slice_lengths)) setup=(gen_data_tuple=generate_data(10_000, 1, 20_001); data=gen_data_tuple[1]; slice_lengths=gen_data_tuple[3]) teardown=(CUDA.unsafe_free!.(gen_data_tuple[i] for i in (1, 3, 4))))
println("\n\nGPU v4 (GPU threads corresponding to slices)")
display(@benchmark (CUDA.@sync soc_proj_const_scale_all!(data, slice_lengths, slice_starts)) setup=(gen_data_tuple=generate_data(10_000, 1, 20_001); data=gen_data_tuple[1]; slice_lengths=gen_data_tuple[3]; slice_starts=gen_data_tuple[4]) teardown=(CUDA.unsafe_free!.(gen_data_tuple[i] for i in (1, 3, 4))))
println("\n\nCPU v1 (@threads, CPU threads corresponding to slices)")
display(@benchmark (CUDA.@sync soc_proj_const_scale_all!(slices)) setup=(gen_data_tuple=generate_data(10_000, 1, 20_001); slices=gen_data_tuple[6]) teardown=(CUDA.unsafe_free!.(gen_data_tuple[i] for i in (1, 3, 4))))