Thanks. Such diagrams are always useful.
But regarding the code, could you provide it in a form that we can run just by copy-pasting? I.e., something of the form
using CUDA, BenchmarkTools
(...)
const Ti = Int32
const Tf = Float64
const Δ::Tf = Tf(1e-2) # (we don't know what your df contains)
(...)
function kernel_comp_v_noshmem!(...)
(...)
end
function kernel_comp_v!(...)
(...)
end
v = (...) # e.g. CUDA.rand(...)
(...)
display(@benchmark CUDA.@sync begin
kernel_comp_v_noshmem!($v_temp, $v, $F; threads = block_dim, blocks = grid_dim)
end)
display(@benchmark CUDA.@sync begin
comp_v!($v_temp, $v, $F; threads = block_dim, blocks = grid_dim)
end)
See also (point 4 in)