Here’s a way to use a cached precompiled kernel. It assumes you have `ptxas` in your PATH.
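A quick way to check that from Julia:

```julia
# Prints the full path to the ptxas binary, or `nothing` if it's not on the PATH
println(Sys.which("ptxas"))
```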
In all likelihood there exist more appropriate methods in CUDA.jl. For example, I’m pretty sure `cudacall` should be preferred over my `HostKernel` approach, but I couldn’t get it to work. Check out (…) and (…) for some more information.
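For reference, the `cudacall` route would presumably look something like the following, based on the ccall-like pattern in the CUDA.jl docs (`vadd.cubin` and `kernel_vadd` are placeholder names; untested, since as said I couldn’t get this working for kernels generated from Julia code):

```julia
# Hypothetical sketch of launching a cubin-loaded function via cudacall.
# The cubin file and function name below are placeholders.
using CUDA

mdl = CuModule(read("vadd.cubin"))     # assumed precompiled cubin
vadd = CuFunction(mdl, "kernel_vadd")  # assumed function name inside the cubin
ad, bd, cd = CUDA.rand(10), CUDA.rand(10), CUDA.zeros(10)
# ccall-style launch: argument types first, then the values
cudacall(vadd, (CuPtr{Cfloat}, CuPtr{Cfloat}, CuPtr{Cfloat}), ad, bd, cd; threads=10)
```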
`(…)/Test/src/Test.jl`:

```julia
module Test
using PrecompileTools: @setup_workload, @compile_workload
using CUDA
using CUDA: i32
export mycopy!
function kernel!(A, B)
    # Global (1-based) thread index; the i32 literal keeps the index arithmetic in Int32
    i = threadIdx().x + blockDim().x * (blockIdx().x - 1i32)
    if checkbounds(Bool, A, i) && checkbounds(Bool, B, i)
        A[i] = B[i]
    end
    nothing
end
function get_func_name_from_ptx(ptx_path)
    # The generated PTX contains a comment "// -- Begin function <func_name>",
    # terminated by a newline; extract <func_name> from it
    ptx_code = read(ptx_path, String)
    return match(r"// -- Begin function (.*)\n", ptx_code)[1]
end
function get_compiled_kernel(cubin_path, name, kernel_tt)
    # Load the cubin and wrap the raw CuFunction in a HostKernel, so it can be
    # launched like a kernel compiled by @cuda (this relies on CUDA.jl internals)
    mdl = CuModule(read(cubin_path))
    func = CuFunction(mdl, name)
    return CUDA.HostKernel{typeof(kernel!), kernel_tt}(kernel!, func, CUDA.KernelState(CUDA.create_exceptions!(mdl), UInt32(0)))
end
function mycopy!(A, B, cache_dir="E:/Temp/kernel/") # Adjust default
    len = length(A)
    @assert len === length(B)
    threads = 512
    blocks = cld(len, threads)
    ptx_path = joinpath(cache_dir, "kernel.ptx")
    cubin_path = joinpath(cache_dir, "kernel.cubin")
    func_name_path = joinpath(cache_dir, "name.txt")
    if !ispath(ptx_path)
        # Compile to disk
        mkpath(cache_dir)
        open(ptx_path, "w") do io
            @device_code_ptx io @cuda threads=threads blocks=blocks kernel!(A, B)
        end
        func_name = get_func_name_from_ptx(ptx_path)
        open(func_name_path, "w") do io
            write(io, func_name)
        end
        sm = CUDA.capability(CUDA.CuDevice(0))
        run(`ptxas -arch=sm_$(sm.major)$(sm.minor) --output-file $cubin_path $ptx_path`) # compile the PTX to a cubin for this device
    end
    # kernel.ptx, kernel.cubin and name.txt should now exist
    func_name = read(func_name_path, String)
    kernel_args = map(CUDA.cudaconvert, (A, B)) # convert to device-side arguments (cf. the @cuda macro)
    kernel_tt = Tuple{map(Core.Typeof, kernel_args)...}
    kern = get_compiled_kernel(cubin_path, func_name, kernel_tt)
    kern(A, B, threads=threads, blocks=blocks)
end
# Precompilation workload: runs mycopy! once while the package is being
# precompiled, which also fills the on-disk kernel cache if it's missing
@setup_workload begin
    A = CUDA.zeros(100, 100, 100)
    B = CUDA.zeros(100, 100, 100)
    @compile_workload begin
        CUDA.@sync mycopy!(A, B)
    end
end

end # module Test
```
REPL output:
```
(@v1.10) pkg> activate Test
  Activating project at `(...)\Test`

(Test) pkg> ^C

julia> using Test
Precompiling Test
  1 dependency successfully precompiled in 11 seconds. 69 already precompiled.
[ Info: Precompiling Test [98d22206-c062-46eb-91c7-b4da6428f19f]

julia> using CUDA

julia> A = CUDA.zeros(100, 100, 100); B = CUDA.ones(100, 100, 100);

julia> @time CUDA.@sync mycopy!(A, B)
  0.001473 seconds (115 allocations: 1.007 MiB)

julia> @time CUDA.@sync mycopy!(A, B)
  0.000652 seconds (115 allocations: 1.007 MiB)

julia> CUDA.@allowscalar A[1]
1.0f0
```
You could make this a bit more efficient in subsequent runs by keeping `mdl` and `func_name` in memory instead of rereading them from disk every time. But in the grand scheme of things, I assume this will be negligible.
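Something along these lines should do it (untested sketch; `get_compiled_kernel` is the function from the module above):

```julia
# Untested sketch: keep the compiled kernel in memory after the first call,
# instead of re-reading the cubin and name.txt from disk on every launch.
# Assumes a single kernel / argument-type combination.
const CACHED_KERNEL = Ref{Any}(nothing)

function get_compiled_kernel_cached(cubin_path, func_name_path, kernel_tt)
    if CACHED_KERNEL[] === nothing
        func_name = read(func_name_path, String)
        CACHED_KERNEL[] = get_compiled_kernel(cubin_path, func_name, kernel_tt)
    end
    return CACHED_KERNEL[]
end
```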