How to precompile the CUDA kernel itself?

Here’s a way to use a cached precompiled kernel. It assumes you have ptxas in your PATH.
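If you want to fail early when ptxas is missing, a small guard like this should work:

# Fail early if ptxas cannot be found on the PATH
Sys.which("ptxas") === nothing && error("ptxas not found; add the CUDA toolkit's bin directory to your PATH")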
In all likelihood there exist more appropriate methods in CUDA.jl. For example, I’m pretty sure cudacall should be preferred over my HostKernel approach, but I couldn’t get it to work. Check out

and

for some more information.
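
For reference, cudacall usage for a plain C-style kernel looks roughly like this (a sketch adapted from the CUDA.jl driver-API docs; vadd.cubin and kernel_vadd are hypothetical, and the Julia-generated kernel below additionally takes a hidden KernelState argument, which may be why I couldn't get cudacall to work here):

using CUDA

md = CuModule(read("vadd.cubin"))    # cubin containing a C-style `kernel_vadd`
vadd = CuFunction(md, "kernel_vadd")

ad = CUDA.rand(Float32, 1024)
bd = CUDA.rand(Float32, 1024)
cd = similar(ad)

# cudacall converts each argument to the declared type and launches the function
cudacall(vadd, Tuple{CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat}},
         ad, bd, cd; threads=256, blocks=cld(1024, 256))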


(…)/Test/src/Test.jl:

module Test

using PrecompileTools: @setup_workload, @compile_workload
using CUDA
using CUDA: i32

export mycopy!

function kernel!(A, B)
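    # Global 1-based thread index; the i32 literal keeps the arithmetic in Int32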
    i = threadIdx().x + blockDim().x * (blockIdx().x - 1i32)
    if checkbounds(Bool, A, i) && checkbounds(Bool, B, i)
        A[i] = B[i]
    end
    nothing
end

function get_func_name_from_ptx(ptx_path)
    # The PTX contains a comment "// -- Begin function <func_name>" which ends
    # with a newline; extract <func_name> from it.
    ptx_code = read(ptx_path, String)
    m = match(r"// -- Begin function (.*)\n", ptx_code)
    return String(m.captures[1])
end

function get_compiled_kernel(cubin_path, name, kernel_tt)
    # Load the cubin and look up the kernel by its mangled name
    mdl = CuModule(read(cubin_path))
    func = CuFunction(mdl, name)
    # Wrap the raw CuFunction in a HostKernel so it can be launched like a
    # kernel compiled by @cuda. Note: HostKernel and KernelState are CUDA.jl
    # internals, so this may break across CUDA.jl versions.
    return CUDA.HostKernel{typeof(kernel!), kernel_tt}(kernel!, func, CUDA.KernelState(CUDA.create_exceptions!(mdl), UInt32(0)))
end

function mycopy!(A, B, cache_dir="E:/Temp/kernel/")  # Adjust default
    len = length(A)
    @assert len === length(B)
    threads = 512
    blocks = cld(len, threads)
	
    ptx_path = joinpath(cache_dir, "kernel.ptx")
    cubin_path = joinpath(cache_dir, "kernel.cubin")
    func_name_path = joinpath(cache_dir, "name.txt")
    if !ispath(ptx_path)
        # Compile to disk
        mkpath(cache_dir)
        open(ptx_path, "w") do io
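            # Note: @device_code_ptx also runs the @cuda launch while dumping the PTX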
            @device_code_ptx io @cuda threads=threads blocks=blocks kernel!(A, B)
        end
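        # Cache the kernel's mangled name so later runs can skip parsing the PTX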
        func_name = get_func_name_from_ptx(ptx_path)
        open(func_name_path, "w") do io
            write(io, func_name)
        end
        sm = CUDA.capability(CUDA.CuDevice(0))
        run(`ptxas -arch=sm_$(sm.major)$(sm.minor) --output-file $cubin_path $ptx_path`)  # compile ptx to cubin
    end
    # kernel.ptx, kernel.cubin and name.txt should now exist
    func_name = read(func_name_path, String)
    kernel_args = map(CUDA.cudaconvert, (A, B))  # (cf. @cuda macro)
    kernel_tt = Tuple{map(Core.Typeof, kernel_args)...}
    kern = get_compiled_kernel(cubin_path, func_name, kernel_tt)
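    # HostKernel is callable: this converts the arguments and launches func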
    kern(A, B, threads=threads, blocks=blocks)
end

@setup_workload begin
    A = CUDA.zeros(100, 100, 100)
    B = CUDA.zeros(100, 100, 100)
    @compile_workload begin
        CUDA.@sync mycopy!(A, B)
    end
end

end

REPL output:

(@v1.10) pkg> activate Test
  Activating project at `(...)\Test`

(Test) pkg> ^C

julia> using Test
Precompiling Test
  1 dependency successfully precompiled in 11 seconds. 69 already precompiled.
[ Info: Precompiling Test [98d22206-c062-46eb-91c7-b4da6428f19f]

julia> using CUDA

julia> A = CUDA.zeros(100, 100, 100); B = CUDA.ones(100, 100, 100);

julia> @time CUDA.@sync mycopy!(A, B)
  0.001473 seconds (115 allocations: 1.007 MiB)

julia> @time CUDA.@sync mycopy!(A, B)
  0.000652 seconds (115 allocations: 1.007 MiB)

julia> CUDA.@allowscalar A[1]
1.0f0

You could make subsequent runs a bit more efficient by keeping mdl and func_name in memory instead of rereading them from disk every time. But in the grand scheme of things, I assume this will be negligible.
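
If you do want that, something along these lines should work (a sketch; KERNEL_CACHE and get_cached_kernel are names I made up):

# Hypothetical in-memory cache, keyed by the kernel's argument-type tuple
const KERNEL_CACHE = Dict{Type,CUDA.HostKernel}()

function get_cached_kernel(cubin_path, func_name_path, kernel_tt)
    get!(KERNEL_CACHE, kernel_tt) do
        # First request for this signature: read the name and cubin from disk once
        get_compiled_kernel(cubin_path, read(func_name_path, String), kernel_tt)
    end
end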