Delays shown in Nsight Systems between HtoD memcopy and kernel launch when using CUDA.jl

I tried going through and annotating `cufunction` to see why it was taking so long to launch the GPU kernel, and it seems like the calls to `methodinstance` and `cached_compilation` take quite some time. At this point, I'm realizing I'm way too deep down this rabbit hole. So, I'm going to chalk it up to compilation overheads and move on. Thanks!

"""
    cufunction(f, tt=Tuple{}; kwargs...)

Compile `f` for argument tuple type `tt` and return a cached `HostKernel`
callable. Compilation is memoized two ways: `GPUCompiler.cached_compilation`
caches the compiled function object per context, and `_kernel_instances`
caches the wrapping `HostKernel` per `(source, function object, f)` key.

The `NVTX.@mark` calls are profiling annotations (added here to inspect
launch latency in Nsight Systems); they do not affect behavior.

NOTE(review): the original paste had the mid-body comment hard-wrapped mid-word
("we don't need t" / "o think"), which is a syntax error in Julia — rejoined
below into valid comment lines.
"""
function cufunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT}
    cuda = active_state()

    # serialize compilation; presumably protects the caches below — the lock
    # object is declared elsewhere in the file
    Base.@lock cufunction_lock begin
        # compile the function
        cache = compiler_cache(cuda.context)
        NVTX.@mark "compiler_cache"
        source = methodinstance(F, tt)
        NVTX.@mark "methodinstance"
        config = compiler_config(cuda.device; kwargs...)::CUDACompilerConfig
        NVTX.@mark "compiler_config"
        fun = GPUCompiler.cached_compilation(cache, source, config, compile, link)

        NVTX.@mark "cached_compilation"
        # create a callable object that captures the function instance. we don't
        # need to think about world age here, as GPUCompiler already does and
        # will return a different object
        key = (objectid(source), hash(fun), f)
        NVTX.@mark "hash"
        kernel = get(_kernel_instances, key, nothing)
        NVTX.@mark "get"
        if kernel === nothing
            # create the kernel state object
            state = KernelState(create_exceptions!(fun.mod), UInt32(0))
            NVTX.@mark "kernelstate"

            kernel = HostKernel{F,tt}(f, fun, state)
            NVTX.@mark "hostkernel"
            _kernel_instances[key] = kernel
        end
        NVTX.@mark "after if condition"
        # type-assert so callers get a concretely-inferred return type
        return kernel::HostKernel{F,tt}
    end
end