I tried going through and annotating the cufunction to see why it was taking so long to launch the GPU kernel, and it seems like the calls to methodinstance
and cached_compilation
take quite some time. At this point, I'm realizing I'm way too deep down this rabbit hole. So, I'm going to chalk it up to compilation overheads and move on. Thanks!
"""
    cufunction(f, tt=Tuple{}; kwargs...)

Compile (or fetch from cache) the GPU kernel for calling `f` with argument types
`tt`, returning a cached `HostKernel` callable. `kwargs` are forwarded to
`compiler_config` to parameterize compilation.

The whole lookup runs under `cufunction_lock`, so concurrent callers serialize
here. `NVTX.@mark` annotations delimit the phases for profiling; the bulk of a
cold call is spent in `methodinstance` and `GPUCompiler.cached_compilation`.
"""
function cufunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT}
    cuda = active_state()
    Base.@lock cufunction_lock begin
        # compile the function
        cache = compiler_cache(cuda.context)
        NVTX.@mark "compiler_cache"
        source = methodinstance(F, tt)
        NVTX.@mark "methodinstance"
        config = compiler_config(cuda.device; kwargs...)::CUDACompilerConfig
        NVTX.@mark "compiler_config"
        fun = GPUCompiler.cached_compilation(cache, source, config, compile, link)
        NVTX.@mark "cached_compilation"

        # create a callable object that captures the function instance. we don't
        # need to think about world age here, as GPUCompiler already does and will
        # return a different object
        key = (objectid(source), hash(fun), f)
        NVTX.@mark "hash"
        kernel = get(_kernel_instances, key, nothing)
        NVTX.@mark "get"
        if kernel === nothing
            # create the kernel state object
            state = KernelState(create_exceptions!(fun.mod), UInt32(0))
            NVTX.@mark "kernelstate"
            kernel = HostKernel{F,tt}(f, fun, state)
            NVTX.@mark "hostkernel"
            _kernel_instances[key] = kernel
        end
        NVTX.@mark "after if condition"
        return kernel::HostKernel{F,tt}
    end
end