Here is the complete example, together with the output of versioninfo()
and CUDA.versioninfo().
I observed the same trend as @eldee: the first run is certainly faster when precompiled, but it is still much slower than the second.
TL;DR: There is hope, but there are still many sharp corners.
Great!! I hope the improvements arrive soon.
#] generate Startup
#] develop ./Startup
module Startup
using CUDA
using PrecompileTools
export CUDA
export copy_with_precompile!

"""
    copy_with_precompile!(A, B)

Copy `B` into `A` element by element (by linear index) using a hand-written CUDA
kernel, and return the compiled kernel object.

`A` and `B` must have the same `length`; this is checked with an `@assert`.
When the arrays are empty the kernel is still compiled but not launched, because a
grid of zero blocks is not a valid launch configuration.
"""
function copy_with_precompile!(A, B)
    len = length(A)
    @assert len === length(B)

    # The kernel takes no arguments: it captures `A` and `B` from the enclosing scope.
    function kernel()
        # 1-based linear index of this thread across the whole grid.
        i = threadIdx().x + blockDim().x * (blockIdx().x - 1)
        # The grid is rounded up to whole blocks, so trailing threads may fall
        # past the end of the arrays; those threads do nothing.
        if checkbounds(Bool, A, i) && checkbounds(Bool, B, i)
            A[i] = B[i]
        end
        return nothing
    end

    threads = 512
    blocks = cld(len, threads)

    # Compile first without launching, so that the empty case (`blocks == 0`) can
    # skip the launch instead of failing, while still returning the same kind of
    # object as the non-empty case.
    compiled = @cuda launch=false kernel()
    if blocks > 0
        compiled(; threads=threads, blocks=blocks)
    end
    return compiled
end

@setup_workload begin
    # Only exercise the GPU when one is actually usable; otherwise precompiling
    # this package would fail on machines without a working CUDA setup.
    if CUDA.functional()
        A = CUDA.zeros(100, 100, 100)
        B = CUDA.zeros(100, 100, 100)
        @compile_workload begin
            # Run the copy once during precompilation so the compilation work for
            # these argument types can be cached.
            CUDA.@sync copy_with_precompile!(A, B)
        end
    end
end
end
julia> using Startup
julia> A = CUDA.zeros(100, 100, 100);
julia> B = CUDA.zeros(100, 100, 100);
julia> CUDA.@time CUDA.@sync copy_with_precompile!(A, B)
1.411800 seconds (880.65 k CPU allocations: 46.825 MiB, 8.04% gc time)
CUDA.HostKernel for kernel()
julia> CUDA.@time CUDA.@sync copy_with_precompile!(A, B)
0.000173 seconds (22 CPU allocations: 960 bytes)
CUDA.HostKernel for kernel()
julia> CUDA.@time CUDA.@sync copy_with_precompile!(A, B)
0.000167 seconds (22 CPU allocations: 960 bytes)
CUDA.HostKernel for kernel()
julia> CUDA.@time CUDA.@sync copy_with_precompile!(A, B)
0.000177 seconds (22 CPU allocations: 960 bytes)
CUDA.HostKernel for kernel()
julia> CUDA.@time CUDA.@sync copy_with_precompile!(A, B)
0.000178 seconds (22 CPU allocations: 960 bytes)
CUDA.HostKernel for kernel()
julia> CUDA.@time CUDA.@sync copy_with_precompile!(A, B)
0.000566 seconds (47 CPU allocations: 2.844 KiB)
CUDA.HostKernel for kernel()
julia> CUDA.@time CUDA.@sync copy_with_precompile!(A, B)
0.000436 seconds (45 CPU allocations: 2.750 KiB)
CUDA.HostKernel for kernel()
julia> CUDA.@time CUDA.@sync copy_with_precompile!(A, B)
0.000997 seconds (47 CPU allocations: 2.875 KiB)
CUDA.HostKernel for kernel()
julia> using CUDA
julia> function copy_without_precompile!(A, B)
# Copy `B` into `A` element by element (by linear index) with a hand-written
# CUDA kernel. Defined directly at the REPL, i.e. outside any package.
len = length(A)
# Both arrays must have the same number of elements.
@assert len === length(B)
# The kernel takes no arguments: it captures `A` and `B` from the enclosing scope.
function kernel()
# 1-based linear index of this thread across the whole grid.
i = threadIdx().x + blockDim().x * (blockIdx().x - 1)
# The grid is rounded up to whole blocks, so trailing threads may fall past
# the end of the arrays; those threads do nothing.
if checkbounds(Bool, A, i) && checkbounds(Bool, B, i)
A[i] = B[i]
end
nothing
end
# Fixed block size; the block count is rounded up so every element is covered.
threads = 512
blocks = cld(len, threads)
# The value of the `@cuda` expression is what the function returns.
@cuda threads = threads blocks = blocks kernel()
end
copy_without_precompile! (generic function with 1 method)
julia> A = CUDA.zeros(100, 100, 100);
julia> B = CUDA.zeros(100, 100, 100);
julia> CUDA.@time CUDA.@sync copy_without_precompile!(A, B)
6.742919 seconds (9.52 M CPU allocations: 480.393 MiB, 0.92% gc time)
CUDA.HostKernel for kernel()
julia> CUDA.@time CUDA.@sync copy_without_precompile!(A, B)
0.036825 seconds (62.46 k CPU allocations: 3.523 MiB)
CUDA.HostKernel for kernel()
julia> CUDA.@time CUDA.@sync copy_without_precompile!(A, B)
0.000669 seconds (48 CPU allocations: 3.125 KiB)
CUDA.HostKernel for kernel()
julia> CUDA.@time CUDA.@sync copy_without_precompile!(A, B)
0.001013 seconds (57 CPU allocations: 3.156 KiB)
CUDA.HostKernel for kernel()
julia> CUDA.@time CUDA.@sync copy_without_precompile!(A, B)
0.001010 seconds (47 CPU allocations: 2.875 KiB)
CUDA.HostKernel for kernel()
julia> CUDA.@time CUDA.@sync copy_without_precompile!(A, B)
0.001066 seconds (23 CPU allocations: 976 bytes)
CUDA.HostKernel for kernel()
julia> CUDA.@time CUDA.@sync copy_without_precompile!(A, B)
0.001106 seconds (23 CPU allocations: 976 bytes)
CUDA.HostKernel for kernel()
julia> CUDA.@time CUDA.@sync copy_without_precompile!(A, B)
0.001036 seconds (23 CPU allocations: 976 bytes)
CUDA.HostKernel for kernel()
julia> versioninfo()
Julia Version 1.11.1
Commit 8f5b7ca12ad (2024-10-16 10:53 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 24 × 13th Gen Intel(R) Core(TM) i7-13700K
WORD_SIZE: 64
LLVM: libLLVM-16.0.6 (ORCJIT, alderlake)
Threads: 1 default, 0 interactive, 1 GC (on 24 virtual cores)
Environment:
JULIA_EDITOR = code
JULIA_NUM_THREADS =
julia> CUDA.versioninfo()
CUDA runtime 12.6, artifact installation
CUDA driver 12.7
NVIDIA driver 566.3.0
CUDA libraries:
- CUBLAS: 12.6.3
- CURAND: 10.3.7
- CUFFT: 11.3.0
- CUSOLVER: 11.7.1
- CUSPARSE: 12.5.4
- CUPTI: 2024.3.2 (API 24.0.0)
- NVML: 12.0.0+565.57.2
Julia packages:
- CUDA: 5.5.2
- CUDA_Driver_jll: 0.10.3+0
- CUDA_Runtime_jll: 0.15.3+0
Toolchain:
- Julia: 1.11.1
- LLVM: 16.0.6
1 device:
0: NVIDIA GeForce RTX 4070 Ti (sm_89, 8.262 GiB / 11.994 GiB available)