A good way to tackle this is to write a minimal CUDA C++ program and compile it with Clang, emitting LLVM IR to reveal exactly which intrinsics are being called. Often there aren’t any intrinsics at all — instead, the CUDA headers contain inline PTX assembly. For example:
$ cat test.cu
// Minimal device kernel: prints this thread's x-index via device-side printf.
// Kept deliberately tiny so the IR it compiles to stays readable.
__global__ void cuda_hello(){
printf("Hello World from thread %d!\n", threadIdx.x);
}
// Host entry point: launches the kernel with one block of one thread.
// (No sync/error checking needed here — with --cuda-device-only, only the
// device half of this translation unit is compiled to IR anyway.)
int main() {
cuda_hello<<<1,1>>>();
return 0;
}
$ clang++-11 --cuda-path=/opt/cuda --cuda-gpu-arch=sm_70 --cuda-device-only -emit-llvm -O0 -S test.cu -o test.ll
$ cat test.ll
; Function Attrs: convergent noinline norecurse nounwind optnone
; Device IR for cuda_hello() — name is Itanium-mangled to _Z10cuda_hellov.
define dso_local void @_Z10cuda_hellov() #1 {
; Stack slot for the packed printf-argument struct (%printf_args is
; declared earlier in the full .ll output, not shown in this excerpt).
%1 = alloca %printf_args, align 8
; threadIdx.x lowers to an NVVM intrinsic here, not inline PTX.
%2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3, !range !6
; Pack the thread index as the first (and only) printf argument.
%3 = getelementptr inbounds %printf_args, %printf_args* %1, i32 0, i32 0
store i32 %2, i32* %3, align 4
%4 = bitcast %printf_args* %1 to i8*
; Device-side printf is lowered to vprintf(format, argument-buffer);
; @.str holds the 29-byte format string, also defined outside this excerpt.
%5 = call i32 @vprintf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str, i64 0, i64 0), i8* %4)
ret void
}