CUDA.jl tex2DLayered support?

A good way to tackle this is to write a minimal CUDA C implementation and compile it with Clang to see exactly which intrinsics it lowers to. Often there aren't any intrinsics at all; instead, the CUDA headers implement the functionality with inline PTX assembly. For example:

$ cat test.cu                                                                                                                                                                                                              
#include <cstdio>

__global__ void cuda_hello(){
    printf("Hello World from thread %d!\n", threadIdx.x);
}

int main() {
    cuda_hello<<<1,1>>>();
    cudaDeviceSynchronize();  // wait for the kernel so the printf output actually appears
    return 0;
}


$ clang++-11 --cuda-path=/opt/cuda --cuda-gpu-arch=sm_70 --cuda-device-only -emit-llvm -O0 -S test.cu -o test.ll                                                                             

$ cat test.ll                                                                                                                                                                                                              
; Function Attrs: convergent noinline norecurse nounwind optnone
define dso_local void @_Z10cuda_hellov() #1 {
  %1 = alloca %printf_args, align 8
  %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3, !range !6
  %3 = getelementptr inbounds %printf_args, %printf_args* %1, i32 0, i32 0
  store i32 %2, i32* %3, align 4
  %4 = bitcast %printf_args* %1 to i8*
  %5 = call i32 @vprintf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str, i64 0, i64 0), i8* %4)
  ret void
}
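
To get at tex2DLayered specifically, the same trick can be applied to a kernel that actually performs a layered fetch. The sketch below uses the texture-object API; the file name, kernel name, and indexing are just illustrative, and only device-only compilation is intended:

$ cat tex_layered.cu
#include <cuda_runtime.h>

// Minimal kernel exercising tex2DLayered so the generated IR/PTX can be inspected.
__global__ void fetch_layered(cudaTextureObject_t tex, float *out, int layer) {
    // One texel per thread, sampled from the requested layer.
    float x = threadIdx.x + 0.5f;
    float y = threadIdx.y + 0.5f;
    out[threadIdx.y * blockDim.x + threadIdx.x] = tex2DLayered<float>(tex, x, y, layer);
}

Compiling this with the same clang++-11 --cuda-device-only invocation as above should reveal whether the fetch ends up as an llvm.nvvm.tex.* intrinsic or as inline PTX (a tex.a2d.* instruction), which is the kind of code a CUDA.jl implementation would need to generate.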