I am using Julia 1.9, CUDA.jl 4.0.1, NVIDIA GeForce GTX 1650. I could not quite figure out how to use cuDeviceGetProperties
, or I would have added more information.
On using @code_native
I found that Base.fma
was explicitly required to use FMAs. I am not used to reading sass, but I think this line is an FMA in the output of @code_device_sass
: DFMA R6, R6, c[0x0][0x1b8], R8 ;
.
So, on changing the CPU version to explicitly use Base.fma
, the tests pass for me and everything works out. I could have sworn that this was not the case when I tried this earlier today I must have messed something up then.
New tests:
@testset "What is my GPU doing?" begin
N = 1024
u = rand(N)
u_cu = CuVector(u)
@test Vector(u_cu) == u # OK
r = rand(N)
r_cu = CuVector(r)
@test Vector(r_cu) == r # OK
β = 3.0
u .= fma.(u, β, r) # <-------------------------------- changed to fma
u_cu .= u_cu .* β .+ r_cu
@test u == u_cu # BROKEN
end
Here is the entire sass from @code_device_sass
@device_code_sass kernel1!(u_cu, r_cu, β)
// PTX CompilerJob of kernel #broadcast_kernel#26(CUDA.CuKernelContext, CuDeviceVector{Float64, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(+), Tuple{Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, typeof(*), Tuple{Base.Broadcast.Extruded{CuDeviceVector{Float64, 1}, Tuple{Bool}, Tuple{Int64}}, Float64}}, Base.Broadcast.Extruded{CuDeviceVector{Float64, 1}, Tuple{Bool}, Tuple{Int64}}}}, Int64) for sm_75
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM75 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM75)"
.elftype @"ET_EXEC"
//--------------------- .text._Z27julia_broadcast_kernel_293515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_ --------------------------
.section .text._Z27julia_broadcast_kernel_293515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_,"ax",@progbits
.sectioninfo @"SHI_REGISTERS=18"
.align 128
.global _Z27julia_broadcast_kernel_293515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_
.type _Z27julia_broadcast_kernel_293515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_,@function
.size _Z27julia_broadcast_kernel_293515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_,(.L_x_8 - _Z27julia_broadcast_kernel_293515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_)
.other _Z27julia_broadcast_kernel_293515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_,@"STO_CUDA_ENTRY STV_DEFAULT"
_Z27julia_broadcast_kernel_293515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_:
.text._Z27julia_broadcast_kernel_293515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_:
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:54
IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;
IMAD.MOV.U32 R0, RZ, RZ, c[0x0][0x1f8] ;
IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x1fc] ;
; Location ./int.jl:83
ISETP.GE.U32.AND P0, PT, R0, 0x1, PT ;
ISETP.GE.AND.EX P0, PT, R2, RZ, PT, P0 ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:56
@!P0 EXIT ;
LDC.U8 R0, c[0x0][0x1a8] ;
S2R R6, SR_TID.X ;
IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0xc] ;
ULDC.U8 UR5, c[0x0][0x1e0] ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:54
IMAD.MOV.U32 R4, RZ, RZ, c[0x0][0x1f8] ;
S2R R5, SR_CTAID.X ;
ULOP3.LUT UR5, UR5, 0x1, URZ, 0xc0, !UPT ;
IMAD.MOV.U32 R7, RZ, RZ, RZ ;
LOP3.LUT R0, R0, 0x1, RZ, 0xc0, !PT ;
IADD3 R6, R6, 0x1, RZ ;
ISETP.NE.U32.AND P0, PT, R0, 0x1, PT ;
IMAD R0, R2, c[0x0][0x0], RZ ;
IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x1fc] ;
SHF.R.S32.HI R3, RZ, 0x1f, R0 ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:56
@P0 BRA `(.L_x_0) ;
ISETP.NE.AND P0, PT, RZ, UR5, PT ;
@!P0 BRA `(.L_x_1) ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/device/indexing.jl:67
IMAD.WIDE.U32 R6, R5, c[0x0][0x0], R6 ;
IADD3 R9, P1, R6, -0x1, RZ ;
IADD3 R5, P0, P2, R9, 0x1, -R0 ;
IADD3.X R8, R7, -0x1, RZ, P1, !PT ;
; Location ./int.jl:83
IADD3 R5, P1, R0, R5, RZ ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/device/indexing.jl:67
IADD3.X R6, R8, RZ, ~R3, P0, P2 ;
; Location ./int.jl:83
ISETP.GT.U32.AND P0, PT, R5, c[0x0][0x180], PT ;
IMAD.X R6, R3, 0x1, R6, P1 ;
ISETP.GT.AND.EX P0, PT, R6, c[0x0][0x184], PT, P0 ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/device/indexing.jl:67
@P0 EXIT ;
SHF.L.U64.HI R11, R9.reuse, 0x3, R8 ;
IMAD.SHL.U32 R10, R9, 0x8, RZ ;
SHF.L.U64.HI R14, R0, 0x3, R3 ;
; Location ./int.jl:83
IMAD.MOV.U32 R12, RZ, RZ, R6 ;
.L_x_2:
; Location /home/chaitanya/.julia/packages/LLVM/HykgZ/src/interop/base.jl:40
ULDC.64 UR4, c[0x0][0x188] ;
ULDC.64 UR6, c[0x0][0x1c0] ;
LDG.E.64.SYS R6, [R10.64+UR4] ;
LDG.E.64.SYS R8, [R10.64+UR6] ;
IADD3 R4, P0, R4, -0x1, RZ ;
ULDC.64 UR4, c[0x0][0x168] ;
; Location ./int.jl:83
IADD3 R5, P1, R0, R5, RZ ;
IADD3.X R2, R2, -0x1, RZ, P0, !PT ;
ISETP.NE.U32.AND P0, PT, R4, RZ, PT ;
ISETP.NE.AND.EX P0, PT, R2, RZ, PT, P0 ;
; Location ./float.jl:408
DFMA R6, R6, c[0x0][0x1b8], R8 ;
LEA R9, P2, R0, R10, 0x3 ;
; Location ./int.jl:83
IMAD.X R8, R3, 0x1, R12, P1 ;
ISETP.GT.U32.AND P1, PT, R5, c[0x0][0x180], PT ;
IMAD.X R12, R11, 0x1, R14, P2 ;
ISETP.GT.AND.EX P1, PT, R8, c[0x0][0x184], PT, P1 ;
; Location /home/chaitanya/.julia/packages/LLVM/HykgZ/src/interop/base.jl:40
STG.E.64.SYS [R10.64+UR4], R6 ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:56
@!P0 EXIT ;
; Location ./int.jl:83
IMAD.MOV.U32 R11, RZ, RZ, R12 ;
IMAD.MOV.U32 R10, RZ, RZ, R9 ;
IMAD.MOV.U32 R12, RZ, RZ, R8 ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/device/indexing.jl:67
@!P1 BRA `(.L_x_2) ;
EXIT ;
.L_x_1:
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:56
IMAD.WIDE.U32 R6, R5, c[0x0][0x0], R6 ;
ULDC.64 UR6, c[0x0][0x1e8] ;
ULDC.64 UR4, c[0x0][0x1c0] ;
IADD3 R5, P1, R6, -0x1, RZ ;
ULEA UR4, UP0, UR6, UR4, 0x3 ;
IADD3 R9, P0, P2, R5, 0x1, -R0 ;
ULEA.HI.X UR5, UR6, UR5, UR7, 0x3, UP0 ;
IADD3.X R6, R7, -0x1, RZ, P1, !PT ;
; Location ./int.jl:83
IADD3 R9, P1, R0, R9, RZ ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:56
IADD3.X R8, R6, RZ, ~R3, P0, P2 ;
; Location ./int.jl:83
ISETP.GT.U32.AND P0, PT, R9, c[0x0][0x180], PT ;
IMAD.X R12, R3, 0x1, R8, P1 ;
ISETP.GT.AND.EX P0, PT, R12, c[0x0][0x184], PT, P0 ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/device/indexing.jl:67
@P0 EXIT ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:56
SHF.L.U64.HI R11, R5.reuse, 0x3, R6 ;
IMAD.SHL.U32 R10, R5, 0x8, RZ ;
SHF.L.U64.HI R14, R0, 0x3, R3 ;
; Location ./int.jl:83
IMAD.MOV.U32 R5, RZ, RZ, R9 ;
.L_x_3:
; Location /home/chaitanya/.julia/packages/LLVM/HykgZ/src/interop/base.jl:40
ULDC.64 UR6, c[0x0][0x188] ;
LDG.E.64.SYS R8, [UR4+-0x8] ;
LDG.E.64.SYS R6, [R10.64+UR6] ;
IADD3 R4, P0, R4, -0x1, RZ ;
IADD3.X R2, R2, -0x1, RZ, P0, !PT ;
; Location ./int.jl:83
ISETP.NE.U32.AND P0, PT, R4, RZ, PT ;
ISETP.NE.AND.EX P0, PT, R2, RZ, PT, P0 ;
IADD3 R5, P1, R0.reuse, R5, RZ ;
; Location /home/chaitanya/.julia/packages/LLVM/HykgZ/src/interop/base.jl:40
ULDC.64 UR6, c[0x0][0x168] ;
; Location ./float.jl:408
DFMA R6, R6, c[0x0][0x1b8], R8 ;
LEA R9, P2, R0, R10, 0x3 ;
; Location ./int.jl:83
IMAD.X R8, R3, 0x1, R12, P1 ;
ISETP.GT.U32.AND P1, PT, R5, c[0x0][0x180], PT ;
; Location /home/chaitanya/.julia/packages/LLVM/HykgZ/src/interop/base.jl:40
STG.E.64.SYS [R10.64+UR6], R6 ;
IMAD.X R12, R11, 0x1, R14, P2 ;
; Location ./int.jl:83
ISETP.GT.AND.EX P1, PT, R8, c[0x0][0x184], PT, P1 ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:56
@!P0 EXIT ;
; Location ./int.jl:83
IMAD.MOV.U32 R11, RZ, RZ, R12 ;
IMAD.MOV.U32 R10, RZ, RZ, R9 ;
IMAD.MOV.U32 R12, RZ, RZ, R8 ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/device/indexing.jl:67
@!P1 BRA `(.L_x_3) ;
EXIT ;
.L_x_0:
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:56
ISETP.NE.AND P0, PT, RZ, UR5, PT ;
ULDC.64 UR8, c[0x0][0x1b0] ;
ULDC.64 UR6, c[0x0][0x188] ;
ULEA UR4, UP0, UR8, UR6, 0x3 ;
ULEA.HI.X UR5, UR8, UR7, UR9, 0x3, UP0 ;
@!P0 BRA `(.L_x_4) ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/device/indexing.jl:67
IMAD.WIDE.U32 R6, R5, c[0x0][0x0], R6 ;
IADD3 R5, P1, R6, -0x1, RZ ;
IADD3 R9, P0, P2, R5, 0x1, -R0 ;
IADD3.X R6, R7, -0x1, RZ, P1, !PT ;
; Location ./int.jl:83
IADD3 R9, P1, R0, R9, RZ ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/device/indexing.jl:67
IADD3.X R8, R6, RZ, ~R3, P0, P2 ;
; Location ./int.jl:83
ISETP.GT.U32.AND P0, PT, R9, c[0x0][0x180], PT ;
IMAD.X R12, R3, 0x1, R8, P1 ;
ISETP.GT.AND.EX P0, PT, R12, c[0x0][0x184], PT, P0 ;
@P0 EXIT ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/device/indexing.jl:67
SHF.L.U64.HI R11, R5.reuse, 0x3, R6 ;
IMAD.SHL.U32 R10, R5, 0x8, RZ ;
SHF.L.U64.HI R14, R0, 0x3, R3 ;
; Location ./int.jl:83
IMAD.MOV.U32 R5, RZ, RZ, R9 ;
.L_x_5:
; Location /home/chaitanya/.julia/packages/LLVM/HykgZ/src/interop/base.jl:40
ULDC.64 UR6, c[0x0][0x1c0] ;
LDG.E.64.SYS R6, [UR4+-0x8] ;
LDG.E.64.SYS R8, [R10.64+UR6] ;
IADD3 R4, P0, R4, -0x1, RZ ;
ULDC.64 UR6, c[0x0][0x168] ;
; Location ./int.jl:83
IADD3 R5, P1, R0, R5, RZ ;
IADD3.X R2, R2, -0x1, RZ, P0, !PT ;
ISETP.NE.U32.AND P0, PT, R4, RZ, PT ;
ISETP.NE.AND.EX P0, PT, R2, RZ, PT, P0 ;
; Location ./float.jl:408
DFMA R6, R6, c[0x0][0x1b8], R8 ;
LEA R9, P2, R0, R10, 0x3 ;
; Location ./int.jl:83
IMAD.X R8, R3, 0x1, R12, P1 ;
ISETP.GT.U32.AND P1, PT, R5, c[0x0][0x180], PT ;
IMAD.X R12, R11, 0x1, R14, P2 ;
ISETP.GT.AND.EX P1, PT, R8, c[0x0][0x184], PT, P1 ;
; Location /home/chaitanya/.julia/packages/LLVM/HykgZ/src/interop/base.jl:40
STG.E.64.SYS [R10.64+UR6], R6 ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:56
@!P0 EXIT ;
; Location ./int.jl:83
IMAD.MOV.U32 R11, RZ, RZ, R12 ;
IMAD.MOV.U32 R10, RZ, RZ, R9 ;
IMAD.MOV.U32 R12, RZ, RZ, R8 ;
@!P1 BRA `(.L_x_5) ;
EXIT ;
.L_x_4:
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:56
IMAD.WIDE.U32 R6, R5, c[0x0][0x0], R6 ;
ULDC.64 UR6, c[0x0][0x1e8] ;
ULDC.64 UR8, c[0x0][0x1c0] ;
IADD3 R11, P2, R6, -0x1, RZ ;
ULEA UR8, UP0, UR6, UR8, 0x3 ;
IADD3 R5, P0, P1, R11, 0x1, -R0 ;
ULEA.HI.X UR7, UR6, UR9, UR7, 0x3, UP0 ;
IADD3.X R6, R7, -0x1, RZ, P2, !PT ;
; Location ./int.jl:83
IADD3 R5, P2, R0, R5, RZ ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:56
IADD3.X R8, R6, RZ, ~R3, P0, P1 ;
; Location ./int.jl:83
ISETP.GT.U32.AND P0, PT, R5, c[0x0][0x180], PT ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:56
LEA R13, P1, R11, c[0x0][0x168], 0x3 ;
; Location ./int.jl:83
IMAD.X R12, R3, 0x1, R8, P2 ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:56
LEA.HI.X R11, R11, c[0x0][0x16c], R6, 0x3, P1 ;
; Location ./int.jl:83
ISETP.GT.AND.EX P0, PT, R12, c[0x0][0x184], PT, P0 ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/device/indexing.jl:67
@P0 EXIT ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:56
SHF.L.U64.HI R14, R0, 0x3, R3 ;
.L_x_6:
; Location /home/chaitanya/.julia/packages/LLVM/HykgZ/src/interop/base.jl:40
UMOV UR6, UR8 ;
LDG.E.64.SYS R6, [UR4+-0x8] ;
LDG.E.64.SYS R8, [UR6+-0x8] ;
; Location ./int.jl:83
IADD3 R15, P1, R4, -0x1, RZ ;
; Location /home/chaitanya/.julia/packages/LLVM/HykgZ/src/interop/base.jl:40
IMAD.MOV.U32 R4, RZ, RZ, R13 ;
; Location ./int.jl:83
LEA R10, P2, R0, R13, 0x3 ;
IADD3.X R2, R2, -0x1, RZ, P1, !PT ;
ISETP.NE.U32.AND P1, PT, R15, RZ, PT ;
ISETP.NE.AND.EX P1, PT, R2, RZ, PT, P1 ;
; Location ./float.jl:408
DFMA R6, R6, c[0x0][0x1b8], R8 ;
; Location ./int.jl:83
IADD3 R8, P0, R0, R5, RZ ;
; Location /home/chaitanya/.julia/packages/LLVM/HykgZ/src/interop/base.jl:40
IMAD.MOV.U32 R5, RZ, RZ, R11 ;
; Location ./int.jl:83
IMAD.X R11, R11, 0x1, R14, P2 ;
IMAD.X R9, R3, 0x1, R12, P0 ;
ISETP.GT.U32.AND P0, PT, R8, c[0x0][0x180], PT ;
; Location /home/chaitanya/.julia/packages/LLVM/HykgZ/src/interop/base.jl:40
STG.E.64.SYS [R4], R6 ;
; Location ./int.jl:83
ISETP.GT.AND.EX P0, PT, R9, c[0x0][0x184], PT, P0 ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:56
@!P1 EXIT ;
; Location ./int.jl:83
IMAD.MOV.U32 R4, RZ, RZ, R15 ;
IMAD.MOV.U32 R13, RZ, RZ, R10 ;
IMAD.MOV.U32 R5, RZ, RZ, R8 ;
IMAD.MOV.U32 R12, RZ, RZ, R9 ;
; Location /home/chaitanya/.julia/packages/GPUArrays/t0LfC/src/device/indexing.jl:67
@!P0 BRA `(.L_x_6) ;
EXIT ;
.L_x_7:
BRA `(.L_x_7);
.L_x_8: