I don’t have enough experience with PTX (or LLVM) to properly understand the generated code, but the inbounds kernel is much shorter than the noinbounds kernel (less than half the length). I’m pretty sure the latter does not get unrolled (note the `.pragma "nounroll";` directives), but I don’t know what’s happening in the former.
inbounds PTX
// PTX CompilerJob of MethodInstance for inbounds_kernel!(::CuDeviceArray{Float32, 3, 1}, ::CuDeviceArray{Float32, 3, 1}, ::CuDeviceArray{Float32, 3, 1}, ::CuDeviceVector{Int32, 1}) for sm_86
//
// Generated by LLVM NVPTX Back-End
//
.version 8.5
.target sm_86
.address_size 64
// .globl _Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE // -- Begin function _Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE
// @_Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE
// Kernel entry point (variant without bounds checks).
// Per thread: start at the 1-based index %ctaid.x * %ntid.x + %tid.x + 1 and,
// while that index is <= the 64-bit value at param_4+24, load one 32-bit
// element from the buffer at param_4+0, use it to select a slab in the three
// buffers at param_1/param_2/param_3, and run a middle loop (param_1+16
// iterations) around an inner loop (param_1+24 iterations) over that slab.
// The index then advances by %ntid.x * %nctaid.x.
// Inner-loop update, with X = param_1 buffer, Y = param_2 buffer, Z = param_3
// buffer and t computed in f64:
//   t = 0.1 * Y[i] + 0.9 * Z[i];  Z[i] = (f32) t;  X[i] = (f32)(X[i] - t)
// param_0 is declared but never read in this kernel; no error path exists.
// NOTE(review): the 48/32-byte parameter blobs presumably hold CuDeviceArray
// structs (data pointer at +0, dimensions from +16) — confirm against CUDA.jl.
.visible .entry _Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE(
.param .align 8 .b8 _Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_0[16],
.param .align 8 .b8 _Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_1[48],
.param .align 8 .b8 _Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_2[48],
.param .align 8 .b8 _Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_3[48],
.param .align 8 .b8 _Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_4[32]
)
{
.reg .pred %p<7>;
.reg .b32 %r<12>;
.reg .f32 %f<6>;
.reg .b64 %rd<80>;
.reg .f64 %fd<7>;
// %bb.0: // %conversion
// %rd58 = upper bound for the per-thread index (64-bit field at param_4+24).
ld.param.u64 %rd58, [_Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_4+24];
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
mov.u32 %r1, %ntid.x;
// %r11 = %ctaid.x * %ntid.x + %tid.x + 1: this thread's starting 1-based index.
mad.lo.s32 %r8, %r7, %r1, %r6;
add.s32 %r11, %r8, 1;
cvt.s64.s32 %rd59, %r11;
// Return right away when the starting index already exceeds the bound.
setp.lt.s64 %p1, %rd58, %rd59;
@%p1 bra $L__BB0_9;
bra.uni $L__BB0_1;
$L__BB0_9: // %L314
ret;
$L__BB0_1: // %L44.lr.ph
// Loop-invariant setup. Loaded fields:
//   %rd55 = param_4+0  (base address of the 32-bit index buffer)
//   %rd49/%rd43/%rd37 = param_3+0 / param_2+0 / param_1+0 (base addresses)
//   %rd51,%rd52 = param_3+16,+24; %rd45,%rd46 = param_2+16,+24;
//   %rd39,%rd2  = param_1+16,+24
ld.param.u64 %rd55, [_Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_4];
ld.param.u64 %rd52, [_Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_3+24];
ld.param.u64 %rd51, [_Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_3+16];
ld.param.u64 %rd49, [_Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_3];
ld.param.u64 %rd46, [_Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_2+24];
ld.param.u64 %rd45, [_Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_2+16];
ld.param.u64 %rd43, [_Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_2];
ld.param.u64 %rd2, [_Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_1+24];
ld.param.u64 %rd39, [_Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_1+16];
ld.param.u64 %rd37, [_Z16inbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_1];
// %r3 = %ntid.x * %nctaid.x: per-iteration step of the outer index.
mov.u32 %r9, %nctaid.x;
mul.lo.s32 %r3, %r1, %r9;
// Byte strides (element size 4 => shift left by 2), one pair per buffer:
//   %rd12/%rd14/%rd16 = slab stride  ((+24 field) * (+16 field) * 4)
//   %rd13/%rd15/%rd17 = row stride   ((+16 field) * 4)
// for param_1 / param_3 / param_2 respectively.
mul.lo.s64 %rd60, %rd2, %rd39;
shl.b64 %rd12, %rd60, 2;
shl.b64 %rd13, %rd39, 2;
mul.lo.s64 %rd61, %rd52, %rd51;
shl.b64 %rd14, %rd61, 2;
shl.b64 %rd15, %rd51, 2;
mul.lo.s64 %rd62, %rd46, %rd45;
shl.b64 %rd16, %rd62, 2;
shl.b64 %rd17, %rd45, 2;
// Empty-range guards, evaluated once: %p2 skips the middle loop, %p3 the inner
// loop. Both trip counts come from param_1 only.
setp.lt.s64 %p2, %rd39, 1;
setp.lt.s64 %p3, %rd2, 1;
bra.uni $L__BB0_2;
$L__BB0_8: // %L299
// in Loop: Header=BB0_2 Depth=1
// Outer-loop latch: advance the index by %r3 and continue while it is still
// <= %rd58.
add.s32 %r11, %r11, %r3;
cvt.s64.s32 %rd71, %r11;
setp.ge.s64 %p6, %rd58, %rd71;
@%p6 bra $L__BB0_2;
bra.uni $L__BB0_9;
$L__BB0_2: // %L44
// =>This Loop Header: Depth=1
// Child Loop BB0_4 Depth 2
// Child Loop BB0_6 Depth 3
@%p2 bra $L__BB0_8;
// %bb.3: // %L62.preheader
// in Loop: Header=BB0_2 Depth=1
// Load the sign-extended 32-bit value at element (%r11 - 1) of the index
// buffer; %rd67 = that value - 1 is the zero-based slab number.
add.s32 %r10, %r11, -1;
mul.wide.s32 %rd64, %r10, 4;
add.s64 %rd65, %rd55, %rd64;
ld.global.s32 %rd66, [%rd65];
add.s64 %rd67, %rd66, -1;
// Slab start addresses: %rd74 (param_1), %rd73 (param_3), %rd72 (param_2).
mul.lo.s64 %rd68, %rd12, %rd67;
add.s64 %rd74, %rd37, %rd68;
mul.lo.s64 %rd69, %rd14, %rd67;
add.s64 %rd73, %rd49, %rd69;
mul.lo.s64 %rd70, %rd16, %rd67;
add.s64 %rd72, %rd43, %rd70;
// %rd75 = middle-loop counter, starting at 1.
mov.u64 %rd75, 1;
bra.uni $L__BB0_4;
$L__BB0_7: // %L288
// in Loop: Header=BB0_4 Depth=2
// Middle-loop latch: step the three row pointers by their row strides and
// repeat until the counter has reached %rd39.
add.s64 %rd33, %rd75, 1;
add.s64 %rd74, %rd74, %rd13;
add.s64 %rd73, %rd73, %rd15;
add.s64 %rd72, %rd72, %rd17;
setp.ne.s64 %p5, %rd75, %rd39;
mov.u64 %rd75, %rd33;
@%p5 bra $L__BB0_4;
bra.uni $L__BB0_8;
$L__BB0_4: // %L62
// Parent Loop BB0_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB0_6 Depth 3
@%p3 bra $L__BB0_7;
// %bb.5: // %L76.preheader
// in Loop: Header=BB0_4 Depth=2
// Inner-loop state: %rd76 counts down from %rd2; %rd77/%rd78/%rd79 walk the
// current row of the param_2 / param_3 / param_1 buffers.
mov.u64 %rd76, %rd2;
mov.u64 %rd77, %rd72;
mov.u64 %rd78, %rd73;
mov.u64 %rd79, %rd74;
$L__BB0_6: // %L76
// Parent Loop BB0_2 Depth=1
// Parent Loop BB0_4 Depth=2
// => This Inner Loop Header: Depth=3
// One element per iteration; the loop is not unrolled in this PTX.
// 0d3FECCCCCCCCCCCCD and 0d3FB999999999999A are the f64 encodings of 0.9 and 0.1.
// NOTE(review): the f32 data is widened to f64 for the arithmetic, presumably
// because the Julia source uses Float64 literals — confirm; Float32 literals
// would avoid the conversions.
ld.global.f32 %f1, [%rd77];
ld.global.f32 %f2, [%rd78];
cvt.f64.f32 %fd1, %f2;
mul.f64 %fd2, %fd1, 0d3FECCCCCCCCCCCCD;
cvt.f64.f32 %fd3, %f1;
fma.rn.f64 %fd4, %fd3, 0d3FB999999999999A, %fd2;
cvt.rn.f32.f64 %f3, %fd4;
st.global.f32 [%rd78], %f3;
// The subtraction uses the unrounded f64 value %fd4, not the stored f32 %f3.
ld.global.f32 %f4, [%rd79];
cvt.f64.f32 %fd5, %f4;
sub.f64 %fd6, %fd5, %fd4;
cvt.rn.f32.f64 %f5, %fd6;
st.global.f32 [%rd79], %f5;
// Advance all three pointers by one 4-byte element and count down.
add.s64 %rd79, %rd79, 4;
add.s64 %rd78, %rd78, 4;
add.s64 %rd77, %rd77, 4;
add.s64 %rd76, %rd76, -1;
setp.ne.s64 %p4, %rd76, 0;
@%p4 bra $L__BB0_6;
bra.uni $L__BB0_7;
// -- End function
}
noinbounds PTX
// PTX CompilerJob of MethodInstance for noinbounds_kernel!(::CuDeviceArray{Float32, 3, 1}, ::CuDeviceArray{Float32, 3, 1}, ::CuDeviceArray{Float32, 3, 1}, ::CuDeviceVector{Int32, 1}) for sm_86
//
// Generated by LLVM NVPTX Back-End
//
.version 8.5
.target sm_86
.address_size 64
// .globl _Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE // -- Begin function _Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE
// External device function invoked on every failed bounds check in the kernel
// below. It takes one 16-byte, 8-byte-aligned by-value parameter and declares
// no return value; each call site is followed by trap/exit.
.extern .func julia_throw_boundserror_20419
(
.param .align 8 .b8 julia_throw_boundserror_20419_param_0[16]
)
;
// @_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE
// Kernel entry point (bounds-checked variant). Per-thread control flow:
//   - start at the 1-based index %ctaid.x * %ntid.x + %tid.x + 1 and step by
//     %ntid.x * %nctaid.x while the index is <= the value at param_4+24;
//   - check the index against param_4+16 before loading a 32-bit element from
//     the buffer at param_4+0;
//   - run a middle loop (param_1+16 iterations) around an inner loop
//     (param_1+24 iterations) that is emitted as three copies: a checked
//     "preloop", an unchecked main loop and a checked "postloop".
// Inner-loop update, with X = param_1 buffer, Y = param_2 buffer, Z = param_3
// buffer and t computed in f64:
//   t = 0.1 * Y[i] + 0.9 * Z[i];  Z[i] = (f32) t;  X[i] = (f32)(X[i] - t)
// Any failed check calls julia_throw_boundserror_20419 with the 16 bytes of
// param_0 and then executes trap/exit.
// NOTE(review): the preloop / main / postloop block names look like the output
// of LLVM's inductive range check elimination — confirm against the pass
// pipeline in use. Only the pre/post copies carry .pragma "nounroll".
// NOTE(review): the 48/32-byte parameter blobs presumably hold CuDeviceArray
// structs (pointer at +0, dimensions from +16, length last) — confirm.
.visible .entry _Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE(
.param .align 8 .b8 _Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_0[16],
.param .align 8 .b8 _Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_1[48],
.param .align 8 .b8 _Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_2[48],
.param .align 8 .b8 _Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_3[48],
.param .align 8 .b8 _Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_4[32]
)
{
.reg .pred %p<22>;
.reg .b32 %r<11>;
.reg .f32 %f<16>;
.reg .b64 %rd<172>;
.reg .f64 %fd<19>;
// %bb.0: // %conversion
// %rd94 = upper bound for the per-thread index (64-bit field at param_4+24).
ld.param.u64 %rd94, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_4+24];
mov.u32 %r5, %tid.x;
mov.u32 %r6, %ctaid.x;
mov.u32 %r1, %ntid.x;
// %rd152 = sign-extended (%ctaid.x * %ntid.x + %tid.x + 1): starting 1-based index.
mad.lo.s32 %r7, %r6, %r1, %r5;
add.s32 %r8, %r7, 1;
cvt.s64.s32 %rd152, %r8;
// Return right away when the starting index already exceeds the bound.
setp.lt.s64 %p1, %rd94, %rd152;
@%p1 bra $L__BB0_21;
bra.uni $L__BB0_1;
$L__BB0_21: // %L314
ret;
$L__BB0_1: // %L31.lr.ph
// Loop-invariant setup. Compared with the unchecked kernel this additionally
// loads param_4+16 (%rd93, limit for the index-buffer check), the +40 field of
// each of param_1/2/3 (%rd78/%rd84/%rd90, limits for the element-offset
// checks) and both words of param_0 (%rd72, %r4, forwarded to the error call).
ld.param.u64 %rd93, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_4+16];
ld.param.u64 %rd91, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_4];
ld.param.u64 %rd90, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_3+40];
ld.param.u64 %rd88, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_3+24];
ld.param.u64 %rd87, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_3+16];
ld.param.u64 %rd85, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_3];
ld.param.u64 %rd84, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_2+40];
ld.param.u64 %rd82, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_2+24];
ld.param.u64 %rd81, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_2+16];
ld.param.u64 %rd79, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_2];
ld.param.u64 %rd78, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_1+40];
ld.param.u64 %rd76, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_1+24];
ld.param.u64 %rd75, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_1+16];
ld.param.u64 %rd73, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_1];
ld.param.u32 %r4, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_0+8];
ld.param.u64 %rd72, [_Z18noinbounds_kernel_13CuDeviceArrayI7Float32Lx3ELx1EES_IS0_Lx3ELx1EES_IS0_Lx3ELx1EES_I5Int32Lx1ELx1EE_param_0];
// Offset limits clamped to >= 0 so they can serve as unsigned upper bounds:
// %rd14 (param_2), %rd15 (param_3), %rd16 (param_1).
max.s64 %rd14, %rd84, 0;
max.s64 %rd15, %rd90, 0;
max.s64 %rd16, %rd78, 0;
// %r2 = %ntid.x * %nctaid.x: per-iteration step of the outer index.
mov.u32 %r9, %nctaid.x;
mul.lo.s32 %r2, %r1, %r9;
// Byte strides (element size 4 => shift left by 2), one pair per buffer:
//   %rd17/%rd19/%rd21 = slab stride  ((+24 field) * (+16 field) * 4)
//   %rd18/%rd20/%rd22 = row stride   ((+16 field) * 4)
// for param_1 / param_3 / param_2 respectively.
mul.lo.s64 %rd95, %rd76, %rd75;
shl.b64 %rd17, %rd95, 2;
shl.b64 %rd18, %rd75, 2;
mul.lo.s64 %rd96, %rd88, %rd87;
shl.b64 %rd19, %rd96, 2;
shl.b64 %rd20, %rd87, 2;
mul.lo.s64 %rd97, %rd82, %rd81;
shl.b64 %rd21, %rd97, 2;
shl.b64 %rd22, %rd81, 2;
// %rd23 = inner trip count + 1: exit value of the 1-based postloop counter.
add.s64 %rd23, %rd76, 1;
// Empty-range guards, evaluated once: %p5 skips the middle loop, %p6 the inner
// loops.
setp.lt.s64 %p5, %rd75, 1;
setp.lt.s64 %p6, %rd76, 1;
bra.uni $L__BB0_2;
$L__BB0_20: // %L299
// in Loop: Header=BB0_2 Depth=1
// Outer-loop latch: advance the index by %r2 and continue while it is still
// <= %rd94.
add.s32 %r10, %r2, %r3;
cvt.s64.s32 %rd152, %r10;
setp.ge.s64 %p21, %rd94, %rd152;
@%p21 bra $L__BB0_2;
bra.uni $L__BB0_21;
$L__BB0_2: // %L31
// =>This Loop Header: Depth=1
// Child Loop BB0_6 Depth 2
// Child Loop BB0_9 Depth 3
// Child Loop BB0_28 Depth 3
// Child Loop BB0_15 Depth 3
// Bounds check for the index-buffer access: the index must be > 0 and
// <= %rd93 (param_4+16); otherwise branch to the error block BB0_3.
cvt.u32.u64 %r3, %rd152;
setp.gt.s32 %p2, %r3, 0;
setp.ge.s64 %p3, %rd93, %rd152;
and.pred %p4, %p2, %p3;
@%p4 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4: // %L44
// in Loop: Header=BB0_2 Depth=1
@%p5 bra $L__BB0_20;
// %bb.5: // %L62.preheader
// in Loop: Header=BB0_2 Depth=1
// Load the sign-extended 32-bit value at element (index - 1) of the index
// buffer; %rd105 = that value - 1 is the zero-based slab number.
add.s64 %rd100, %rd152, -1;
and.b64 %rd101, %rd100, 4294967295;
shl.b64 %rd102, %rd101, 2;
add.s64 %rd103, %rd91, %rd102;
ld.global.s32 %rd104, [%rd103];
add.s64 %rd105, %rd104, -1;
// Slab start as an ELEMENT offset (used only by the bounds checks):
// %rd25 (param_3), %rd26 (param_2), %rd27 (param_1).
mul.lo.s64 %rd106, %rd105, %rd82;
mul.lo.s64 %rd107, %rd105, %rd88;
mul.lo.s64 %rd108, %rd105, %rd76;
mul.lo.s64 %rd25, %rd107, %rd87;
mul.lo.s64 %rd26, %rd106, %rd81;
mul.lo.s64 %rd27, %rd108, %rd75;
// Slab start as a BYTE address (used by the loads/stores):
// %rd36 (param_1), %rd34 (param_3), %rd32 (param_2).
mul.lo.s64 %rd109, %rd17, %rd105;
add.s64 %rd36, %rd73, %rd109;
mul.lo.s64 %rd110, %rd19, %rd105;
add.s64 %rd34, %rd85, %rd110;
mul.lo.s64 %rd111, %rd21, %rd105;
add.s64 %rd32, %rd79, %rd111;
// Middle-loop state: %rd160 = 1-based counter, %rd37 = 0-based counter,
// %rd31/%rd33/%rd35 = running row-start element offsets (param_2/3/1).
mov.u64 %rd160, 1;
mov.u64 %rd37, 0;
mov.u64 %rd31, %rd26;
mov.u64 %rd33, %rd25;
mov.u64 %rd35, %rd27;
bra.uni $L__BB0_6;
$L__BB0_19: // %L288
// in Loop: Header=BB0_6 Depth=2
// Middle-loop latch: bump both counters, step the byte addresses by the row
// byte strides and the element offsets by the row lengths, and repeat until
// the 1-based counter has reached %rd75.
add.s64 %rd160, %rd38, 1;
add.s64 %rd37, %rd37, 1;
add.s64 %rd36, %rd36, %rd18;
add.s64 %rd35, %rd35, %rd75;
add.s64 %rd34, %rd34, %rd20;
add.s64 %rd33, %rd33, %rd87;
add.s64 %rd32, %rd32, %rd22;
add.s64 %rd31, %rd31, %rd81;
setp.ne.s64 %p20, %rd38, %rd75;
@%p20 bra $L__BB0_6;
bra.uni $L__BB0_20;
$L__BB0_6: // %L62
// Parent Loop BB0_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB0_9 Depth 3
// Child Loop BB0_28 Depth 3
// Child Loop BB0_15 Depth 3
mov.u64 %rd38, %rd160;
@%p6 bra $L__BB0_19;
// %bb.7: // %L76.preheader
// in Loop: Header=BB0_6 Depth=2
// Preloop trip count. With off3/off2/off1 = row-start element offsets
// (%rd113/%rd117/%rd122 for param_3/param_2/param_1):
//   %rd39 = umin(%rd76, umax(-min(off3,0), -min(off2,0), -min(off1,0)))
// which is zero when none of the three row-start offsets is negative.
mul.lo.s64 %rd112, %rd87, %rd37;
add.s64 %rd113, %rd25, %rd112;
min.s64 %rd114, %rd113, 0;
neg.s64 %rd115, %rd114;
mul.lo.s64 %rd116, %rd81, %rd37;
add.s64 %rd117, %rd26, %rd116;
min.s64 %rd118, %rd117, 0;
neg.s64 %rd119, %rd118;
max.u64 %rd120, %rd115, %rd119;
mul.lo.s64 %rd121, %rd75, %rd37;
add.s64 %rd122, %rd27, %rd121;
min.s64 %rd123, %rd122, 0;
neg.s64 %rd124, %rd123;
max.u64 %rd125, %rd120, %rd124;
min.u64 %rd39, %rd76, %rd125;
// Skip the preloop entirely when its trip count is zero.
// %rd169 = 0-based element counter, %rd163 = 1-based element counter.
setp.eq.s64 %p7, %rd39, 0;
mov.u64 %rd169, 0;
mov.u64 %rd163, 1;
@%p7 bra $L__BB0_26;
// %bb.8: // %L76.preloop.preheader
// in Loop: Header=BB0_6 Depth=2
mov.u64 %rd169, 0;
mov.u64 %rd164, %rd32;
mov.u64 %rd165, %rd34;
mov.u64 %rd166, %rd36;
$L__BB0_9: // %L76.preloop
// Parent Loop BB0_2 Depth=1
// Parent Loop BB0_6 Depth=2
// => This Inner Loop Header: Depth=3
// Checked copy of the inner loop: every access is preceded by an unsigned
// offset-vs-limit comparison that branches to an error block on failure.
.pragma "nounroll";
// param_2 offset check -> BB0_10.
add.s64 %rd137, %rd31, %rd169;
setp.ge.u64 %p8, %rd137, %rd14;
@%p8 bra $L__BB0_10;
// %bb.22: // %L109.preloop
// in Loop: Header=BB0_9 Depth=3
// param_3 offset check -> BB0_11.
add.s64 %rd138, %rd33, %rd169;
setp.ge.u64 %p9, %rd138, %rd15;
@%p9 bra $L__BB0_11;
// %bb.23: // %L147.preloop
// in Loop: Header=BB0_9 Depth=3
// 0d3FECCCCCCCCCCCCD and 0d3FB999999999999A are the f64 encodings of 0.9 and 0.1.
ld.global.f32 %f1, [%rd164];
ld.global.f32 %f2, [%rd165];
cvt.f64.f32 %fd3, %f2;
mul.f64 %fd4, %fd3, 0d3FECCCCCCCCCCCCD;
cvt.f64.f32 %fd5, %f1;
fma.rn.f64 %fd1, %fd5, 0d3FB999999999999A, %fd4;
cvt.rn.f32.f64 %f3, %fd1;
st.global.f32 [%rd165], %f3;
// param_1 offset check -> BB0_12. The param_3 store above has already been
// performed when this check fails.
add.s64 %rd139, %rd35, %rd169;
setp.ge.u64 %p10, %rd139, %rd16;
@%p10 bra $L__BB0_12;
// %bb.24: // %L270.preloop
// in Loop: Header=BB0_9 Depth=3
ld.global.f32 %f4, [%rd166];
cvt.f64.f32 %fd6, %f4;
sub.f64 %fd7, %fd6, %fd1;
cvt.rn.f32.f64 %f5, %fd7;
st.global.f32 [%rd166], %f5;
add.s64 %rd169, %rd169, 1;
add.s64 %rd166, %rd166, 4;
add.s64 %rd165, %rd165, 4;
add.s64 %rd164, %rd164, 4;
setp.lt.u64 %p11, %rd169, %rd39;
@%p11 bra $L__BB0_9;
// %bb.25: // %preloop.exit.selector
// in Loop: Header=BB0_6 Depth=2
// Row finished inside the preloop? Then go straight to the middle-loop latch.
setp.ge.u64 %p12, %rd169, %rd76;
add.s64 %rd163, %rd169, 1;
@%p12 bra $L__BB0_19;
$L__BB0_26: // %preloop.pseudo.exit
// in Loop: Header=BB0_6 Depth=2
// Main-loop bound: %rd40 = unsigned minimum of the inner trip count %rd76 and,
// for each buffer, (limit - min(limit, row-start offset)).
min.s64 %rd126, %rd15, %rd113;
sub.s64 %rd127, %rd15, %rd126;
min.u64 %rd128, %rd76, %rd127;
min.s64 %rd129, %rd14, %rd117;
sub.s64 %rd130, %rd14, %rd129;
min.u64 %rd131, %rd128, %rd130;
min.s64 %rd132, %rd16, %rd122;
sub.s64 %rd133, %rd16, %rd132;
min.u64 %rd40, %rd131, %rd133;
// Skip the main loop when the next element is already at or past the bound.
setp.ge.u64 %p13, %rd169, %rd40;
@%p13 bra $L__BB0_14;
// %bb.27: // %L76.preheader13
// in Loop: Header=BB0_6 Depth=2
// %rd161 = byte offset of the 1-based counter; accesses below use offset - 4.
shl.b64 %rd161, %rd163, 2;
$L__BB0_28: // %L76
// Parent Loop BB0_2 Depth=1
// Parent Loop BB0_6 Depth=2
// => This Inner Loop Header: Depth=3
// Unchecked copy of the inner loop: same arithmetic, no offset comparisons and
// no nounroll pragma. One element per iteration; not unrolled in this PTX.
mov.u64 %rd42, %rd163;
add.s64 %rd140, %rd32, %rd161;
ld.global.f32 %f6, [%rd140+-4];
add.s64 %rd141, %rd34, %rd161;
ld.global.f32 %f7, [%rd141+-4];
cvt.f64.f32 %fd8, %f7;
mul.f64 %fd9, %fd8, 0d3FECCCCCCCCCCCCD;
cvt.f64.f32 %fd10, %f6;
fma.rn.f64 %fd11, %fd10, 0d3FB999999999999A, %fd9;
cvt.rn.f32.f64 %f8, %fd11;
st.global.f32 [%rd141+-4], %f8;
add.s64 %rd142, %rd36, %rd161;
ld.global.f32 %f9, [%rd142+-4];
cvt.f64.f32 %fd12, %f9;
sub.f64 %fd13, %fd12, %fd11;
cvt.rn.f32.f64 %f10, %fd13;
st.global.f32 [%rd142+-4], %f10;
add.s64 %rd163, %rd42, 1;
add.s64 %rd161, %rd161, 4;
setp.lt.u64 %p14, %rd42, %rd40;
@%p14 bra $L__BB0_28;
// %bb.13: // %main.exit.selector
// in Loop: Header=BB0_6 Depth=2
// Row finished inside the main loop? Then go to the middle-loop latch.
setp.ge.u64 %p15, %rd42, %rd76;
@%p15 bra $L__BB0_19;
$L__BB0_14: // %main.pseudo.exit
// in Loop: Header=BB0_6 Depth=2
shl.b64 %rd170, %rd163, 2;
$L__BB0_15: // %L76.postloop
// Parent Loop BB0_2 Depth=1
// Parent Loop BB0_6 Depth=2
// => This Inner Loop Header: Depth=3
// Checked copy for the remaining elements of the row; runs until the 1-based
// counter %rd163 equals %rd23 (inner trip count + 1).
.pragma "nounroll";
// param_2 offset check -> BB0_10.
add.s64 %rd143, %rd31, %rd163;
add.s64 %rd144, %rd143, -1;
setp.ge.u64 %p16, %rd144, %rd14;
@%p16 bra $L__BB0_10;
// %bb.16: // %L109.postloop
// in Loop: Header=BB0_15 Depth=3
// param_3 offset check -> BB0_11.
add.s64 %rd145, %rd33, %rd163;
add.s64 %rd146, %rd145, -1;
setp.ge.u64 %p17, %rd146, %rd15;
@%p17 bra $L__BB0_11;
// %bb.17: // %L147.postloop
// in Loop: Header=BB0_15 Depth=3
add.s64 %rd147, %rd32, %rd170;
ld.global.f32 %f11, [%rd147+-4];
add.s64 %rd148, %rd34, %rd170;
ld.global.f32 %f12, [%rd148+-4];
cvt.f64.f32 %fd14, %f12;
mul.f64 %fd15, %fd14, 0d3FECCCCCCCCCCCCD;
cvt.f64.f32 %fd16, %f11;
fma.rn.f64 %fd2, %fd16, 0d3FB999999999999A, %fd15;
cvt.rn.f32.f64 %f13, %fd2;
st.global.f32 [%rd148+-4], %f13;
// param_1 offset check -> BB0_12 (after the param_3 store, as in the preloop).
add.s64 %rd149, %rd35, %rd163;
add.s64 %rd150, %rd149, -1;
setp.ge.u64 %p18, %rd150, %rd16;
@%p18 bra $L__BB0_12;
// %bb.18: // %L270.postloop
// in Loop: Header=BB0_15 Depth=3
add.s64 %rd151, %rd36, %rd170;
ld.global.f32 %f14, [%rd151+-4];
cvt.f64.f32 %fd17, %f14;
sub.f64 %fd18, %fd17, %fd2;
cvt.rn.f32.f64 %f15, %fd18;
st.global.f32 [%rd151+-4], %f15;
add.s64 %rd163, %rd163, 1;
add.s64 %rd170, %rd170, 4;
setp.eq.s64 %p19, %rd23, %rd163;
@%p19 bra $L__BB0_19;
bra.uni $L__BB0_15;
// Error blocks. All four are identical: copy the two words of param_0
// (%rd72, %r4) into a 16-byte argument, call julia_throw_boundserror_20419,
// then trap and exit. They differ only in which check branches to them.
$L__BB0_10: // %L106
// Reached from the param_2 offset checks.
{ // callseq 22, 0
.reg .b32 temp_param_reg;
.param .align 8 .b8 param0[16];
st.param.b64 [param0+0], %rd72;
st.param.b32 [param0+8], %r4;
call.uni
julia_throw_boundserror_20419,
(
param0
);
} // callseq 22
trap;
// begin inline asm
exit;
// end inline asm
$L__BB0_11: // %L144
// Reached from the param_3 offset checks.
{ // callseq 23, 0
.reg .b32 temp_param_reg;
.param .align 8 .b8 param0[16];
st.param.b64 [param0+0], %rd72;
st.param.b32 [param0+8], %r4;
call.uni
julia_throw_boundserror_20419,
(
param0
);
} // callseq 23
trap;
// begin inline asm
exit;
// end inline asm
$L__BB0_12: // %L226
// Reached from the param_1 offset checks.
{ // callseq 24, 0
.reg .b32 temp_param_reg;
.param .align 8 .b8 param0[16];
st.param.b64 [param0+0], %rd72;
st.param.b32 [param0+8], %r4;
call.uni
julia_throw_boundserror_20419,
(
param0
);
} // callseq 24
trap;
// begin inline asm
exit;
// end inline asm
$L__BB0_3: // %L41
// Reached from the index-buffer check at the top of the outer loop.
{ // callseq 25, 0
.reg .b32 temp_param_reg;
.param .align 8 .b8 param0[16];
st.param.b64 [param0+0], %rd72;
st.param.b32 [param0+8], %r4;
call.uni
julia_throw_boundserror_20419,
(
param0
);
} // callseq 25
trap;
// begin inline asm
exit;
// end inline asm
// -- End function
}
I find it very hard to draw any conclusions from this. I guess the main takeaway is that, for optimal performance, you should default to `@inbounds while/for ...` when it is proven safe, and perhaps check whether some hypothetical `@nounroll @inbounds` would work better. Though I imagine the results will be device- (or architecture-) dependent.