Many constants in the CUDA world are 32-bit, e.g. the warp size, thread and block IDs and dimensions, etc. We don't promote these to Int64, in order to avoid conversions when doing math on them; however, not doing so might be equally expensive because of the conversions that happen when doing math with literals.
For example, take the following idiomatic code:
```julia
# Fold `val` with the values obtained from `shfl_down` at successively halved
# offsets, starting at half of `CUDAnative.warpsize()`; declared to return a `T`.
function reduce_warp{F<:Function,T}(op::F, val::T)::T
# The literal `2` is an Int64, so the Int32 warp size is promoted and `offset`
# becomes an Int64 here.
offset = CUDAnative.warpsize() ÷ 2
while offset > 0
# `shfl_down` takes an Int32 offset, so the Int64 `offset` has to be converted
# back on every iteration.
val = op(val, shfl_down(val, offset))
offset ÷= 2
end
return val
end
```
`warpsize` yields an Int32, but it gets converted and promoted to Int64 because of the `÷ 2`. This in turn causes `shfl_down`, which takes an Int32, to convert it back, including an exactness check + exception (trap):
```llvm
julia> CUDAnative.code_llvm(reduce_warp, (typeof(+), Int32))
define i32 @julia_reduce_warp_62748(i32) local_unnamed_addr #0 {
top:
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
%2 = icmp slt i32 %1, 2
br i1 %2, label %L23, label %if.preheader
if.preheader: ; preds = %top
%3 = lshr i32 %1, 1
%4 = zext i32 %3 to i64
br label %if
if: ; preds = %if.preheader, %pass2
%val.03 = phi i32 [ %9, %pass2 ], [ %0, %if.preheader ]
%offset.02 = phi i64 [ %10, %pass2 ], [ %4, %if.preheader ]
%sext = shl i64 %offset.02, 32
%5 = ashr exact i64 %sext, 32
%6 = icmp eq i64 %5, %offset.02
br i1 %6, label %pass2, label %fail1
L23.loopexit: ; preds = %pass2
br label %L23
L23: ; preds = %L23.loopexit, %top
%val.0.lcssa = phi i32 [ %0, %top ], [ %9, %L23.loopexit ]
ret i32 %val.0.lcssa
fail1: ; preds = %if
tail call void @llvm.trap()
unreachable
pass2: ; preds = %if
%7 = trunc i64 %offset.02 to i32
%8 = tail call i32 @llvm.nvvm.shfl.down.i32(i32 %val.03, i32 %7, i32 31)
%9 = add i32 %8, %val.03
%10 = lshr i64 %offset.02, 1
%11 = icmp eq i64 %10, 0
br i1 %11, label %L23.loopexit, label %if
}
```
An improved, but less readable version of the same code goes like:
```julia
# Fold `val` with the values obtained from `shfl_down` at successively halved
# offsets, starting at half of `CUDAnative.warpsize()`; declared to return a `T`.
# Every integer literal is written as an Int32 so that `offset` is not promoted.
function reduce_warp{F<:Function,T}(op::F, val::T)::T
# Int32 ÷ Int32 keeps `offset` an Int32, the type `warpsize` yields.
offset = CUDAnative.warpsize() ÷ Int32(2)
while offset > Int32(0)
# `offset` already has the Int32 type `shfl_down` takes, so no conversion is needed.
val = op(val, shfl_down(val, offset))
offset ÷= Int32(2)
end
return val
end
```
This yields the following, much cleaner IR:
```llvm
define i32 @julia_reduce_warp_62749(i32) local_unnamed_addr #0 {
top:
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
%2 = icmp slt i32 %1, 2
br i1 %2, label %L25, label %if.preheader
if.preheader: ; preds = %top
br label %if
if: ; preds = %if.preheader, %if
%offset.03.in = phi i32 [ %offset.03, %if ], [ %1, %if.preheader ]
%val.02 = phi i32 [ %4, %if ], [ %0, %if.preheader ]
%offset.03 = sdiv i32 %offset.03.in, 2
%3 = tail call i32 @llvm.nvvm.shfl.down.i32(i32 %val.02, i32 %offset.03, i32 31)
%4 = add i32 %3, %val.02
%5 = icmp slt i32 %offset.03.in, 4
br i1 %5, label %L25.loopexit, label %if
L25.loopexit: ; preds = %if
br label %L25
L25: ; preds = %L25.loopexit, %top
%val.0.lcssa = phi i32 [ %0, %top ], [ %4, %L25.loopexit ]
ret i32 %val.0.lcssa
}
```