Poor vectorization when comparing unsigned integers

The following almost identical functions lead to quite different LLVM IR:

function f(v::NTuple{N}, w::NTuple{N}, i) where N
    i = i % Int16
    ntuple(Val(N)) do j
        j = j % Int16
        ifelse(j <= i, v[j],  w[j])
    end
end

function g(v::NTuple{N}, w::NTuple{N}, i) where N
    i = i % UInt16
    ntuple(Val(N)) do j
        j = j % UInt16
        ifelse(j <= i, v[j],  w[j])
    end
end

With

t = ntuple(Int16, 8)  # same for UInt16
i = 3

I get nicely vectorized code for f (using Int16)

julia> @code_llvm f(t, t, i)
; Function Signature: f(NTuple{8, Int16}, NTuple{8, Int16}, Int64)
;  @ REPL[1]:1 within `f`
define void @julia_f_3601(ptr noalias nocapture noundef nonnull sret([8 x i16]) align 2 dereferenceable(16) %sret_return, ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %"v::Tuple", ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %"w::Tuple", i64 signext %"i::Int64") #0 {
top:
;  @ REPL[1]:2 within `f`
; β”Œ @ int.jl:550 within `rem`
   %0 = trunc i64 %"i::Int64" to i16
; β””
;  @ REPL[1]:3 within `f`
; β”Œ @ ntuple.jl:71 within `ntuple`
; β”‚β”Œ @ ntuple.jl:74 within `macro expansion`
; β”‚β”‚β”Œ @ REPL[1]:5 within `#f##0`
; β”‚β”‚β”‚β”Œ @ int.jl:520 within `<=`
      %1 = insertelement <8 x i16> poison, i16 %0, i64 0
      %2 = shufflevector <8 x i16> %1, <8 x i16> poison, <8 x i32> zeroinitializer
      %3 = icmp slt <8 x i16> %2, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
; β”‚β”‚β”‚β””
; β”‚β”‚β”‚β”Œ @ essentials.jl:799 within `ifelse`
      %4 = load <8 x i16>, ptr %"v::Tuple", align 2
      %5 = load <8 x i16>, ptr %"w::Tuple", align 2
      %6 = select <8 x i1> %3, <8 x i16> %5, <8 x i16> %4
; β”‚β”‚β””β””
    store <8 x i16> %6, ptr %sret_return, align 2
    ret void
; β””β””
}

but different IR for g (using UInt16)

julia> @code_llvm g(t, t, i)
; Function Signature: g(NTuple{8, Int16}, NTuple{8, Int16}, Int64)
;  @ REPL[2]:1 within `g`
define void @julia_g_3634(ptr noalias nocapture noundef nonnull sret([8 x i16]) align 2 dereferenceable(16) %sret_return, ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %"v::Tuple", ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %"w::Tuple", i64 signext %"i::Int64") #0 {
top:
;  @ REPL[2]:2 within `g`
; β”Œ @ int.jl:550 within `rem`
   %0 = trunc i64 %"i::Int64" to i16
; β””
;  @ REPL[2]:3 within `g`
; β”Œ @ ntuple.jl:71 within `ntuple`
; β”‚β”Œ @ ntuple.jl:74 within `macro expansion`
; β”‚β”‚β”Œ @ REPL[2]:5 within `#g##0`
; β”‚β”‚β”‚β”Œ @ int.jl:521 within `<=`
      %1 = insertelement <8 x i16> <i16 0, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison>, i16 %0, i64 1
      %2 = shufflevector <8 x i16> %1, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
      %3 = insertelement <8 x i16> <i16 poison, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i16 %0, i64 0
      %4 = icmp eq <8 x i16> %2, %3
      %5 = icmp ult <8 x i16> %2, %3
      %6 = shufflevector <8 x i1> %4, <8 x i1> %5, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; β”‚β”‚β”‚β””
; β”‚β”‚β”‚β”Œ @ essentials.jl:799 within `ifelse`
      %7 = load <8 x i16>, ptr %"v::Tuple", align 2
      %8 = load <8 x i16>, ptr %"w::Tuple", align 2
      %9 = select <8 x i1> %6, <8 x i16> %8, <8 x i16> %7
; β”‚β”‚β””β””
    store <8 x i16> %9, ptr %sret_return, align 2
    ret void
; β””β””
}

In assembly instructions the difference is quite marked, and for tuples of length 16 the function g is more than 50% slower than f. I don’t understand why the IR for g is not analogous to that for f, with icmp slt replaced by icmp ult. Is this an LLVM bug?

2 Likes

The values of v and w are not compared, only the variables i and j. For f they are both Int16 and for g they are both UInt16. I still see no reason why the IR for g should not be identical to the IR for f with icmp slt replaced by icmp ult.

Sorry for misreading the functions. That is indeed very weird. Following along the LLVM IR, the signed version is correctly vectorized as [i<1, ..., i<8], but the unsigned version is bizarre: [i==0, i<2, ..., i<8]. Maybe LLVM prematurely canonicalized the unsigned comparison i < 1 into i == 0 (the two are equivalent for unsigned integers)?

I suspect this has to do with LLVM’s optimization passes’ order, I would look into disabling certain passes with opt.

I figured something out with export JULIA_LLVM_ARGS=-print-after=BeforeVectorization; julia.

--- f(t, t, i)
+++ g(t, t, i)
- ; *** IR Dump After BeforeVectorizationMarkerPass on julia_f_432 ***
- define swiftcc void @julia_f_432(ptr noalias nocapture noundef nonnull sret([8 x i16]) align 2 dereferenceable(16) %0, ptr nonnull swiftself %1, ptr addrspace(11) nocapture noundef nonnull readonly align 2 dereferenceable(16) %2, ptr addrspace(11) nocapture noundef nonnull readonly align 2 dereferenceable(16) %3, i64 signext %4) #0 !dbg !6 {
+ ; *** IR Dump After BeforeVectorizationMarkerPass on julia_g_156 ***
+ define swiftcc void @julia_g_156(ptr noalias nocapture noundef nonnull sret([8 x i16]) align 2 dereferenceable(16) %0, ptr nonnull swiftself %1, ptr addrspace(11) nocapture noundef nonnull readonly align 2 dereferenceable(16) %2, ptr addrspace(11) nocapture noundef nonnull readonly align 2 dereferenceable(16) %3, i64 signext %4) #0 !dbg !6 {
    %6 = call ptr @julia.get_pgcstack()
    %7 = getelementptr inbounds i8, ptr %6, i64 16
    %8 = load ptr, ptr %7, align 8, !tbaa !9
    %9 = getelementptr inbounds i8, ptr %8, i64 16
    %10 = load ptr, ptr %9, align 8, !tbaa !13, !invariant.load !0
    fence syncscope("singlethread") seq_cst
    call void @julia.safepoint(ptr %10), !dbg !15
    fence syncscope("singlethread") seq_cst
    %11 = trunc i64 %4 to i16, !dbg !16
-   %12 = icmp slt i16 %11, 1, !dbg !20
+   %12 = icmp eq i16 %11, 0, !dbg !20
    %13 = load i16, ptr addrspace(11) %2, align 2, !dbg !30, !tbaa !13, !invariant.load !0, !alias.scope !33, !noalias !36
    %14 = load i16, ptr addrspace(11) %3, align 2, !dbg !30, !tbaa !13, !invariant.load !0, !alias.scope !33, !noalias !36
    %15 = select i1 %12, i16 %14, i16 %13, !dbg !30
-   %16 = icmp slt i16 %11, 2, !dbg !20
+   %16 = icmp ult i16 %11, 2, !dbg !20
    %17 = getelementptr inbounds i8, ptr addrspace(11) %2, i64 2, !dbg !41
    %18 = getelementptr inbounds i8, ptr addrspace(11) %3, i64 2, !dbg !41
    %19 = load i16, ptr addrspace(11) %17, align 2, !dbg !30, !tbaa !13, !invariant.load !0, !alias.scope !33, !noalias !36
    %20 = load i16, ptr addrspace(11) %18, align 2, !dbg !30, !tbaa !13, !invariant.load !0, !alias.scope !33, !noalias !36
    %21 = select i1 %16, i16 %20, i16 %19, !dbg !30
-   %22 = icmp slt i16 %11, 3, !dbg !20
+   %22 = icmp ult i16 %11, 3, !dbg !20
...

As you can see, a premature optimization rewrites icmp ult i16 %11, 1 into icmp eq i16 %11, 0 before vectorization. Notice that all the other icmp instructions of g(t, t, i) do use ult as expected. It must be this particular simplification, run by a pass before the vectorization pass, that broke proper vectorization.

Further investigation with export JULIA_LLVM_ARGS=-print-changed; julia g.jl pinpoints the exact cause, confirming my suspicion: the rewrite happens in LLVM's InstCombine optimization pass.

--- before premature instcombine
+++  after premature instcombine
- *** IR Dump After SROAPass on julia_g_65 ***
+ *** IR Dump After InstCombinePass on julia_g_65 ***
  define swiftcc void @julia_g_65(ptr noalias nocapture noundef nonnull sret([8 x i16]) align 2 dereferenceable(16) %0, ptr nonnull swiftself %1, ptr addrspace(11) nocapture noundef nonnull readonly align 2 dereferenceable(16) %2, ptr addrspace(11) nocapture noundef nonnull readonly align 2 dereferenceable(16) %3, i64 signext %4) #0 !dbg !6 {
    ...
-   %12 = trunc i64 %4 to i16, !dbg !16
-   %13 = icmp ule i16 1, %12, !dbg !20
-   %14 = xor i1 %13, true, !dbg !30
-   %15 = load i16, ptr addrspace(11) %2, align 2, !dbg !30, !tbaa !13, !invariant.load !0, !alias.scope !33, !noalias !36
-   %16 = load i16, ptr addrspace(11) %3, align 2, !dbg !30, !tbaa !13, !invariant.load !0, !alias.scope !33, !noalias !36
-   %17 = select i1 %14, i16 %16, i16 %15, !dbg !30
-   %18 = icmp ule i16 2, %12, !dbg !20
+   %11 = trunc i64 %4 to i16, !dbg !16
+   %12 = icmp eq i16 %11, 0, !dbg !20
+   %13 = load i16, ptr addrspace(11) %2, align 2, !dbg !30, !tbaa !13, !invariant.load !0, !alias.scope !33, !noalias !36
+   %14 = load i16, ptr addrspace(11) %3, align 2, !dbg !30, !tbaa !13, !invariant.load !0, !alias.scope !33, !noalias !36
+   %15 = select i1 %12, i16 %14, i16 %13, !dbg !30
+   %16 = icmp ult i16 %11, 2, !dbg !20
...

This premature canonicalization of icmp ule i16 1, %12 (i.e. 1 <= i) into icmp eq i16 %11, 0 is the actual culprit of the failed vectorization: once one of the eight comparisons is an eq rather than a ult, the vectorizer can no longer merge them into a single vector icmp ult against a constant vector, and instead emits the awkward shuffle/eq/ult combination seen above.

Therefore, it is indeed an LLVM bug, though I doubt that merely reordering the optimization passes could fix the problem — the canonicalization itself would need to be undone or recognized by the vectorizer. I opened an issue on LLVM.

1 Like