Poor vectorization when comparing unsigned integers

The following almost identical functions lead to quite different LLVM IR:

# Select elementwise between `v` and `w`: position j takes v[j] when j <= i, else w[j].
# Uses *signed* Int16 for the comparison, which (per the IR below) lowers to a
# single vectorized `icmp slt`.
function f(v::NTuple{N}, w::NTuple{N}, i) where N
    i = i % Int16  # bit-truncate i (an Int) to signed 16 bits — shows up as `trunc i64 ... to i16` in the IR
    ntuple(Val(N)) do j
        j = j % Int16  # index as signed Int16 so `<=` is the signed comparison (int.jl:520)
        ifelse(j <= i, v[j],  w[j])
    end
end

# Identical selection logic to `f`, but with *unsigned* UInt16 for the comparison;
# per the IR below this lowers to an eq/ult pair plus shufflevector instead of a
# single vectorized compare.
function g(v::NTuple{N}, w::NTuple{N}, i) where N
    i = i % UInt16  # bit-truncate i (an Int) to unsigned 16 bits
    ntuple(Val(N)) do j
        j = j % UInt16  # index as UInt16 so `<=` is the unsigned comparison (int.jl:521)
        ifelse(j <= i, v[j],  w[j])
    end
end

With

t = ntuple(Int16, 8)  # same for UInt16
i = 3

I get nicely vectorized code for f (using Int16)

julia> @code_llvm f(t, t, i)
; Function Signature: f(NTuple{8, Int16}, NTuple{8, Int16}, Int64)
;  @ REPL[1]:1 within `f`
define void @julia_f_3601(ptr noalias nocapture noundef nonnull sret([8 x i16]) align 2 dereferenceable(16) %sret_return, ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %"v::Tuple", ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %"w::Tuple", i64 signext %"i::Int64") #0 {
top:
;  @ REPL[1]:2 within `f`
; β”Œ @ int.jl:550 within `rem`
   %0 = trunc i64 %"i::Int64" to i16
; β””
;  @ REPL[1]:3 within `f`
; β”Œ @ ntuple.jl:71 within `ntuple`
; β”‚β”Œ @ ntuple.jl:74 within `macro expansion`
; β”‚β”‚β”Œ @ REPL[1]:5 within `#f##0`
; β”‚β”‚β”‚β”Œ @ int.jl:520 within `<=`
      %1 = insertelement <8 x i16> poison, i16 %0, i64 0
      %2 = shufflevector <8 x i16> %1, <8 x i16> poison, <8 x i32> zeroinitializer
      %3 = icmp slt <8 x i16> %2, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
; β”‚β”‚β”‚β””
; β”‚β”‚β”‚β”Œ @ essentials.jl:799 within `ifelse`
      %4 = load <8 x i16>, ptr %"v::Tuple", align 2
      %5 = load <8 x i16>, ptr %"w::Tuple", align 2
      %6 = select <8 x i1> %3, <8 x i16> %5, <8 x i16> %4
; β”‚β”‚β””β””
    store <8 x i16> %6, ptr %sret_return, align 2
    ret void
; β””β””
}

but different IR for g (using UInt16)

julia> @code_llvm g(t, t, i)
; Function Signature: g(NTuple{8, Int16}, NTuple{8, Int16}, Int64)
;  @ REPL[2]:1 within `g`
define void @julia_g_3634(ptr noalias nocapture noundef nonnull sret([8 x i16]) align 2 dereferenceable(16) %sret_return, ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %"v::Tuple", ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %"w::Tuple", i64 signext %"i::Int64") #0 {
top:
;  @ REPL[2]:2 within `g`
; β”Œ @ int.jl:550 within `rem`
   %0 = trunc i64 %"i::Int64" to i16
; β””
;  @ REPL[2]:3 within `g`
; β”Œ @ ntuple.jl:71 within `ntuple`
; β”‚β”Œ @ ntuple.jl:74 within `macro expansion`
; β”‚β”‚β”Œ @ REPL[2]:5 within `#g##0`
; β”‚β”‚β”‚β”Œ @ int.jl:521 within `<=`
      %1 = insertelement <8 x i16> <i16 0, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison>, i16 %0, i64 1
      %2 = shufflevector <8 x i16> %1, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
      %3 = insertelement <8 x i16> <i16 poison, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i16 %0, i64 0
      %4 = icmp eq <8 x i16> %2, %3
      %5 = icmp ult <8 x i16> %2, %3
      %6 = shufflevector <8 x i1> %4, <8 x i1> %5, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; β”‚β”‚β”‚β””
; β”‚β”‚β”‚β”Œ @ essentials.jl:799 within `ifelse`
      %7 = load <8 x i16>, ptr %"v::Tuple", align 2
      %8 = load <8 x i16>, ptr %"w::Tuple", align 2
      %9 = select <8 x i1> %6, <8 x i16> %8, <8 x i16> %7
; β”‚β”‚β””β””
    store <8 x i16> %9, ptr %sret_return, align 2
    ret void
; β””β””
}

In the generated assembly the difference is quite marked, and for tuples of length 16 the function g is more than 50% slower than f. I don’t understand why the IR for g is not analogous to that for f, with `icmp slt` simply replaced by `icmp ult`. Is this an LLVM bug?

1 Like