Poor vectorization when comparing unsigned integers

The following almost identical functions lead to quite different LLVM IR:

function f(v::NTuple{N}, w::NTuple{N}, i) where N
    i = i % Int16
    ntuple(Val(N)) do j
        j = j % Int16
        ifelse(j <= i, v[j],  w[j])
    end
end

function g(v::NTuple{N}, w::NTuple{N}, i) where N
    i = i % UInt16
    ntuple(Val(N)) do j
        j = j % UInt16
        ifelse(j <= i, v[j],  w[j])
    end
end

With

t = ntuple(Int16, 8)  # same for UInt16
i = 3

I get nicely vectorized code for f (using Int16)

julia> @code_llvm f(t, t, i)
; Function Signature: f(NTuple{8, Int16}, NTuple{8, Int16}, Int64)
;  @ REPL[1]:1 within `f`
define void @julia_f_3601(ptr noalias nocapture noundef nonnull sret([8 x i16]) align 2 dereferenceable(16) %sret_return, ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %"v::Tuple", ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %"w::Tuple", i64 signext %"i::Int64") #0 {
top:
;  @ REPL[1]:2 within `f`
; β”Œ @ int.jl:550 within `rem`
   %0 = trunc i64 %"i::Int64" to i16
; β””
;  @ REPL[1]:3 within `f`
; β”Œ @ ntuple.jl:71 within `ntuple`
; β”‚β”Œ @ ntuple.jl:74 within `macro expansion`
; β”‚β”‚β”Œ @ REPL[1]:5 within `#f##0`
; β”‚β”‚β”‚β”Œ @ int.jl:520 within `<=`
      %1 = insertelement <8 x i16> poison, i16 %0, i64 0
      %2 = shufflevector <8 x i16> %1, <8 x i16> poison, <8 x i32> zeroinitializer
      %3 = icmp slt <8 x i16> %2, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
; β”‚β”‚β”‚β””
; β”‚β”‚β”‚β”Œ @ essentials.jl:799 within `ifelse`
      %4 = load <8 x i16>, ptr %"v::Tuple", align 2
      %5 = load <8 x i16>, ptr %"w::Tuple", align 2
      %6 = select <8 x i1> %3, <8 x i16> %5, <8 x i16> %4
; β”‚β”‚β””β””
    store <8 x i16> %6, ptr %sret_return, align 2
    ret void
; β””β””
}

but different IR for g (using UInt16)

julia> @code_llvm g(t, t, i)
; Function Signature: g(NTuple{8, Int16}, NTuple{8, Int16}, Int64)
;  @ REPL[2]:1 within `g`
define void @julia_g_3634(ptr noalias nocapture noundef nonnull sret([8 x i16]) align 2 dereferenceable(16) %sret_return, ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %"v::Tuple", ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %"w::Tuple", i64 signext %"i::Int64") #0 {
top:
;  @ REPL[2]:2 within `g`
; β”Œ @ int.jl:550 within `rem`
   %0 = trunc i64 %"i::Int64" to i16
; β””
;  @ REPL[2]:3 within `g`
; β”Œ @ ntuple.jl:71 within `ntuple`
; β”‚β”Œ @ ntuple.jl:74 within `macro expansion`
; β”‚β”‚β”Œ @ REPL[2]:5 within `#g##0`
; β”‚β”‚β”‚β”Œ @ int.jl:521 within `<=`
      %1 = insertelement <8 x i16> <i16 0, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison>, i16 %0, i64 1
      %2 = shufflevector <8 x i16> %1, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
      %3 = insertelement <8 x i16> <i16 poison, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i16 %0, i64 0
      %4 = icmp eq <8 x i16> %2, %3
      %5 = icmp ult <8 x i16> %2, %3
      %6 = shufflevector <8 x i1> %4, <8 x i1> %5, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; β”‚β”‚β”‚β””
; β”‚β”‚β”‚β”Œ @ essentials.jl:799 within `ifelse`
      %7 = load <8 x i16>, ptr %"v::Tuple", align 2
      %8 = load <8 x i16>, ptr %"w::Tuple", align 2
      %9 = select <8 x i1> %6, <8 x i16> %8, <8 x i16> %7
; β”‚β”‚β””β””
    store <8 x i16> %9, ptr %sret_return, align 2
    ret void
; β””β””
}

In assembly instructions the difference is quite marked, and for tuples of length 16 the function g is more than 50% slower than f. I don’t understand why the IR for g is not analogous to that for f, with icmp slt replaced by icmp ult. Is this an LLVM bug?

2 Likes

The values of v and w are not compared, only the variables i and j. For f they are both Int16 and for g they are both UInt16. I still see no reason why the IR for g should not be identical to the IR for f with icmp slt replaced by icmp ult.

Sorry for misreading the functions. That is indeed very weird. Following along the LLVM IR, the signed version is correctly vectorized as [i<1, ..., i<8], but the unsigned version is bizarre: [i==0, i<2, ..., i<8]. Maybe LLVM prematurely canonicalized the unsigned comparison i < 1 into i == 0 (the two are equivalent for unsigned integers)?

I suspect this has to do with LLVM’s optimization passes’ order, I would look into disabling certain passes with opt.

I figured something out with export JULIA_LLVM_ARGS=-print-after=BeforeVectorization; julia.

--- f(t, t, i)
+++ g(t, t, i)
- ; *** IR Dump After BeforeVectorizationMarkerPass on julia_f_432 ***
- define swiftcc void @julia_f_432(ptr noalias nocapture noundef nonnull sret([8 x i16]) align 2 dereferenceable(16) %0, ptr nonnull swiftself %1, ptr addrspace(11) nocapture noundef nonnull readonly align 2 dereferenceable(16) %2, ptr addrspace(11) nocapture noundef nonnull readonly align 2 dereferenceable(16) %3, i64 signext %4) #0 !dbg !6 {
+ ; *** IR Dump After BeforeVectorizationMarkerPass on julia_g_156 ***
+ define swiftcc void @julia_g_156(ptr noalias nocapture noundef nonnull sret([8 x i16]) align 2 dereferenceable(16) %0, ptr nonnull swiftself %1, ptr addrspace(11) nocapture noundef nonnull readonly align 2 dereferenceable(16) %2, ptr addrspace(11) nocapture noundef nonnull readonly align 2 dereferenceable(16) %3, i64 signext %4) #0 !dbg !6 {
    %6 = call ptr @julia.get_pgcstack()
    %7 = getelementptr inbounds i8, ptr %6, i64 16
    %8 = load ptr, ptr %7, align 8, !tbaa !9
    %9 = getelementptr inbounds i8, ptr %8, i64 16
    %10 = load ptr, ptr %9, align 8, !tbaa !13, !invariant.load !0
    fence syncscope("singlethread") seq_cst
    call void @julia.safepoint(ptr %10), !dbg !15
    fence syncscope("singlethread") seq_cst
    %11 = trunc i64 %4 to i16, !dbg !16
-   %12 = icmp slt i16 %11, 1, !dbg !20
+   %12 = icmp eq i16 %11, 0, !dbg !20
    %13 = load i16, ptr addrspace(11) %2, align 2, !dbg !30, !tbaa !13, !invariant.load !0, !alias.scope !33, !noalias !36
    %14 = load i16, ptr addrspace(11) %3, align 2, !dbg !30, !tbaa !13, !invariant.load !0, !alias.scope !33, !noalias !36
    %15 = select i1 %12, i16 %14, i16 %13, !dbg !30
-   %16 = icmp slt i16 %11, 2, !dbg !20
+   %16 = icmp ult i16 %11, 2, !dbg !20
    %17 = getelementptr inbounds i8, ptr addrspace(11) %2, i64 2, !dbg !41
    %18 = getelementptr inbounds i8, ptr addrspace(11) %3, i64 2, !dbg !41
    %19 = load i16, ptr addrspace(11) %17, align 2, !dbg !30, !tbaa !13, !invariant.load !0, !alias.scope !33, !noalias !36
    %20 = load i16, ptr addrspace(11) %18, align 2, !dbg !30, !tbaa !13, !invariant.load !0, !alias.scope !33, !noalias !36
    %21 = select i1 %16, i16 %20, i16 %19, !dbg !30
-   %22 = icmp slt i16 %11, 3, !dbg !20
+   %22 = icmp ult i16 %11, 3, !dbg !20
...

As you can see, a premature optimization rewrites icmp ult i16 %11, 1 into icmp eq i16 %11, 0 before vectorization. Notice that all the other icmp instructions of g(t, t, i) do use ult as expected. It must be this particular simplification, run by a pass before the vectorization pass, that broke proper vectorization.

Further investigation with export JULIA_LLVM_ARGS=-print-changed; julia g.jl pinpoints the exact cause, confirming my suspicion: the rewrite happens in LLVM's InstCombine optimization pass.

--- before premature instcombine
+++  after premature instcombine
- *** IR Dump After SROAPass on julia_g_65 ***
+ *** IR Dump After InstCombinePass on julia_g_65 ***
  define swiftcc void @julia_g_65(ptr noalias nocapture noundef nonnull sret([8 x i16]) align 2 dereferenceable(16) %0, ptr nonnull swiftself %1, ptr addrspace(11) nocapture noundef nonnull readonly align 2 dereferenceable(16) %2, ptr addrspace(11) nocapture noundef nonnull readonly align 2 dereferenceable(16) %3, i64 signext %4) #0 !dbg !6 {
    ...
-   %12 = trunc i64 %4 to i16, !dbg !16
-   %13 = icmp ule i16 1, %12, !dbg !20
-   %14 = xor i1 %13, true, !dbg !30
-   %15 = load i16, ptr addrspace(11) %2, align 2, !dbg !30, !tbaa !13, !invariant.load !0, !alias.scope !33, !noalias !36
-   %16 = load i16, ptr addrspace(11) %3, align 2, !dbg !30, !tbaa !13, !invariant.load !0, !alias.scope !33, !noalias !36
-   %17 = select i1 %14, i16 %16, i16 %15, !dbg !30
-   %18 = icmp ule i16 2, %12, !dbg !20
+   %11 = trunc i64 %4 to i16, !dbg !16
+   %12 = icmp eq i16 %11, 0, !dbg !20
+   %13 = load i16, ptr addrspace(11) %2, align 2, !dbg !30, !tbaa !13, !invariant.load !0, !alias.scope !33, !noalias !36
+   %14 = load i16, ptr addrspace(11) %3, align 2, !dbg !30, !tbaa !13, !invariant.load !0, !alias.scope !33, !noalias !36
+   %15 = select i1 %12, i16 %14, i16 %13, !dbg !30
+   %16 = icmp ult i16 %11, 2, !dbg !20
...

This premature canonicalization of icmp ule i16 1, %12 (i.e. 1 <= i) into icmp eq i16 %11, 0 is the actual culprit of the failed vectorization: once one of the eight comparisons is an eq rather than a ult, the vectorizer can no longer merge them into a single vector icmp ult against a constant vector, and instead emits the awkward shuffle/eq/ult combination seen above.

Therefore, it is indeed an LLVM bug, though I doubt that merely reordering the optimization passes could fix the problem — the canonicalization itself would need to be undone or recognized by the vectorizer. I opened an issue on LLVM.

1 Like