The following almost identical functions lead to quite different LLVM IR:
function f(v::NTuple{N}, w::NTuple{N}, i) where N  # slot j gets v[j] if j <= i, else w[j]
i = i % Int16  # wrap i to Int16, so the comparison below is signed
ntuple(Val(N)) do j  # j = 1, ..., N; N is a compile-time constant
j = j % Int16  # same type as i, so `<=` compares Int16 with Int16
ifelse(j <= i, v[j], w[j])  # ifelse evaluates both v[j] and w[j], then selects
end
end
function g(v::NTuple{N}, w::NTuple{N}, i) where N  # slot j gets v[j] if j <= i, else w[j]
i = i % UInt16  # wrap i to UInt16, so the comparison below is unsigned
ntuple(Val(N)) do j  # j = 1, ..., N; N is a compile-time constant
j = j % UInt16  # same type as i, so `<=` compares UInt16 with UInt16
ifelse(j <= i, v[j], w[j])  # ifelse evaluates both v[j] and w[j], then selects
end
end
With
t = ntuple(Int16, 8) # Int16 values 1, ..., 8 (same for UInt16)
i = 3 # passed as Int64, per the function signatures in the IR below
I get nicely vectorized code for `f` (using `Int16`):
julia> @code_llvm f(t, t, i)
; Function Signature: f(NTuple{8, Int16}, NTuple{8, Int16}, Int64)
; @ REPL[1]:1 within `f`
define void @julia_f_3601(ptr noalias nocapture noundef nonnull sret([8 x i16]) align 2 dereferenceable(16) %sret_return, ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %"v::Tuple", ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %"w::Tuple", i64 signext %"i::Int64") #0 {
top:
; @ REPL[1]:2 within `f`
; ┌ @ int.jl:550 within `rem`
%0 = trunc i64 %"i::Int64" to i16
; └
; @ REPL[1]:3 within `f`
; ┌ @ ntuple.jl:71 within `ntuple`
; │┌ @ ntuple.jl:74 within `macro expansion`
; ││┌ @ REPL[1]:5 within `#f##0`
; │││┌ @ int.jl:520 within `<=`
%1 = insertelement <8 x i16> poison, i16 %0, i64 0
%2 = shufflevector <8 x i16> %1, <8 x i16> poison, <8 x i32> zeroinitializer
%3 = icmp slt <8 x i16> %2, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
; │││└
; │││┌ @ essentials.jl:799 within `ifelse`
%4 = load <8 x i16>, ptr %"v::Tuple", align 2
%5 = load <8 x i16>, ptr %"w::Tuple", align 2
%6 = select <8 x i1> %3, <8 x i16> %5, <8 x i16> %4
; │││└
store <8 x i16> %6, ptr %sret_return, align 2
ret void
; └└
}
but different IR for `g` (using `UInt16`):
julia> @code_llvm g(t, t, i)
; Function Signature: g(NTuple{8, Int16}, NTuple{8, Int16}, Int64)
; @ REPL[2]:1 within `g`
define void @julia_g_3634(ptr noalias nocapture noundef nonnull sret([8 x i16]) align 2 dereferenceable(16) %sret_return, ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %"v::Tuple", ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %"w::Tuple", i64 signext %"i::Int64") #0 {
top:
; @ REPL[2]:2 within `g`
; ┌ @ int.jl:550 within `rem`
%0 = trunc i64 %"i::Int64" to i16
; └
; @ REPL[2]:3 within `g`
; ┌ @ ntuple.jl:71 within `ntuple`
; │┌ @ ntuple.jl:74 within `macro expansion`
; ││┌ @ REPL[2]:5 within `#g##0`
; │││┌ @ int.jl:521 within `<=`
%1 = insertelement <8 x i16> <i16 0, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison>, i16 %0, i64 1
%2 = shufflevector <8 x i16> %1, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%3 = insertelement <8 x i16> <i16 poison, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i16 %0, i64 0
%4 = icmp eq <8 x i16> %2, %3
%5 = icmp ult <8 x i16> %2, %3
%6 = shufflevector <8 x i1> %4, <8 x i1> %5, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; │││└
; │││┌ @ essentials.jl:799 within `ifelse`
%7 = load <8 x i16>, ptr %"v::Tuple", align 2
%8 = load <8 x i16>, ptr %"w::Tuple", align 2
%9 = select <8 x i1> %6, <8 x i16> %8, <8 x i16> %7
; │││└
store <8 x i16> %9, ptr %sret_return, align 2
ret void
; └└
}
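In assembly instructions the difference is quite marked, and for tuples of length 16 the function `g` is more than 50% slower than `f`. I don't understand why the IR for `g` is not analogous to that for `f`, with `icmp slt` replaced by `icmp ult`. Note that in the IR for `g`, lane 0 is computed with `icmp eq` against 0 (that is, the unsigned test `1 <= i` appears as `i == 0` with the select operands swapped), while the remaining lanes use `icmp ult`. Is this an LLVM bug?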