Another case of poor vectorization

I see poor vectorization when creating ranges as tuples. The flavor is similar to this issue, but the details are different. This time there are no comparisons involved (which were the culprit in the linked post).

# Build the 16-element range `start, start + step, ..., start + 15*step` as an
# `NTuple{16, Int8}`, computed with explicit LLVM vector instructions.
#
# Arguments:
#   start — first element of the range.
#   step  — difference between consecutive elements; defaults to `Int8(1)`.
#
# The `mul`/`add` in the IR carry no `nsw`/`nuw` flags, so results wrap modulo 2^8.
function tuple_range(start::Int8, step::Int8 = Int8(1))
    # In the IR below `%0` is `start` and `%1` is `step`. Each scalar is splatted
    # across all 16 lanes (insertelement + shufflevector with a zero mask), the
    # `step` splat is multiplied by the lane indices 0..15, and the `start` splat
    # is added to every lane.
    t = Base.llvmcall("""
        %s  = insertelement <16 x i8> poison, i8 %1, i8 0
        %sv = shufflevector <16 x i8> %s, <16 x i8> poison, <16 x i32> zeroinitializer
        %u  = mul <16 x i8> %sv, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>
        %x  = insertelement <16 x i8> poison, i8 %0, i8 0
        %xv = shufflevector <16 x i8> %x, <16 x i8> poison, <16 x i32> zeroinitializer
        %w  = add <16 x i8> %u, %xv
        ret <16 x i8> %w
    """, NTuple{16, VecElement{Int8}}, Tuple{Int8, Int8}, start, step)
    # `llvmcall` returns the lanes wrapped in `VecElement{Int8}`; read each `.value`
    # so the caller gets a plain tuple of `Int8`.
    ntuple(i -> t[i].value, Val(16))  # remove VecElement
end

Example:

julia> tuple_range(Int8(1), Int8(2))
(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31)

julia> tuple_range(Int8(1))
(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)

The IR for the 2-argument method looks good:

julia> @code_llvm tuple_range(Int8(1), Int8(2))
; Function Signature: tuple_range(Int8, Int8)
;  @ REPL[1]:1 within `tuple_range`
define void @julia_tuple_range_1635(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %sret_return, i8 signext %"start::Int8", i8 signext %"step::Int8") #0 {
top:
;  @ REPL[1]:2 within `tuple_range`
  %s.i = insertelement <16 x i8> poison, i8 %"step::Int8", i64 0
  %sv.i = shufflevector <16 x i8> %s.i, <16 x i8> poison, <16 x i32> zeroinitializer
  %u.i = mul <16 x i8> %sv.i, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>
  %x.i = insertelement <16 x i8> poison, i8 %"start::Int8", i64 0
  %xv.i = shufflevector <16 x i8> %x.i, <16 x i8> poison, <16 x i32> zeroinitializer
  %w.i = add <16 x i8> %u.i, %xv.i
;  @ REPL[1]:11 within `tuple_range`
; ┌ @ ntuple.jl:71 within `ntuple`
; │┌ @ ntuple.jl:74 within `macro expansion`
    store <16 x i8> %w.i, ptr %sret_return, align 1
    ret void
; └└
}

but the one for the 1-argument method does not:

julia> @code_llvm tuple_range(Int8(1))
; Function Signature: tuple_range(Int8)
;  @ REPL[1]:1 within `tuple_range`
define void @julia_tuple_range_1654(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %sret_return, i8 signext %"start::Int8") #0 {
top:
;  @ REPL[1]:2 within `tuple_range` @ REPL[1]:2
  %x.i = insertelement <16 x i8> poison, i8 %"start::Int8", i64 0
  %xv.i = shufflevector <16 x i8> %x.i, <16 x i8> poison, <16 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %w.i = add <16 x i8> %xv.i, <i8 poison, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>
;  @ REPL[1]:2 within `tuple_range` @ REPL[1]:11
; ┌ @ ntuple.jl:71 within `ntuple`
; │┌ @ ntuple.jl:74 within `macro expansion`
; ││┌ @ REPL[1]:11 within `#tuple_range##0`
; │││┌ @ tuple.jl:33 within `getindex`
      %"[14]" = extractelement <16 x i8> %w.i, i64 13
      %"[15]" = extractelement <16 x i8> %w.i, i64 14
      %"[16]" = extractelement <16 x i8> %w.i, i64 15
; ││└└
    store i8 %"start::Int8", ptr %sret_return, align 1
    %"new::Tuple.sroa.2.0.sret_return.sroa_idx" = getelementptr inbounds i8, ptr %sret_return, i64 1
    %0 = shufflevector <16 x i8> %w.i, <16 x i8> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
    store <8 x i8> %0, ptr %"new::Tuple.sroa.2.0.sret_return.sroa_idx", align 1
    %"new::Tuple.sroa.10.0.sret_return.sroa_idx" = getelementptr inbounds i8, ptr %sret_return, i64 9
    %1 = shufflevector <16 x i8> %w.i, <16 x i8> poison, <4 x i32> <i32 9, i32 10, i32 11, i32 12>
    store <4 x i8> %1, ptr %"new::Tuple.sroa.10.0.sret_return.sroa_idx", align 1
    %"new::Tuple.sroa.14.0.sret_return.sroa_idx" = getelementptr inbounds i8, ptr %sret_return, i64 13
    store i8 %"[14]", ptr %"new::Tuple.sroa.14.0.sret_return.sroa_idx", align 1
    %"new::Tuple.sroa.15.0.sret_return.sroa_idx" = getelementptr inbounds i8, ptr %sret_return, i64 14
    store i8 %"[15]", ptr %"new::Tuple.sroa.15.0.sret_return.sroa_idx", align 1
    %"new::Tuple.sroa.16.0.sret_return.sroa_idx" = getelementptr inbounds i8, ptr %sret_return, i64 15
    store i8 %"[16]", ptr %"new::Tuple.sroa.16.0.sret_return.sroa_idx", align 1
    ret void
; └└
}

Is this again an LLVM bug? And is there a way to force better vectorization from the Julia side?

This is another instance of the InstCombine pass hindering SLP-Vectorization.

Comparison of both versions' LLVM IR before a certain InstCombine pass
--- tuple_range/2-argument
+++ tuple_range/1-argument
- define swiftcc void @julia_tuple_range_40(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %0, ptr nonnull swiftself %1, i8 signext %2, i8 signext %3) #0 !dbg !8 {
-   %5 = call ptr @julia.get_pgcstack()
-   %6 = getelementptr inbounds i8, ptr %5, i32 -152
-   %7 = getelementptr inbounds i8, ptr %6, i32 168
-   %8 = load ptr, ptr %7, align 8, !tbaa !11
-   %9 = getelementptr inbounds i8, ptr %8, i32 16
-   %10 = load ptr, ptr %9, align 8, !tbaa !15, !invariant.load !0
+ define swiftcc void @julia_tuple_range_40(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %0, ptr nonnull swiftself %1, i8 signext %2) #0 !dbg !8 {
+   %4 = call ptr @julia.get_pgcstack()
+   %5 = getelementptr inbounds i8, ptr %4, i32 -152
+   %6 = getelementptr inbounds i8, ptr %5, i32 168
+   %7 = load ptr, ptr %6, align 8, !tbaa !11
+   %8 = getelementptr inbounds i8, ptr %7, i32 16
+   %9 = load ptr, ptr %8, align 8, !tbaa !15, !invariant.load !0
    fence syncscope("singlethread") seq_cst
-   call void @julia.safepoint(ptr %10), !dbg !17
+   call void @julia.safepoint(ptr %9), !dbg !17
    fence syncscope("singlethread") seq_cst
-   %11 = insertelement <16 x i8> poison, i8 %3, i8 0, !dbg !18
-   %12 = shufflevector <16 x i8> %11, <16 x i8> poison, <16 x i32> zeroinitializer, !dbg !18
-   %13 = mul <16 x i8> %12, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, !dbg !18
-   %14 = insertelement <16 x i8> poison, i8 %2, i8 0, !dbg !18
-   %15 = shufflevector <16 x i8> %14, <16 x i8> poison, <16 x i32> zeroinitializer, !dbg !18
-   %16 = add <16 x i8> %13, %15, !dbg !18
-   %17 = extractelement <16 x i8> %16, i32 0, !dbg !19
-   %18 = extractelement <16 x i8> %16, i32 1, !dbg !19
-   %19 = extractelement <16 x i8> %16, i32 2, !dbg !19
-   %20 = extractelement <16 x i8> %16, i32 3, !dbg !19
-   %21 = extractelement <16 x i8> %16, i32 4, !dbg !19
-   %22 = extractelement <16 x i8> %16, i32 5, !dbg !19
-   %23 = extractelement <16 x i8> %16, i32 6, !dbg !19
-   %24 = extractelement <16 x i8> %16, i32 7, !dbg !19
-   %25 = extractelement <16 x i8> %16, i32 8, !dbg !19
-   %26 = extractelement <16 x i8> %16, i32 9, !dbg !19
-   %27 = extractelement <16 x i8> %16, i32 10, !dbg !19
-   %28 = extractelement <16 x i8> %16, i32 11, !dbg !19
-   %29 = extractelement <16 x i8> %16, i32 12, !dbg !19
-   %30 = extractelement <16 x i8> %16, i32 13, !dbg !19
-   %31 = extractelement <16 x i8> %16, i32 14, !dbg !19
-   %32 = extractelement <16 x i8> %16, i32 15, !dbg !19
-   store i8 %17, ptr %0, align 1, !dbg !24
-   %33 = getelementptr inbounds i8, ptr %0, i64 1, !dbg !24
-   store i8 %18, ptr %33, align 1, !dbg !24
-   %34 = getelementptr inbounds i8, ptr %0, i64 2, !dbg !24
-   store i8 %19, ptr %34, align 1, !dbg !24
-   %35 = getelementptr inbounds i8, ptr %0, i64 3, !dbg !24
-   store i8 %20, ptr %35, align 1, !dbg !24
-   %36 = getelementptr inbounds i8, ptr %0, i64 4, !dbg !24
-   store i8 %21, ptr %36, align 1, !dbg !24
-   %37 = getelementptr inbounds i8, ptr %0, i64 5, !dbg !24
-   store i8 %22, ptr %37, align 1, !dbg !24
-   %38 = getelementptr inbounds i8, ptr %0, i64 6, !dbg !24
-   store i8 %23, ptr %38, align 1, !dbg !24
-   %39 = getelementptr inbounds i8, ptr %0, i64 7, !dbg !24
-   store i8 %24, ptr %39, align 1, !dbg !24
-   %40 = getelementptr inbounds i8, ptr %0, i64 8, !dbg !24
-   store i8 %25, ptr %40, align 1, !dbg !24
-   %41 = getelementptr inbounds i8, ptr %0, i64 9, !dbg !24
-   store i8 %26, ptr %41, align 1, !dbg !24
-   %42 = getelementptr inbounds i8, ptr %0, i64 10, !dbg !24
-   store i8 %27, ptr %42, align 1, !dbg !24
-   %43 = getelementptr inbounds i8, ptr %0, i64 11, !dbg !24
-   store i8 %28, ptr %43, align 1, !dbg !24
-   %44 = getelementptr inbounds i8, ptr %0, i64 12, !dbg !24
-   store i8 %29, ptr %44, align 1, !dbg !24
-   %45 = getelementptr inbounds i8, ptr %0, i64 13, !dbg !24
-   store i8 %30, ptr %45, align 1, !dbg !24
-   %46 = getelementptr inbounds i8, ptr %0, i64 14, !dbg !24
-   store i8 %31, ptr %46, align 1, !dbg !24
-   %47 = getelementptr inbounds i8, ptr %0, i64 15, !dbg !24
-   store i8 %32, ptr %47, align 1, !dbg !24
-   ret void, !dbg !24
+   %10 = insertelement <16 x i8> poison, i8 %2, i8 0, !dbg !18
+   %11 = shufflevector <16 x i8> %10, <16 x i8> poison, <16 x i32> zeroinitializer, !dbg !18
+   %12 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %11, !dbg !18
+   %13 = extractelement <16 x i8> %12, i32 0, !dbg !21
+   %14 = extractelement <16 x i8> %12, i32 1, !dbg !21
+   %15 = extractelement <16 x i8> %12, i32 2, !dbg !21
+   %16 = extractelement <16 x i8> %12, i32 3, !dbg !21
+   %17 = extractelement <16 x i8> %12, i32 4, !dbg !21
+   %18 = extractelement <16 x i8> %12, i32 5, !dbg !21
+   %19 = extractelement <16 x i8> %12, i32 6, !dbg !21
+   %20 = extractelement <16 x i8> %12, i32 7, !dbg !21
+   %21 = extractelement <16 x i8> %12, i32 8, !dbg !21
+   %22 = extractelement <16 x i8> %12, i32 9, !dbg !21
+   %23 = extractelement <16 x i8> %12, i32 10, !dbg !21
+   %24 = extractelement <16 x i8> %12, i32 11, !dbg !21
+   %25 = extractelement <16 x i8> %12, i32 12, !dbg !21
+   %26 = extractelement <16 x i8> %12, i32 13, !dbg !21
+   %27 = extractelement <16 x i8> %12, i32 14, !dbg !21
+   %28 = extractelement <16 x i8> %12, i32 15, !dbg !21
+   store i8 %13, ptr %0, align 1, !dbg !26
+   %29 = getelementptr inbounds i8, ptr %0, i64 1, !dbg !26
+   store i8 %14, ptr %29, align 1, !dbg !26
+   %30 = getelementptr inbounds i8, ptr %0, i64 2, !dbg !26
+   store i8 %15, ptr %30, align 1, !dbg !26
+   %31 = getelementptr inbounds i8, ptr %0, i64 3, !dbg !26
+   store i8 %16, ptr %31, align 1, !dbg !26
+   %32 = getelementptr inbounds i8, ptr %0, i64 4, !dbg !26
+   store i8 %17, ptr %32, align 1, !dbg !26
+   %33 = getelementptr inbounds i8, ptr %0, i64 5, !dbg !26
+   store i8 %18, ptr %33, align 1, !dbg !26
+   %34 = getelementptr inbounds i8, ptr %0, i64 6, !dbg !26
+   store i8 %19, ptr %34, align 1, !dbg !26
+   %35 = getelementptr inbounds i8, ptr %0, i64 7, !dbg !26
+   store i8 %20, ptr %35, align 1, !dbg !26
+   %36 = getelementptr inbounds i8, ptr %0, i64 8, !dbg !26
+   store i8 %21, ptr %36, align 1, !dbg !26
+   %37 = getelementptr inbounds i8, ptr %0, i64 9, !dbg !26
+   store i8 %22, ptr %37, align 1, !dbg !26
+   %38 = getelementptr inbounds i8, ptr %0, i64 10, !dbg !26
+   store i8 %23, ptr %38, align 1, !dbg !26
+   %39 = getelementptr inbounds i8, ptr %0, i64 11, !dbg !26
+   store i8 %24, ptr %39, align 1, !dbg !26
+   %40 = getelementptr inbounds i8, ptr %0, i64 12, !dbg !26
+   store i8 %25, ptr %40, align 1, !dbg !26
+   %41 = getelementptr inbounds i8, ptr %0, i64 13, !dbg !26
+   store i8 %26, ptr %41, align 1, !dbg !26
+   %42 = getelementptr inbounds i8, ptr %0, i64 14, !dbg !26
+   store i8 %27, ptr %42, align 1, !dbg !26
+   %43 = getelementptr inbounds i8, ptr %0, i64 15, !dbg !26
+   store i8 %28, ptr %43, align 1, !dbg !26
+   ret void, !dbg !26
  }
Comparison of both versions' LLVM IR after a certain InstCombine pass
--- tuple_range/2-argument
+++ tuple_range/1-argument
- define swiftcc void @julia_tuple_range_40(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %0, ptr nonnull swiftself %1, i8 signext %2, i8 signext %3) #0 !dbg !8 {
-   %5 = call ptr @julia.get_pgcstack()
-   %6 = getelementptr inbounds i8, ptr %5, i64 16
-   %7 = load ptr, ptr %6, align 8, !tbaa !11
-   %8 = getelementptr inbounds i8, ptr %7, i64 16
-   %9 = load ptr, ptr %8, align 8, !tbaa !15, !invariant.load !0
+ define swiftcc void @julia_tuple_range_40(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %0, ptr nonnull swiftself %1, i8 signext %2) #0 !dbg !8 {
+   %4 = call ptr @julia.get_pgcstack()
+   %5 = getelementptr inbounds i8, ptr %4, i64 16
+   %6 = load ptr, ptr %5, align 8, !tbaa !11
+   %7 = getelementptr inbounds i8, ptr %6, i64 16
+   %8 = load ptr, ptr %7, align 8, !tbaa !15, !invariant.load !0
    fence syncscope("singlethread") seq_cst
-   call void @julia.safepoint(ptr %9), !dbg !17
+   call void @julia.safepoint(ptr %8), !dbg !17
    fence syncscope("singlethread") seq_cst
-   %10 = insertelement <16 x i8> poison, i8 %3, i64 0, !dbg !18
-   %11 = shufflevector <16 x i8> %10, <16 x i8> poison, <16 x i32> zeroinitializer, !dbg !18
-   %12 = mul <16 x i8> %11, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, !dbg !18
-   %13 = insertelement <16 x i8> poison, i8 %2, i64 0, !dbg !18
-   %14 = shufflevector <16 x i8> %13, <16 x i8> poison, <16 x i32> zeroinitializer, !dbg !18
-   %15 = add <16 x i8> %12, %14, !dbg !18
-   %16 = extractelement <16 x i8> %15, i64 0, !dbg !19
-   %17 = extractelement <16 x i8> %15, i64 1, !dbg !19
-   %18 = extractelement <16 x i8> %15, i64 2, !dbg !19
-   %19 = extractelement <16 x i8> %15, i64 3, !dbg !19
-   %20 = extractelement <16 x i8> %15, i64 4, !dbg !19
-   %21 = extractelement <16 x i8> %15, i64 5, !dbg !19
-   %22 = extractelement <16 x i8> %15, i64 6, !dbg !19
-   %23 = extractelement <16 x i8> %15, i64 7, !dbg !19
-   %24 = extractelement <16 x i8> %15, i64 8, !dbg !19
-   %25 = extractelement <16 x i8> %15, i64 9, !dbg !19
-   %26 = extractelement <16 x i8> %15, i64 10, !dbg !19
-   %27 = extractelement <16 x i8> %15, i64 11, !dbg !19
-   %28 = extractelement <16 x i8> %15, i64 12, !dbg !19
-   %29 = extractelement <16 x i8> %15, i64 13, !dbg !19
-   %30 = extractelement <16 x i8> %15, i64 14, !dbg !19
-   %31 = extractelement <16 x i8> %15, i64 15, !dbg !19
-   store i8 %16, ptr %0, align 1, !dbg !24
-   %32 = getelementptr inbounds i8, ptr %0, i64 1, !dbg !24
-   store i8 %17, ptr %32, align 1, !dbg !24
-   %33 = getelementptr inbounds i8, ptr %0, i64 2, !dbg !24
-   store i8 %18, ptr %33, align 1, !dbg !24
-   %34 = getelementptr inbounds i8, ptr %0, i64 3, !dbg !24
-   store i8 %19, ptr %34, align 1, !dbg !24
-   %35 = getelementptr inbounds i8, ptr %0, i64 4, !dbg !24
-   store i8 %20, ptr %35, align 1, !dbg !24
-   %36 = getelementptr inbounds i8, ptr %0, i64 5, !dbg !24
-   store i8 %21, ptr %36, align 1, !dbg !24
-   %37 = getelementptr inbounds i8, ptr %0, i64 6, !dbg !24
-   store i8 %22, ptr %37, align 1, !dbg !24
-   %38 = getelementptr inbounds i8, ptr %0, i64 7, !dbg !24
-   store i8 %23, ptr %38, align 1, !dbg !24
-   %39 = getelementptr inbounds i8, ptr %0, i64 8, !dbg !24
-   store i8 %24, ptr %39, align 1, !dbg !24
-   %40 = getelementptr inbounds i8, ptr %0, i64 9, !dbg !24
-   store i8 %25, ptr %40, align 1, !dbg !24
-   %41 = getelementptr inbounds i8, ptr %0, i64 10, !dbg !24
-   store i8 %26, ptr %41, align 1, !dbg !24
-   %42 = getelementptr inbounds i8, ptr %0, i64 11, !dbg !24
-   store i8 %27, ptr %42, align 1, !dbg !24
-   %43 = getelementptr inbounds i8, ptr %0, i64 12, !dbg !24
-   store i8 %28, ptr %43, align 1, !dbg !24
-   %44 = getelementptr inbounds i8, ptr %0, i64 13, !dbg !24
-   store i8 %29, ptr %44, align 1, !dbg !24
-   %45 = getelementptr inbounds i8, ptr %0, i64 14, !dbg !24
-   store i8 %30, ptr %45, align 1, !dbg !24
-   %46 = getelementptr inbounds i8, ptr %0, i64 15, !dbg !24
-   store i8 %31, ptr %46, align 1, !dbg !24
-   ret void, !dbg !24
+   %9 = insertelement <16 x i8> poison, i8 %2, i64 0, !dbg !18
+   %10 = shufflevector <16 x i8> %9, <16 x i8> poison, <16 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, !dbg !18
+   %11 = add <16 x i8> %10, <i8 poison, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, !dbg !18
+   %12 = extractelement <16 x i8> %11, i64 1, !dbg !21
+   %13 = extractelement <16 x i8> %11, i64 2, !dbg !21
+   %14 = extractelement <16 x i8> %11, i64 3, !dbg !21
+   %15 = extractelement <16 x i8> %11, i64 4, !dbg !21
+   %16 = extractelement <16 x i8> %11, i64 5, !dbg !21
+   %17 = extractelement <16 x i8> %11, i64 6, !dbg !21
+   %18 = extractelement <16 x i8> %11, i64 7, !dbg !21
+   %19 = extractelement <16 x i8> %11, i64 8, !dbg !21
+   %20 = extractelement <16 x i8> %11, i64 9, !dbg !21
+   %21 = extractelement <16 x i8> %11, i64 10, !dbg !21
+   %22 = extractelement <16 x i8> %11, i64 11, !dbg !21
+   %23 = extractelement <16 x i8> %11, i64 12, !dbg !21
+   %24 = extractelement <16 x i8> %11, i64 13, !dbg !21
+   %25 = extractelement <16 x i8> %11, i64 14, !dbg !21
+   %26 = extractelement <16 x i8> %11, i64 15, !dbg !21
+   store i8 %2, ptr %0, align 1, !dbg !26
+   %27 = getelementptr inbounds i8, ptr %0, i64 1, !dbg !26
+   store i8 %12, ptr %27, align 1, !dbg !26
+   %28 = getelementptr inbounds i8, ptr %0, i64 2, !dbg !26
+   store i8 %13, ptr %28, align 1, !dbg !26
+   %29 = getelementptr inbounds i8, ptr %0, i64 3, !dbg !26
+   store i8 %14, ptr %29, align 1, !dbg !26
+   %30 = getelementptr inbounds i8, ptr %0, i64 4, !dbg !26
+   store i8 %15, ptr %30, align 1, !dbg !26
+   %31 = getelementptr inbounds i8, ptr %0, i64 5, !dbg !26
+   store i8 %16, ptr %31, align 1, !dbg !26
+   %32 = getelementptr inbounds i8, ptr %0, i64 6, !dbg !26
+   store i8 %17, ptr %32, align 1, !dbg !26
+   %33 = getelementptr inbounds i8, ptr %0, i64 7, !dbg !26
+   store i8 %18, ptr %33, align 1, !dbg !26
+   %34 = getelementptr inbounds i8, ptr %0, i64 8, !dbg !26
+   store i8 %19, ptr %34, align 1, !dbg !26
+   %35 = getelementptr inbounds i8, ptr %0, i64 9, !dbg !26
+   store i8 %20, ptr %35, align 1, !dbg !26
+   %36 = getelementptr inbounds i8, ptr %0, i64 10, !dbg !26
+   store i8 %21, ptr %36, align 1, !dbg !26
+   %37 = getelementptr inbounds i8, ptr %0, i64 11, !dbg !26
+   store i8 %22, ptr %37, align 1, !dbg !26
+   %38 = getelementptr inbounds i8, ptr %0, i64 12, !dbg !26
+   store i8 %23, ptr %38, align 1, !dbg !26
+   %39 = getelementptr inbounds i8, ptr %0, i64 13, !dbg !26
+   store i8 %24, ptr %39, align 1, !dbg !26
+   %40 = getelementptr inbounds i8, ptr %0, i64 14, !dbg !26
+   store i8 %25, ptr %40, align 1, !dbg !26
+   %41 = getelementptr inbounds i8, ptr %0, i64 15, !dbg !26
+   store i8 %26, ptr %41, align 1, !dbg !26
+   ret void, !dbg !26
  }

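Notice how the extraction of element 0 (extractelement <16 x i8> %12, i32 0 in the "before" listing) got removed in the 1-argument version, since that element equals %start (%2 in the listings; lane 0 is just start + 0). InstCombine therefore replaces store i8 %13, ptr %0, align 1 with store i8 %2, ptr %0, align 1, and lane 0 of the shufflevector mask and of the add constant becomes poison because nothing reads it anymore. The SLP-Vectorizer cannot handle this properly since not all elements to be stored come from the same vector. Godbolt: Compiler Explorer. I opened an issue on LLVM.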