This is another instance of the InstCombine pass hindering SLP-Vectorization.
Comparison of both versions' LLVM IR before a certain InstCombine pass
--- tuple_range/2-argument
+++ tuple_range/1-argument
- define swiftcc void @julia_tuple_range_40(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %0, ptr nonnull swiftself %1, i8 signext %2, i8 signext %3) #0 !dbg !8 {
- %5 = call ptr @julia.get_pgcstack()
- %6 = getelementptr inbounds i8, ptr %5, i32 -152
- %7 = getelementptr inbounds i8, ptr %6, i32 168
- %8 = load ptr, ptr %7, align 8, !tbaa !11
- %9 = getelementptr inbounds i8, ptr %8, i32 16
- %10 = load ptr, ptr %9, align 8, !tbaa !15, !invariant.load !0
+ define swiftcc void @julia_tuple_range_40(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %0, ptr nonnull swiftself %1, i8 signext %2) #0 !dbg !8 {
+ %4 = call ptr @julia.get_pgcstack()
+ %5 = getelementptr inbounds i8, ptr %4, i32 -152
+ %6 = getelementptr inbounds i8, ptr %5, i32 168
+ %7 = load ptr, ptr %6, align 8, !tbaa !11
+ %8 = getelementptr inbounds i8, ptr %7, i32 16
+ %9 = load ptr, ptr %8, align 8, !tbaa !15, !invariant.load !0
fence syncscope("singlethread") seq_cst
- call void @julia.safepoint(ptr %10), !dbg !17
+ call void @julia.safepoint(ptr %9), !dbg !17
fence syncscope("singlethread") seq_cst
- %11 = insertelement <16 x i8> poison, i8 %3, i8 0, !dbg !18
- %12 = shufflevector <16 x i8> %11, <16 x i8> poison, <16 x i32> zeroinitializer, !dbg !18
- %13 = mul <16 x i8> %12, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, !dbg !18
- %14 = insertelement <16 x i8> poison, i8 %2, i8 0, !dbg !18
- %15 = shufflevector <16 x i8> %14, <16 x i8> poison, <16 x i32> zeroinitializer, !dbg !18
- %16 = add <16 x i8> %13, %15, !dbg !18
- %17 = extractelement <16 x i8> %16, i32 0, !dbg !19
- %18 = extractelement <16 x i8> %16, i32 1, !dbg !19
- %19 = extractelement <16 x i8> %16, i32 2, !dbg !19
- %20 = extractelement <16 x i8> %16, i32 3, !dbg !19
- %21 = extractelement <16 x i8> %16, i32 4, !dbg !19
- %22 = extractelement <16 x i8> %16, i32 5, !dbg !19
- %23 = extractelement <16 x i8> %16, i32 6, !dbg !19
- %24 = extractelement <16 x i8> %16, i32 7, !dbg !19
- %25 = extractelement <16 x i8> %16, i32 8, !dbg !19
- %26 = extractelement <16 x i8> %16, i32 9, !dbg !19
- %27 = extractelement <16 x i8> %16, i32 10, !dbg !19
- %28 = extractelement <16 x i8> %16, i32 11, !dbg !19
- %29 = extractelement <16 x i8> %16, i32 12, !dbg !19
- %30 = extractelement <16 x i8> %16, i32 13, !dbg !19
- %31 = extractelement <16 x i8> %16, i32 14, !dbg !19
- %32 = extractelement <16 x i8> %16, i32 15, !dbg !19
- store i8 %17, ptr %0, align 1, !dbg !24
- %33 = getelementptr inbounds i8, ptr %0, i64 1, !dbg !24
- store i8 %18, ptr %33, align 1, !dbg !24
- %34 = getelementptr inbounds i8, ptr %0, i64 2, !dbg !24
- store i8 %19, ptr %34, align 1, !dbg !24
- %35 = getelementptr inbounds i8, ptr %0, i64 3, !dbg !24
- store i8 %20, ptr %35, align 1, !dbg !24
- %36 = getelementptr inbounds i8, ptr %0, i64 4, !dbg !24
- store i8 %21, ptr %36, align 1, !dbg !24
- %37 = getelementptr inbounds i8, ptr %0, i64 5, !dbg !24
- store i8 %22, ptr %37, align 1, !dbg !24
- %38 = getelementptr inbounds i8, ptr %0, i64 6, !dbg !24
- store i8 %23, ptr %38, align 1, !dbg !24
- %39 = getelementptr inbounds i8, ptr %0, i64 7, !dbg !24
- store i8 %24, ptr %39, align 1, !dbg !24
- %40 = getelementptr inbounds i8, ptr %0, i64 8, !dbg !24
- store i8 %25, ptr %40, align 1, !dbg !24
- %41 = getelementptr inbounds i8, ptr %0, i64 9, !dbg !24
- store i8 %26, ptr %41, align 1, !dbg !24
- %42 = getelementptr inbounds i8, ptr %0, i64 10, !dbg !24
- store i8 %27, ptr %42, align 1, !dbg !24
- %43 = getelementptr inbounds i8, ptr %0, i64 11, !dbg !24
- store i8 %28, ptr %43, align 1, !dbg !24
- %44 = getelementptr inbounds i8, ptr %0, i64 12, !dbg !24
- store i8 %29, ptr %44, align 1, !dbg !24
- %45 = getelementptr inbounds i8, ptr %0, i64 13, !dbg !24
- store i8 %30, ptr %45, align 1, !dbg !24
- %46 = getelementptr inbounds i8, ptr %0, i64 14, !dbg !24
- store i8 %31, ptr %46, align 1, !dbg !24
- %47 = getelementptr inbounds i8, ptr %0, i64 15, !dbg !24
- store i8 %32, ptr %47, align 1, !dbg !24
- ret void, !dbg !24
+ %10 = insertelement <16 x i8> poison, i8 %2, i8 0, !dbg !18
+ %11 = shufflevector <16 x i8> %10, <16 x i8> poison, <16 x i32> zeroinitializer, !dbg !18
+ %12 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %11, !dbg !18
+ %13 = extractelement <16 x i8> %12, i32 0, !dbg !21
+ %14 = extractelement <16 x i8> %12, i32 1, !dbg !21
+ %15 = extractelement <16 x i8> %12, i32 2, !dbg !21
+ %16 = extractelement <16 x i8> %12, i32 3, !dbg !21
+ %17 = extractelement <16 x i8> %12, i32 4, !dbg !21
+ %18 = extractelement <16 x i8> %12, i32 5, !dbg !21
+ %19 = extractelement <16 x i8> %12, i32 6, !dbg !21
+ %20 = extractelement <16 x i8> %12, i32 7, !dbg !21
+ %21 = extractelement <16 x i8> %12, i32 8, !dbg !21
+ %22 = extractelement <16 x i8> %12, i32 9, !dbg !21
+ %23 = extractelement <16 x i8> %12, i32 10, !dbg !21
+ %24 = extractelement <16 x i8> %12, i32 11, !dbg !21
+ %25 = extractelement <16 x i8> %12, i32 12, !dbg !21
+ %26 = extractelement <16 x i8> %12, i32 13, !dbg !21
+ %27 = extractelement <16 x i8> %12, i32 14, !dbg !21
+ %28 = extractelement <16 x i8> %12, i32 15, !dbg !21
+ store i8 %13, ptr %0, align 1, !dbg !26
+ %29 = getelementptr inbounds i8, ptr %0, i64 1, !dbg !26
+ store i8 %14, ptr %29, align 1, !dbg !26
+ %30 = getelementptr inbounds i8, ptr %0, i64 2, !dbg !26
+ store i8 %15, ptr %30, align 1, !dbg !26
+ %31 = getelementptr inbounds i8, ptr %0, i64 3, !dbg !26
+ store i8 %16, ptr %31, align 1, !dbg !26
+ %32 = getelementptr inbounds i8, ptr %0, i64 4, !dbg !26
+ store i8 %17, ptr %32, align 1, !dbg !26
+ %33 = getelementptr inbounds i8, ptr %0, i64 5, !dbg !26
+ store i8 %18, ptr %33, align 1, !dbg !26
+ %34 = getelementptr inbounds i8, ptr %0, i64 6, !dbg !26
+ store i8 %19, ptr %34, align 1, !dbg !26
+ %35 = getelementptr inbounds i8, ptr %0, i64 7, !dbg !26
+ store i8 %20, ptr %35, align 1, !dbg !26
+ %36 = getelementptr inbounds i8, ptr %0, i64 8, !dbg !26
+ store i8 %21, ptr %36, align 1, !dbg !26
+ %37 = getelementptr inbounds i8, ptr %0, i64 9, !dbg !26
+ store i8 %22, ptr %37, align 1, !dbg !26
+ %38 = getelementptr inbounds i8, ptr %0, i64 10, !dbg !26
+ store i8 %23, ptr %38, align 1, !dbg !26
+ %39 = getelementptr inbounds i8, ptr %0, i64 11, !dbg !26
+ store i8 %24, ptr %39, align 1, !dbg !26
+ %40 = getelementptr inbounds i8, ptr %0, i64 12, !dbg !26
+ store i8 %25, ptr %40, align 1, !dbg !26
+ %41 = getelementptr inbounds i8, ptr %0, i64 13, !dbg !26
+ store i8 %26, ptr %41, align 1, !dbg !26
+ %42 = getelementptr inbounds i8, ptr %0, i64 14, !dbg !26
+ store i8 %27, ptr %42, align 1, !dbg !26
+ %43 = getelementptr inbounds i8, ptr %0, i64 15, !dbg !26
+ store i8 %28, ptr %43, align 1, !dbg !26
+ ret void, !dbg !26
}
Comparison of both versions' LLVM IR after a certain InstCombine pass
--- tuple_range/2-argument
+++ tuple_range/1-argument
- define swiftcc void @julia_tuple_range_40(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %0, ptr nonnull swiftself %1, i8 signext %2, i8 signext %3) #0 !dbg !8 {
- %5 = call ptr @julia.get_pgcstack()
- %6 = getelementptr inbounds i8, ptr %5, i64 16
- %7 = load ptr, ptr %6, align 8, !tbaa !11
- %8 = getelementptr inbounds i8, ptr %7, i64 16
- %9 = load ptr, ptr %8, align 8, !tbaa !15, !invariant.load !0
+ define swiftcc void @julia_tuple_range_40(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %0, ptr nonnull swiftself %1, i8 signext %2) #0 !dbg !8 {
+ %4 = call ptr @julia.get_pgcstack()
+ %5 = getelementptr inbounds i8, ptr %4, i64 16
+ %6 = load ptr, ptr %5, align 8, !tbaa !11
+ %7 = getelementptr inbounds i8, ptr %6, i64 16
+ %8 = load ptr, ptr %7, align 8, !tbaa !15, !invariant.load !0
fence syncscope("singlethread") seq_cst
- call void @julia.safepoint(ptr %9), !dbg !17
+ call void @julia.safepoint(ptr %8), !dbg !17
fence syncscope("singlethread") seq_cst
- %10 = insertelement <16 x i8> poison, i8 %3, i64 0, !dbg !18
- %11 = shufflevector <16 x i8> %10, <16 x i8> poison, <16 x i32> zeroinitializer, !dbg !18
- %12 = mul <16 x i8> %11, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, !dbg !18
- %13 = insertelement <16 x i8> poison, i8 %2, i64 0, !dbg !18
- %14 = shufflevector <16 x i8> %13, <16 x i8> poison, <16 x i32> zeroinitializer, !dbg !18
- %15 = add <16 x i8> %12, %14, !dbg !18
- %16 = extractelement <16 x i8> %15, i64 0, !dbg !19
- %17 = extractelement <16 x i8> %15, i64 1, !dbg !19
- %18 = extractelement <16 x i8> %15, i64 2, !dbg !19
- %19 = extractelement <16 x i8> %15, i64 3, !dbg !19
- %20 = extractelement <16 x i8> %15, i64 4, !dbg !19
- %21 = extractelement <16 x i8> %15, i64 5, !dbg !19
- %22 = extractelement <16 x i8> %15, i64 6, !dbg !19
- %23 = extractelement <16 x i8> %15, i64 7, !dbg !19
- %24 = extractelement <16 x i8> %15, i64 8, !dbg !19
- %25 = extractelement <16 x i8> %15, i64 9, !dbg !19
- %26 = extractelement <16 x i8> %15, i64 10, !dbg !19
- %27 = extractelement <16 x i8> %15, i64 11, !dbg !19
- %28 = extractelement <16 x i8> %15, i64 12, !dbg !19
- %29 = extractelement <16 x i8> %15, i64 13, !dbg !19
- %30 = extractelement <16 x i8> %15, i64 14, !dbg !19
- %31 = extractelement <16 x i8> %15, i64 15, !dbg !19
- store i8 %16, ptr %0, align 1, !dbg !24
- %32 = getelementptr inbounds i8, ptr %0, i64 1, !dbg !24
- store i8 %17, ptr %32, align 1, !dbg !24
- %33 = getelementptr inbounds i8, ptr %0, i64 2, !dbg !24
- store i8 %18, ptr %33, align 1, !dbg !24
- %34 = getelementptr inbounds i8, ptr %0, i64 3, !dbg !24
- store i8 %19, ptr %34, align 1, !dbg !24
- %35 = getelementptr inbounds i8, ptr %0, i64 4, !dbg !24
- store i8 %20, ptr %35, align 1, !dbg !24
- %36 = getelementptr inbounds i8, ptr %0, i64 5, !dbg !24
- store i8 %21, ptr %36, align 1, !dbg !24
- %37 = getelementptr inbounds i8, ptr %0, i64 6, !dbg !24
- store i8 %22, ptr %37, align 1, !dbg !24
- %38 = getelementptr inbounds i8, ptr %0, i64 7, !dbg !24
- store i8 %23, ptr %38, align 1, !dbg !24
- %39 = getelementptr inbounds i8, ptr %0, i64 8, !dbg !24
- store i8 %24, ptr %39, align 1, !dbg !24
- %40 = getelementptr inbounds i8, ptr %0, i64 9, !dbg !24
- store i8 %25, ptr %40, align 1, !dbg !24
- %41 = getelementptr inbounds i8, ptr %0, i64 10, !dbg !24
- store i8 %26, ptr %41, align 1, !dbg !24
- %42 = getelementptr inbounds i8, ptr %0, i64 11, !dbg !24
- store i8 %27, ptr %42, align 1, !dbg !24
- %43 = getelementptr inbounds i8, ptr %0, i64 12, !dbg !24
- store i8 %28, ptr %43, align 1, !dbg !24
- %44 = getelementptr inbounds i8, ptr %0, i64 13, !dbg !24
- store i8 %29, ptr %44, align 1, !dbg !24
- %45 = getelementptr inbounds i8, ptr %0, i64 14, !dbg !24
- store i8 %30, ptr %45, align 1, !dbg !24
- %46 = getelementptr inbounds i8, ptr %0, i64 15, !dbg !24
- store i8 %31, ptr %46, align 1, !dbg !24
- ret void, !dbg !24
+ %9 = insertelement <16 x i8> poison, i8 %2, i64 0, !dbg !18
+ %10 = shufflevector <16 x i8> %9, <16 x i8> poison, <16 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, !dbg !18
+ %11 = add <16 x i8> %10, <i8 poison, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, !dbg !18
+ %12 = extractelement <16 x i8> %11, i64 1, !dbg !21
+ %13 = extractelement <16 x i8> %11, i64 2, !dbg !21
+ %14 = extractelement <16 x i8> %11, i64 3, !dbg !21
+ %15 = extractelement <16 x i8> %11, i64 4, !dbg !21
+ %16 = extractelement <16 x i8> %11, i64 5, !dbg !21
+ %17 = extractelement <16 x i8> %11, i64 6, !dbg !21
+ %18 = extractelement <16 x i8> %11, i64 7, !dbg !21
+ %19 = extractelement <16 x i8> %11, i64 8, !dbg !21
+ %20 = extractelement <16 x i8> %11, i64 9, !dbg !21
+ %21 = extractelement <16 x i8> %11, i64 10, !dbg !21
+ %22 = extractelement <16 x i8> %11, i64 11, !dbg !21
+ %23 = extractelement <16 x i8> %11, i64 12, !dbg !21
+ %24 = extractelement <16 x i8> %11, i64 13, !dbg !21
+ %25 = extractelement <16 x i8> %11, i64 14, !dbg !21
+ %26 = extractelement <16 x i8> %11, i64 15, !dbg !21
+ store i8 %2, ptr %0, align 1, !dbg !26
+ %27 = getelementptr inbounds i8, ptr %0, i64 1, !dbg !26
+ store i8 %12, ptr %27, align 1, !dbg !26
+ %28 = getelementptr inbounds i8, ptr %0, i64 2, !dbg !26
+ store i8 %13, ptr %28, align 1, !dbg !26
+ %29 = getelementptr inbounds i8, ptr %0, i64 3, !dbg !26
+ store i8 %14, ptr %29, align 1, !dbg !26
+ %30 = getelementptr inbounds i8, ptr %0, i64 4, !dbg !26
+ store i8 %15, ptr %30, align 1, !dbg !26
+ %31 = getelementptr inbounds i8, ptr %0, i64 5, !dbg !26
+ store i8 %16, ptr %31, align 1, !dbg !26
+ %32 = getelementptr inbounds i8, ptr %0, i64 6, !dbg !26
+ store i8 %17, ptr %32, align 1, !dbg !26
+ %33 = getelementptr inbounds i8, ptr %0, i64 7, !dbg !26
+ store i8 %18, ptr %33, align 1, !dbg !26
+ %34 = getelementptr inbounds i8, ptr %0, i64 8, !dbg !26
+ store i8 %19, ptr %34, align 1, !dbg !26
+ %35 = getelementptr inbounds i8, ptr %0, i64 9, !dbg !26
+ store i8 %20, ptr %35, align 1, !dbg !26
+ %36 = getelementptr inbounds i8, ptr %0, i64 10, !dbg !26
+ store i8 %21, ptr %36, align 1, !dbg !26
+ %37 = getelementptr inbounds i8, ptr %0, i64 11, !dbg !26
+ store i8 %22, ptr %37, align 1, !dbg !26
+ %38 = getelementptr inbounds i8, ptr %0, i64 12, !dbg !26
+ store i8 %23, ptr %38, align 1, !dbg !26
+ %39 = getelementptr inbounds i8, ptr %0, i64 13, !dbg !26
+ store i8 %24, ptr %39, align 1, !dbg !26
+ %40 = getelementptr inbounds i8, ptr %0, i64 14, !dbg !26
+ store i8 %25, ptr %40, align 1, !dbg !26
+ %41 = getelementptr inbounds i8, ptr %0, i64 15, !dbg !26
+ store i8 %26, ptr %41, align 1, !dbg !26
+ ret void, !dbg !26
}
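Notice how `extractelement <16 x i8> %v, i32 0` in the 1-argument version was removed by InstCombine: lane 0 of the vector is `%start + 0`, which equals `%start` (`%2` in the listing above). InstCombine therefore replaces `store i8 %"v[0]", ptr %0, align 1` with `store i8 %start, ptr %0, align 1`, and marks lane 0 of both the shuffle mask and the constant addend as `poison`, because that lane is no longer used. In the 2-argument version the lane-0 `extractelement` is kept, so all 16 stored bytes still come from the same vector. The SLP-Vectorizer cannot handle the 1-argument form properly, since not all of the elements to be stored come from the same vector. A reproduction is available on Godbolt (Compiler Explorer). I opened an issue on LLVM.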