Fancy LLVM loop transformation pass that deoptimize the code

Recently we’ve seen one strange performance regression report, and the following is a MWE we can get:

using BenchmarkTools

function rand_n_1(a,n)
    r = Vector{UInt}(undef,n)
    @inbounds for i in 1:n
        p = a[1] - a[2]
        a[1] = a[2]
        a[2] = a[3]
        a[3] = p
        r[i] = p
    end
    return r
end

function rand_n_2(a,n)
    r = Vector{UInt}(undef,n)
    @inbounds for i in 1:n
        p = a[1] - a[2]
        a[1] = a[2] * 2
        a[2] = a[3] * 3
        a[3] = p * 4
        r[i] = p
    end
    return r
end

a = UInt[1, 2, 3]

# 43.626 ms (3 allocations: 76.30 MiB)
@btime rand_n_1(a, 10^7);
# 20.388 ms (3 allocations: 76.30 MiB)
@btime rand_n_2(a, 10^7);

The rand_n_2 is a baseline version, and I expect that rand_n_1 is at least as fast as rand_n_2. But surprisingly, it isn’t.

By checking the LLVM IR, the slow version introduces an instead overly smart optimization pass (unroll and vectorize the loop), while the fast version is more or less scalar.

Although we’ve found ways to rewrite the function structure so that Julia/llvm fails to recognize the pattern and thus works around the performance regression, I’d like to know the suggested fix for such issues since both versions look good.

@code_llvm rand_n_1(a, 10^7)
; Function Signature: rand_n_1(Array{UInt64, 1}, Int64)
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:3 within `rand_n_1`
define nonnull ptr @julia_rand_n_1_2870(ptr noundef nonnull align 8 dereferenceable(24) %"a::Array", i64 signext %"n::Int64") #0 {
top:
  %gcframe1 = alloca [3 x ptr], align 16
  call void @llvm.memset.p0.i64(ptr align 16 %gcframe1, i8 0, i64 24, i1 true)
  %thread_ptr = call ptr asm "movq %fs:0, $0", "=r"() #12
  %tls_ppgcstack = getelementptr inbounds i8, ptr %thread_ptr, i64 -8
  %tls_pgcstack = load ptr, ptr %tls_ppgcstack, align 8
  store i64 4, ptr %gcframe1, align 8
  %frame.prev = getelementptr inbounds ptr, ptr %gcframe1, i64 1
  %task.gcstack = load ptr, ptr %tls_pgcstack, align 8
  store ptr %task.gcstack, ptr %frame.prev, align 8
  store ptr %gcframe1, ptr %tls_pgcstack, align 8
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:4 within `rand_n_1`
; β”Œ @ boot.jl:647 within `Array`
; β”‚β”Œ @ boot.jl:588 within `GenericMemory`
    %memorynew_empty = icmp eq i64 %"n::Int64", 0
    br i1 %memorynew_empty, label %retval, label %nonemptymem

L21:                                              ; preds = %L21.preheader.new, %L21
    %value_phi5 = phi i64 [ 1, %L21.preheader.new ], [ %16, %L21 ]
    %niter = phi i64 [ 0, %L21.preheader.new ], [ %niter.next.3, %L21 ]
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:6 within `rand_n_1`
; β”Œ @ essentials.jl:918 within `getindex`
   %0 = load i64, ptr %memoryref_data, align 8
   %1 = load <2 x i64>, ptr %memoryref_data19, align 8
; β””
; β”Œ @ int.jl:86 within `-`
   %2 = extractelement <2 x i64> %1, i64 0
   %3 = sub i64 %0, %2
; β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:7 within `rand_n_1`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store <2 x i64> %1, ptr %memoryref_data, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:9 within `rand_n_1`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %3, ptr %memoryref_data52, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:10 within `rand_n_1`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    %gep = getelementptr i64, ptr %invariant.gep, i64 %value_phi5
    store i64 %3, ptr %gep, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:6 within `rand_n_1`
; β”Œ @ essentials.jl:918 within `getindex`
   %4 = load i64, ptr %memoryref_data, align 8
   %5 = load <2 x i64>, ptr %memoryref_data19, align 8
; β””
; β”Œ @ int.jl:86 within `-`
   %6 = extractelement <2 x i64> %5, i64 0
   %7 = sub i64 %4, %6
; β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:7 within `rand_n_1`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store <2 x i64> %5, ptr %memoryref_data, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:9 within `rand_n_1`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %7, ptr %memoryref_data52, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:10 within `rand_n_1`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    %gep.1 = getelementptr i64, ptr %memory_data, i64 %value_phi5
    store i64 %7, ptr %gep.1, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:6 within `rand_n_1`
; β”Œ @ essentials.jl:918 within `getindex`
   %8 = load i64, ptr %memoryref_data, align 8
   %9 = load <2 x i64>, ptr %memoryref_data19, align 8
; β””
; β”Œ @ int.jl:86 within `-`
   %10 = extractelement <2 x i64> %9, i64 0
   %11 = sub i64 %8, %10
; β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:7 within `rand_n_1`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store <2 x i64> %9, ptr %memoryref_data, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:9 within `rand_n_1`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %11, ptr %memoryref_data52, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:10 within `rand_n_1`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    %gep.2 = getelementptr i64, ptr %gep, i64 2
    store i64 %11, ptr %gep.2, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:6 within `rand_n_1`
; β”Œ @ essentials.jl:918 within `getindex`
   %12 = load i64, ptr %memoryref_data, align 8
   %13 = load <2 x i64>, ptr %memoryref_data19, align 8
; β””
; β”Œ @ int.jl:86 within `-`
   %14 = extractelement <2 x i64> %13, i64 0
   %15 = sub i64 %12, %14
; β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:7 within `rand_n_1`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store <2 x i64> %13, ptr %memoryref_data, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:9 within `rand_n_1`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %15, ptr %memoryref_data52, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:10 within `rand_n_1`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    %gep.3 = getelementptr i64, ptr %gep, i64 3
    store i64 %15, ptr %gep.3, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:11 within `rand_n_1`
; β”Œ @ range.jl:921 within `iterate`
   %16 = add i64 %value_phi5, 4
; β””
  %niter.next.3 = add i64 %niter, 4
  %niter.ncmp.3 = icmp eq i64 %niter.next.3, %unroll_iter
  br i1 %niter.ncmp.3, label %L183.loopexit.unr-lcssa, label %L21

L183.loopexit.unr-lcssa:                          ; preds = %L21.preheader, %L21
  %value_phi5.unr = phi i64 [ 1, %L21.preheader ], [ %16, %L21 ]
  %lcmp.mod.not = icmp eq i64 %xtraiter, 0
  br i1 %lcmp.mod.not, label %L183, label %L21.epil

L21.epil:                                         ; preds = %L21.epil, %L183.loopexit.unr-lcssa
  %value_phi5.epil = phi i64 [ %21, %L21.epil ], [ %value_phi5.unr, %L183.loopexit.unr-lcssa ]
  %epil.iter = phi i64 [ %epil.iter.next, %L21.epil ], [ 0, %L183.loopexit.unr-lcssa ]
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:6 within `rand_n_1`
; β”Œ @ essentials.jl:918 within `getindex`
   %17 = load i64, ptr %memoryref_data, align 8
   %18 = load <2 x i64>, ptr %memoryref_data19, align 8
; β””
; β”Œ @ int.jl:86 within `-`
   %19 = extractelement <2 x i64> %18, i64 0
   %20 = sub i64 %17, %19
; β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:7 within `rand_n_1`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store <2 x i64> %18, ptr %memoryref_data, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:9 within `rand_n_1`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %20, ptr %memoryref_data52, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:10 within `rand_n_1`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    %gep.epil = getelementptr i64, ptr %invariant.gep, i64 %value_phi5.epil
    store i64 %20, ptr %gep.epil, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:11 within `rand_n_1`
; β”Œ @ range.jl:921 within `iterate`
   %21 = add i64 %value_phi5.epil, 1
; β””
  %epil.iter.next = add i64 %epil.iter, 1
  %epil.iter.cmp.not = icmp eq i64 %epil.iter.next, %xtraiter
  br i1 %epil.iter.cmp.not, label %L183, label %L21.epil

L183:                                             ; preds = %retval, %L21.epil, %L183.loopexit.unr-lcssa
  %frame.prev110 = load ptr, ptr %frame.prev, align 8
  store ptr %frame.prev110, ptr %tls_pgcstack, align 8
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:12 within `rand_n_1`
  ret ptr %"new::Array"

nonemptymem:                                      ; preds = %top
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:4 within `rand_n_1`
; β”Œ @ boot.jl:647 within `Array`
; β”‚β”Œ @ boot.jl:588 within `GenericMemory`
    %22 = icmp ult i64 %"n::Int64", 1152921504606846976
    br i1 %22, label %pass, label %fail

fail:                                             ; preds = %nonemptymem
    call void @jl_argument_error(ptr nonnull @"_j_str_invalid GenericMemory siz...#1")
    unreachable

pass:                                             ; preds = %nonemptymem
    %23 = shl nuw nsw i64 %"n::Int64", 3
    %ptls_field = getelementptr inbounds i8, ptr %tls_pgcstack, i64 16
    %ptls_load = load ptr, ptr %ptls_field, align 8
    %"Memory{UInt64}[]" = call noalias nonnull align 16 ptr @jl_alloc_genericmemory_unchecked(ptr %ptls_load, i64 %23, ptr nonnull @"+Core.GenericMemory#2873.jit") #13
    store i64 %"n::Int64", ptr %"Memory{UInt64}[]", align 8
    br label %retval

retval:                                           ; preds = %pass, %top
    %memoryref_mem81 = phi ptr [ %"Memory{UInt64}[]", %pass ], [ @"jl_global#2872.jit", %top ]
; β”‚β””
; β”‚ @ boot.jl:648 within `Array`
; β”‚β”Œ @ boot.jl:593 within `memoryref`
    %memory_data_ptr = getelementptr inbounds { i64, ptr }, ptr %memoryref_mem81, i64 0, i32 1
    %memory_data = load ptr, ptr %memory_data_ptr, align 8
    %gc_slot_addr_0 = getelementptr inbounds ptr, ptr %gcframe1, i64 2
    store ptr %memoryref_mem81, ptr %gc_slot_addr_0, align 8
; β”‚β””
   %ptls_field108 = getelementptr inbounds i8, ptr %tls_pgcstack, i64 16
   %ptls_load109 = load ptr, ptr %ptls_field108, align 8
   %"new::Array" = call noalias nonnull align 8 dereferenceable(32) ptr @ijl_gc_small_alloc(ptr %ptls_load109, i32 408, i32 32, i64 140686563959504) #8
   %"new::Array.tag_addr" = getelementptr inbounds i64, ptr %"new::Array", i64 -1
   store atomic i64 140686563959504, ptr %"new::Array.tag_addr" unordered, align 8
   %24 = getelementptr inbounds i8, ptr %"new::Array", i64 8
   store ptr %memory_data, ptr %"new::Array", align 8
   store ptr %memoryref_mem81, ptr %24, align 8
   %"new::Array.size_ptr" = getelementptr inbounds i8, ptr %"new::Array", i64 16
   store i64 %"n::Int64", ptr %"new::Array.size_ptr", align 8
; β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:5 within `rand_n_1`
  br i1 %memorynew_empty, label %L183, label %L21.preheader

L21.preheader:                                    ; preds = %retval
  %memoryref_data = load ptr, ptr %"a::Array", align 8
  %memoryref_data19 = getelementptr inbounds i64, ptr %memoryref_data, i64 1
  %memoryref_data52 = getelementptr inbounds i64, ptr %memoryref_data, i64 2
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:11 within `rand_n_1`
  %invariant.gep = getelementptr i64, ptr %memory_data, i64 -1
  %xtraiter = and i64 %"n::Int64", 3
  %25 = icmp ult i64 %"n::Int64", 4
  br i1 %25, label %L183.loopexit.unr-lcssa, label %L21.preheader.new

L21.preheader.new:                                ; preds = %L21.preheader
  %unroll_iter = and i64 %"n::Int64", -4
  br label %L21
}
@code_llvm rand_n_2(a, 10^7)
; Function Signature: rand_n_2(Array{UInt64, 1}, Int64)
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:15 within `rand_n_2`
define nonnull ptr @julia_rand_n_2_2903(ptr noundef nonnull align 8 dereferenceable(24) %"a::Array", i64 signext %"n::Int64") #0 {
top:
  %gcframe1 = alloca [3 x ptr], align 16
  call void @llvm.memset.p0.i64(ptr align 16 %gcframe1, i8 0, i64 24, i1 true)
  %thread_ptr = call ptr asm "movq %fs:0, $0", "=r"() #12
  %tls_ppgcstack = getelementptr inbounds i8, ptr %thread_ptr, i64 -8
  %tls_pgcstack = load ptr, ptr %tls_ppgcstack, align 8
  store i64 4, ptr %gcframe1, align 8
  %frame.prev = getelementptr inbounds ptr, ptr %gcframe1, i64 1
  %task.gcstack = load ptr, ptr %tls_pgcstack, align 8
  store ptr %task.gcstack, ptr %frame.prev, align 8
  store ptr %gcframe1, ptr %tls_pgcstack, align 8
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:16 within `rand_n_2`
; β”Œ @ boot.jl:647 within `Array`
; β”‚β”Œ @ boot.jl:588 within `GenericMemory`
    %memorynew_empty = icmp eq i64 %"n::Int64", 0
    br i1 %memorynew_empty, label %retval, label %nonemptymem

L21:                                              ; preds = %L21.preheader.new, %L21
    %value_phi5 = phi i64 [ 1, %L21.preheader.new ], [ %28, %L21 ]
    %niter = phi i64 [ 0, %L21.preheader.new ], [ %niter.next.3, %L21 ]
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:18 within `rand_n_2`
; β”Œ @ essentials.jl:918 within `getindex`
   %0 = load i64, ptr %memoryref_data, align 8
   %1 = load i64, ptr %memoryref_data19, align 8
; β””
; β”Œ @ int.jl:86 within `-`
   %2 = sub i64 %0, %1
; β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:19 within `rand_n_2`
; β”Œ @ int.jl:863 within `*` @ int.jl:88
   %3 = shl i64 %1, 1
; β””
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %3, ptr %memoryref_data, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:20 within `rand_n_2`
; β”Œ @ essentials.jl:918 within `getindex`
   %4 = load i64, ptr %memoryref_data52, align 8
; β””
; β”Œ @ int.jl:863 within `*` @ int.jl:88
   %5 = mul i64 %4, 3
; β””
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %5, ptr %memoryref_data19, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:21 within `rand_n_2`
; β”Œ @ int.jl:863 within `*` @ int.jl:88
   %6 = shl i64 %2, 2
; β””
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %6, ptr %memoryref_data52, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:22 within `rand_n_2`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    %gep = getelementptr i64, ptr %invariant.gep, i64 %value_phi5
    store i64 %2, ptr %gep, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:18 within `rand_n_2`
; β”Œ @ essentials.jl:918 within `getindex`
   %7 = load i64, ptr %memoryref_data, align 8
   %8 = load i64, ptr %memoryref_data19, align 8
; β””
; β”Œ @ int.jl:86 within `-`
   %9 = sub i64 %7, %8
; β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:19 within `rand_n_2`
; β”Œ @ int.jl:863 within `*` @ int.jl:88
   %10 = shl i64 %8, 1
; β””
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %10, ptr %memoryref_data, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:20 within `rand_n_2`
; β”Œ @ essentials.jl:918 within `getindex`
   %11 = load i64, ptr %memoryref_data52, align 8
; β””
; β”Œ @ int.jl:863 within `*` @ int.jl:88
   %12 = mul i64 %11, 3
; β””
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %12, ptr %memoryref_data19, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:21 within `rand_n_2`
; β”Œ @ int.jl:863 within `*` @ int.jl:88
   %13 = shl i64 %9, 2
; β””
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %13, ptr %memoryref_data52, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:22 within `rand_n_2`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    %gep.1 = getelementptr i64, ptr %memory_data, i64 %value_phi5
    store i64 %9, ptr %gep.1, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:18 within `rand_n_2`
; β”Œ @ essentials.jl:918 within `getindex`
   %14 = load i64, ptr %memoryref_data, align 8
   %15 = load i64, ptr %memoryref_data19, align 8
; β””
; β”Œ @ int.jl:86 within `-`
   %16 = sub i64 %14, %15
; β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:19 within `rand_n_2`
; β”Œ @ int.jl:863 within `*` @ int.jl:88
   %17 = shl i64 %15, 1
; β””
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %17, ptr %memoryref_data, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:20 within `rand_n_2`
; β”Œ @ essentials.jl:918 within `getindex`
   %18 = load i64, ptr %memoryref_data52, align 8
; β””
; β”Œ @ int.jl:863 within `*` @ int.jl:88
   %19 = mul i64 %18, 3
; β””
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %19, ptr %memoryref_data19, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:21 within `rand_n_2`
; β”Œ @ int.jl:863 within `*` @ int.jl:88
   %20 = shl i64 %16, 2
; β””
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %20, ptr %memoryref_data52, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:22 within `rand_n_2`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    %gep.2 = getelementptr i64, ptr %gep, i64 2
    store i64 %16, ptr %gep.2, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:18 within `rand_n_2`
; β”Œ @ essentials.jl:918 within `getindex`
   %21 = load i64, ptr %memoryref_data, align 8
   %22 = load i64, ptr %memoryref_data19, align 8
; β””
; β”Œ @ int.jl:86 within `-`
   %23 = sub i64 %21, %22
; β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:19 within `rand_n_2`
; β”Œ @ int.jl:863 within `*` @ int.jl:88
   %24 = shl i64 %22, 1
; β””
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %24, ptr %memoryref_data, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:20 within `rand_n_2`
; β”Œ @ essentials.jl:918 within `getindex`
   %25 = load i64, ptr %memoryref_data52, align 8
; β””
; β”Œ @ int.jl:863 within `*` @ int.jl:88
   %26 = mul i64 %25, 3
; β””
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %26, ptr %memoryref_data19, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:21 within `rand_n_2`
; β”Œ @ int.jl:863 within `*` @ int.jl:88
   %27 = shl i64 %23, 2
; β””
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %27, ptr %memoryref_data52, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:22 within `rand_n_2`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    %gep.3 = getelementptr i64, ptr %gep, i64 3
    store i64 %23, ptr %gep.3, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:23 within `rand_n_2`
; β”Œ @ range.jl:921 within `iterate`
   %28 = add i64 %value_phi5, 4
; β””
  %niter.next.3 = add i64 %niter, 4
  %niter.ncmp.3 = icmp eq i64 %niter.next.3, %unroll_iter
  br i1 %niter.ncmp.3, label %L186.loopexit.unr-lcssa, label %L21

L186.loopexit.unr-lcssa:                          ; preds = %L21.preheader, %L21
  %value_phi5.unr = phi i64 [ 1, %L21.preheader ], [ %28, %L21 ]
  %lcmp.mod.not = icmp eq i64 %xtraiter, 0
  br i1 %lcmp.mod.not, label %L186, label %L21.epil

L21.epil:                                         ; preds = %L21.epil, %L186.loopexit.unr-lcssa
  %value_phi5.epil = phi i64 [ %36, %L21.epil ], [ %value_phi5.unr, %L186.loopexit.unr-lcssa ]
  %epil.iter = phi i64 [ %epil.iter.next, %L21.epil ], [ 0, %L186.loopexit.unr-lcssa ]
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:18 within `rand_n_2`
; β”Œ @ essentials.jl:918 within `getindex`
   %29 = load i64, ptr %memoryref_data, align 8
   %30 = load i64, ptr %memoryref_data19, align 8
; β””
; β”Œ @ int.jl:86 within `-`
   %31 = sub i64 %29, %30
; β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:19 within `rand_n_2`
; β”Œ @ int.jl:863 within `*` @ int.jl:88
   %32 = shl i64 %30, 1
; β””
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %32, ptr %memoryref_data, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:20 within `rand_n_2`
; β”Œ @ essentials.jl:918 within `getindex`
   %33 = load i64, ptr %memoryref_data52, align 8
; β””
; β”Œ @ int.jl:863 within `*` @ int.jl:88
   %34 = mul i64 %33, 3
; β””
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %34, ptr %memoryref_data19, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:21 within `rand_n_2`
; β”Œ @ int.jl:863 within `*` @ int.jl:88
   %35 = shl i64 %31, 2
; β””
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    store i64 %35, ptr %memoryref_data52, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:22 within `rand_n_2`
; β”Œ @ array.jl:986 within `setindex!`
; β”‚β”Œ @ array.jl:991 within `_setindex!`
    %gep.epil = getelementptr i64, ptr %invariant.gep, i64 %value_phi5.epil
    store i64 %31, ptr %gep.epil, align 8
; β””β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:23 within `rand_n_2`
; β”Œ @ range.jl:921 within `iterate`
   %36 = add i64 %value_phi5.epil, 1
; β””
  %epil.iter.next = add i64 %epil.iter, 1
  %epil.iter.cmp.not = icmp eq i64 %epil.iter.next, %xtraiter
  br i1 %epil.iter.cmp.not, label %L186, label %L21.epil

L186:                                             ; preds = %retval, %L21.epil, %L186.loopexit.unr-lcssa
  %frame.prev110 = load ptr, ptr %frame.prev, align 8
  store ptr %frame.prev110, ptr %tls_pgcstack, align 8
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:24 within `rand_n_2`
  ret ptr %"new::Array"

nonemptymem:                                      ; preds = %top
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:16 within `rand_n_2`
; β”Œ @ boot.jl:647 within `Array`
; β”‚β”Œ @ boot.jl:588 within `GenericMemory`
    %37 = icmp ult i64 %"n::Int64", 1152921504606846976
    br i1 %37, label %pass, label %fail

fail:                                             ; preds = %nonemptymem
    call void @jl_argument_error(ptr nonnull @"_j_str_invalid GenericMemory siz...#1")
    unreachable

pass:                                             ; preds = %nonemptymem
    %38 = shl nuw nsw i64 %"n::Int64", 3
    %ptls_field = getelementptr inbounds i8, ptr %tls_pgcstack, i64 16
    %ptls_load = load ptr, ptr %ptls_field, align 8
    %"Memory{UInt64}[]" = call noalias nonnull align 16 ptr @jl_alloc_genericmemory_unchecked(ptr %ptls_load, i64 %38, ptr nonnull @"+Core.GenericMemory#2906.jit") #13
    store i64 %"n::Int64", ptr %"Memory{UInt64}[]", align 8
    br label %retval

retval:                                           ; preds = %pass, %top
    %memoryref_mem81 = phi ptr [ %"Memory{UInt64}[]", %pass ], [ @"jl_global#2905.jit", %top ]
; β”‚β””
; β”‚ @ boot.jl:648 within `Array`
; β”‚β”Œ @ boot.jl:593 within `memoryref`
    %memory_data_ptr = getelementptr inbounds { i64, ptr }, ptr %memoryref_mem81, i64 0, i32 1
    %memory_data = load ptr, ptr %memory_data_ptr, align 8
    %gc_slot_addr_0 = getelementptr inbounds ptr, ptr %gcframe1, i64 2
    store ptr %memoryref_mem81, ptr %gc_slot_addr_0, align 8
; β”‚β””
   %ptls_field108 = getelementptr inbounds i8, ptr %tls_pgcstack, i64 16
   %ptls_load109 = load ptr, ptr %ptls_field108, align 8
   %"new::Array" = call noalias nonnull align 8 dereferenceable(32) ptr @ijl_gc_small_alloc(ptr %ptls_load109, i32 408, i32 32, i64 140686563959504) #8
   %"new::Array.tag_addr" = getelementptr inbounds i64, ptr %"new::Array", i64 -1
   store atomic i64 140686563959504, ptr %"new::Array.tag_addr" unordered, align 8
   %39 = getelementptr inbounds i8, ptr %"new::Array", i64 8
   store ptr %memory_data, ptr %"new::Array", align 8
   store ptr %memoryref_mem81, ptr %39, align 8
   %"new::Array.size_ptr" = getelementptr inbounds i8, ptr %"new::Array", i64 16
   store i64 %"n::Int64", ptr %"new::Array.size_ptr", align 8
; β””
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:17 within `rand_n_2`
  br i1 %memorynew_empty, label %L186, label %L21.preheader

L21.preheader:                                    ; preds = %retval
  %memoryref_data = load ptr, ptr %"a::Array", align 8
  %memoryref_data19 = getelementptr inbounds i64, ptr %memoryref_data, i64 1
  %memoryref_data52 = getelementptr inbounds i64, ptr %memoryref_data, i64 2
;  @ /home/chenjiuning/Documents/ai/LazyFFTW/example3.jl:23 within `rand_n_2`
  %invariant.gep = getelementptr i64, ptr %memory_data, i64 -1
  %xtraiter = and i64 %"n::Int64", 3
  %40 = icmp ult i64 %"n::Int64", 4
  br i1 %40, label %L186.loopexit.unr-lcssa, label %L21.preheader.new

L21.preheader.new:                                ; preds = %L21.preheader
  %unroll_iter = and i64 %"n::Int64", -4
  br label %L21
}
3 Likes

You can use Loopinfo to block vectorization or unrolling.

An even smaller reproducer is

julia> function foo1(a)
       @inbounds begin
       p = a[1] - a[2]
       a[1] = a[2]
       a[2] = a[3]
       a[3] = p
       end
       p
       end
foo1 (generic function with 1 method)

julia> function foo2(a)
       @inbounds begin
       p = a[1] - a[2]
       a[1] = a[2]*2
       a[2] = a[3]*3
       a[3] = p*4
       end
       p
       end
foo2 (generic function with 1 method)

julia> @btime foo1($a)
  3.343 ns (0 allocations: 0 bytes)
0x0000000000000000

julia> @btime foo2($a)
  1.933 ns (0 allocations: 0 bytes)
0x0000000000000000

So you see that this has nothing to do with loop transformation passes.

What I think is the issue is the following: The entire thing is limited on the latency of store-load forwarding. LLVM decides to vectorize the adjacent loads and stores, such that the store-load forwarding unit has to deal with partially overlapping loads/stores. Is there maybe a slow-path or lack of support for that?

That is a part of cpu performance manuals / design that I’m not overly acquainted with. A quick google found e.g. Auto-Vectorization and Store-to-Load-Forwarding | Emanuel’s Blog that discusses the problem in context of java C2.

So I’d say that this is an issue of machine code generation, not even llvm IR-level passes.

I mean, the obvious approach would be: Store-load forwarding sucks, even if it succeeds. You want to keep a in registers the whole time!

That may or may not apply to your real problem. If a is largish, then latency of store-load forwarding doesn’t matter; if a is small, then just put it into registers instead of memory.

PS. By β€œkeep it in registers” I mean the following code transformation:

julia> function rand_n_3(a,n)
           r = Vector{UInt}(undef,n)
           @inbounds begin a1 = a[1]; a2=a[2]; a3=a[3]; end
           @inbounds for i in 1:n
               p = a1 - a2
               a1 = a2
               a2 = a3
               a3 = p
               r[i] = p
           end
           @inbounds begin a[1]=a1; a[2]=a2;a[3]=a3; end
           return r
       end
rand_n_3 (generic function with 1 method)

julia> @btime rand_n_3(a, 10^7);
  5.832 ms (3 allocations: 76.30 MiB)

julia> @btime rand_n_1(a, 10^7);
  37.014 ms (3 allocations: 76.30 MiB)

PPS. This is on an older intel machine. I would not be overly shocked if some ARM or newer Zen had a store-load forwarding unit that can cope with this kind of partial overlap (but I wouldn’t be surprised to learn that they can’t deal with it either).

The bad code is:

julia> @code_native foo1(a)
	.text
	.file	"foo1"
	.section	.ltext,"axl",@progbits
	.globl	julia_foo1_4233                 # -- Begin function julia_foo1_4233
	.p2align	4, 0x90
	.type	julia_foo1_4233,@function
julia_foo1_4233:                        # @julia_foo1_4233
; Function Signature: foo1(Array{UInt64, 1})
; β”Œ @ REPL[11]:1 within `foo1`
# %bb.0:                                # %top
	#DEBUG_VALUE: foo1:a <- [$rdi+0]
	push	rbp
	mov	rbp, rsp
; β”‚ @ REPL[11]:3 within `foo1`
; β”‚β”Œ @ essentials.jl:918 within `getindex`
	mov	rcx, qword ptr [rdi]
	mov	rax, qword ptr [rcx]
; load1
	vmovups	xmm0, xmmword ptr [rcx + 8]
; β”‚β””
; β”‚β”Œ @ int.jl:86 within `-`
; load2
	sub	rax, qword ptr [rcx + 8]
; β”‚β””
; β”‚ @ REPL[11]:4 within `foo1`
; β”‚β”Œ @ array.jl:986 within `setindex!`
; β”‚β”‚β”Œ @ array.jl:991 within `_setindex!`
; store1 NOOO, don't do it! This store partially overlaps with load1 (and encompasses) load2. 
	vmovups	xmmword ptr [rcx], xmm0
; β”‚β””β””
; β”‚ @ REPL[11]:6 within `foo1`
; β”‚β”Œ @ array.jl:986 within `setindex!`
; β”‚β”‚β”Œ @ array.jl:991 within `_setindex!`
;store2 NOOO, this partially overlaps with load1
	mov	qword ptr [rcx + 16], rax
; β”‚β”‚β””
	pop	rbp
	ret
.Lfunc_end0:
	.size	julia_foo1_4233, .Lfunc_end0-julia_foo1_4233
; β””β””
                                        # -- End function
	.section	".note.GNU-stack","",@progbits

PPPS. The annoying fact is that the auto vectorization is a good thing if you don’t call foo repeatedly in a tight loop; but it messes up the internal state of the store-load forwarding unit, so that it becomes a pessimization if you call it repeatedly on the same a without enough delay for the writes to reach L1. In your example, llvm knows that it does this operation repeatedly on the same a, so there is no excuse for the bad codegen; but for non-inlined foo1 calls, it’s a trade-off.

6 Likes