Why does Julia not optimize this code when C++ (LLVM) can?

When using a C++ compiler with LLVM version 6.0.0, the following code

bool isEven(int n) {
    bool ret = true;
    for (int i = 0; i < n; i ++) {
        ret = !ret;
    }
    return ret;
}

emits the LLVM IR

define zeroext i1 @_Z6isEveni(i32) local_unnamed_addr #0 !dbg !7 {
  call void @llvm.dbg.value(metadata i32 %0, metadata !14, metadata !DIExpression()), !dbg !18
  call void @llvm.dbg.value(metadata i8 1, metadata !15, metadata !DIExpression()), !dbg !19
  call void @llvm.dbg.value(metadata i32 0, metadata !16, metadata !DIExpression()), !dbg !20
  %2 = icmp slt i32 %0, 1, !dbg !21
  %3 = and i32 %0, 1, !dbg !23
  %4 = icmp eq i32 %3, 0, !dbg !23
  %5 = or i1 %4, %2, !dbg !23
  ret i1 %5, !dbg !24
}

declare void @llvm.dbg.value(metadata, metadata, metadata) #1

attributes #0 = { nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone speculatable }

See: https://godbolt.org/z/oPBFey

This is functionally equivalent to the following implementation:

julia> isEven(n::Int) = rem(n, 2) != 0
isEven (generic function with 1 method)

julia> @code_llvm debuginfo=:none isEven(7)

define i8 @julia_isEven_18796(i64) {
top:
  %1 = trunc i64 %0 to i8
  %2 = and i8 %1, 1
  %3 = xor i8 %2, 1
  ret i8 %3
}

julia>

However, the original C++ implementation ported to Julia results in a very different LLVM IR:

julia> function isEven(n::Int)
           out = true
           for i in 0:n-1
               out = !out
           end
           return out
       end
isEven (generic function with 1 method)

julia> @code_llvm debuginfo=:none isEven(7)

define i8 @julia_isEven_18793(i64) {
top:
  %1 = add i64 %0, -1
  %2 = icmp sgt i64 %1, -1
  br i1 %2, label %L8.L12_crit_edge, label %L25

L8.L12_crit_edge:                                 ; preds = %top
  %min.iters.check = icmp ult i64 %0, 128
  br i1 %min.iters.check, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %L8.L12_crit_edge
  %n.vec = and i64 %0, -128
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <32 x i8> [ <i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %vector.ph ], [ %3, %vector.body ]
  %vec.phi8 = phi <32 x i8> [ zeroinitializer, %vector.ph ], [ %4, %vector.body ]
  %vec.phi9 = phi <32 x i8> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
  %vec.phi10 = phi <32 x i8> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
  %3 = xor <32 x i8> %vec.phi, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %4 = xor <32 x i8> %vec.phi8, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %5 = xor <32 x i8> %vec.phi9, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %6 = xor <32 x i8> %vec.phi10, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %index.next = add i64 %index, 128
  %7 = icmp eq i64 %index.next, %n.vec
  br i1 %7, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %bin.rdx = xor <32 x i8> %vec.phi8, %vec.phi
  %bin.rdx14 = xor <32 x i8> %5, %bin.rdx
  %bin.rdx15 = xor <32 x i8> %6, %bin.rdx14
  %rdx.shuf = shufflevector <32 x i8> %bin.rdx15, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx16 = xor <32 x i8> %bin.rdx15, %rdx.shuf
  %rdx.shuf17 = shufflevector <32 x i8> %bin.rdx16, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = xor <32 x i8> %bin.rdx16, %rdx.shuf17
  %rdx.shuf19 = shufflevector <32 x i8> %bin.rdx18, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = xor <32 x i8> %bin.rdx18, %rdx.shuf19
  %rdx.shuf21 = shufflevector <32 x i8> %bin.rdx20, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx22 = xor <32 x i8> %bin.rdx20, %rdx.shuf21
  %rdx.shuf23 = shufflevector <32 x i8> %bin.rdx22, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx24 = xor <32 x i8> %bin.rdx22, %rdx.shuf23
  %8 = extractelement <32 x i8> %bin.rdx24, i32 0
  %cmp.n = icmp eq i64 %n.vec, %0
  br i1 %cmp.n, label %L25, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %L8.L12_crit_edge
  %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %L8.L12_crit_edge ]
  %bc.merge.rdx = phi i8 [ %8, %middle.block ], [ 1, %L8.L12_crit_edge ]
  br label %L12

L12:                                              ; preds = %L12, %scalar.ph
  %value_phi2 = phi i8 [ %bc.merge.rdx, %scalar.ph ], [ %9, %L12 ]
  %value_phi3 = phi i64 [ %bc.resume.val, %scalar.ph ], [ %11, %L12 ]
  %9 = xor i8 %value_phi2, 1
  %10 = icmp eq i64 %value_phi3, %1
  %11 = add i64 %value_phi3, 1
  br i1 %10, label %L25, label %L12

L25:                                              ; preds = %L12, %middle.block, %top
  %value_phi6 = phi i8 [ 1, %top ], [ %9, %L12 ], [ %8, %middle.block ]
  ret i8 %value_phi6
}


julia> versioninfo()
Julia Version 1.3.1
Commit 2d5741174c (2019-12-30 21:36 UTC)
Platform Info:
  OS: macOS (x86_64-apple-darwin18.6.0)
  CPU: Intel(R) Core(TM) i7-7920HQ CPU @ 3.10GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-6.0.1 (ORCJIT, skylake)

julia>

Can anyone explain why Julia is not able to produce the same IR as a C++ compiler for essentially the same code with almost the same version of LLVM?

I’ve reposted this question from SO.

1 Like

It looks like it’s the opposite. The Julia LLVM IR is unrolling it and using SIMD, and is working on 32 elements at a time.

The point is that determining whether a number is even doesn’t necessarily require a loop, and clang is able to figure that out given naive code that does use a loop.

I see, that makes sense.

This kind optimizations are almost always done with pattern matching so it basically comes down to what exactly is the pattern that llvm is matching. Minor things like how exactly the ir is generated and pass order (which, granted, isn’t exactly minor…) can affect this kind of optimizations…

And clang should almost always work since the way it generate the ir would usually be the pattern that the passes matches…

3 Likes

I think it has to do with the fact that the UnitRange may be 0:-abs(n) or 0:abs(n). So ensuring that it is always positive makes this generate LLVM IR closer to C++.

julia> function isEven(n::Int)
           out = true
           for _ in 0:abs(n)
               out = !out
           end
           return !out
           end
isEven (generic function with 1 method)

julia> @code_llvm debuginfo=:none isEven(7)

define i8 @julia_isEven_17367(i64) {
top:
  %1 = icmp slt i64 %0, 0
  %2 = sub i64 0, %0
  %3 = select i1 %1, i64 %2, i64 %0
  %4 = icmp sgt i64 %3, -1
  br i1 %4, label %L8.L12_crit_edge, label %L25

L8.L12_crit_edge:                                 ; preds = %top
  br label %L12

L12:                                              ; preds = %L12, %L8.L12_crit_edge
  %value_phi2 = phi i8 [ 1, %L8.L12_crit_edge ], [ %5, %L12 ]
  %value_phi3 = phi i64 [ 0, %L8.L12_crit_edge ], [ %7, %L12 ]
  %5 = xor i8 %value_phi2, 1
  %6 = icmp eq i64 %value_phi3, %3
  %7 = add i64 %value_phi3, 1
  br i1 %6, label %L25, label %L12

L25:                                              ; preds = %L12, %top
  %value_phi6 = phi i8 [ 0, %top ], [ %value_phi2, %L12 ]
  ret i8 %value_phi6
}

julia>

Well that is arguably worse code. Depending on if you have confused the vectorizer or if you make it think the scalar loop is faster.

In another word, that code is in no way closer to the clang one. The code length tells you absolutely nothing here.

Ohh I see what is going on, you are right. I was looking at just part of the contents of block L12 and I thought it eliminated the loop, but it’s just eliminated the vectorized version of the loop (?) and block L12 still can branch into itself. I ran benchmarks and isEven is significantly slower than the built in iseven, and the time for calculating isEven changes depending on the value of n but iseven is constant time. So we are back to square one :slight_smile:.

Now, I don’t think this is a case that’s worth too much to look into since it’s not like you have to use a loop to express this in the first place (in contrast to some bit manipulating code that has no “standard” high level discription in C). If you really want to look into this, the way is to get the unoptimized IR from both julia and C and run the optimizers on them to see what’s the difference and which pass did the transformation. You can then figure out if this is a pass ordering problem or a IR generation problem and what order or what IR difference is the issue.

IIRC, for another bit manipulating one I’ve looked into before, the issue was the loop variable (difference in behavior on overflow)

2 Likes

Thanks for the answer. I’m interested in looking into it further just to learn how all of this works. Is this what you mean by get the unoptimized IR from julia and C++?

C++ ( with flags -O0 -emit-llvm)


define zeroext i1 @_Z6isEvenl(i64) #0 !dbg !7 {
  %2 = alloca i64, align 8
  %3 = alloca i8, align 1
  %4 = alloca i64, align 8
  store i64 %0, i64* %2, align 8
  call void @llvm.dbg.declare(metadata i64* %2, metadata !13, metadata !DIExpression()), !dbg !14
  call void @llvm.dbg.declare(metadata i8* %3, metadata !15, metadata !DIExpression()), !dbg !16
  store i8 1, i8* %3, align 1, !dbg !16
  call void @llvm.dbg.declare(metadata i64* %4, metadata !17, metadata !DIExpression()), !dbg !19
  store i64 0, i64* %4, align 8, !dbg !19
  br label %5, !dbg !20

  %6 = load i64, i64* %4, align 8, !dbg !21
  %7 = load i64, i64* %2, align 8, !dbg !23
  %8 = icmp slt i64 %6, %7, !dbg !24
  br i1 %8, label %9, label %17, !dbg !25

  %10 = load i8, i8* %3, align 1, !dbg !26
  %11 = trunc i8 %10 to i1, !dbg !26
  %12 = xor i1 %11, true, !dbg !28
  %13 = zext i1 %12 to i8, !dbg !29
  store i8 %13, i8* %3, align 1, !dbg !29
  br label %14, !dbg !30

  %15 = load i64, i64* %4, align 8, !dbg !31
  %16 = add nsw i64 %15, 1, !dbg !31
  store i64 %16, i64* %4, align 8, !dbg !31
  br label %5, !dbg !32, !llvm.loop !33

  %18 = load i8, i8* %3, align 1, !dbg !35
  %19 = trunc i8 %18 to i1, !dbg !35
  ret i1 %19, !dbg !36
}

declare void @llvm.dbg.declare(metadata, metadata, metadata) #1

attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone speculatable }

I even changed the C++ code to use long to a closer comparison.

And julia julia -O0:


define i8 @julia_isEven_17217(i64) {
top:
  %1 = call %jl_value_t*** inttoptr (i64 4349030448 to %jl_value_t*** ()*)() #3
  %2 = bitcast %jl_value_t*** %1 to %jl_value_t addrspace(10)**
  %3 = getelementptr inbounds %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %2, i64 4
  %4 = bitcast %jl_value_t addrspace(10)** %3 to i64**
  %5 = load i64*, i64** %4
  %6 = sub i64 %0, 1
  %7 = icmp sle i64 0, %6
  %8 = zext i1 %7 to i8
  %9 = trunc i8 %8 to i1
  %10 = xor i1 %9, true
  %11 = select i1 %10, i64 -1, i64 %6
  %12 = icmp slt i64 %11, 0
  %13 = zext i1 %12 to i8
  %14 = trunc i8 %13 to i1
  %15 = xor i1 %14, true
  %. = select i1 %15, i8 0, i8 1
  %.7 = select i1 %15, i64 0, i64 undef
  %16 = xor i8 %., 1
  %17 = trunc i8 %16 to i1
  %18 = xor i1 %17, true
  br i1 %18, label %L25, label %L8.L12_crit_edge

L8.L12_crit_edge:                                 ; preds = %top
  br label %L12

L12:                                              ; preds = %L8.L12_crit_edge, %L24
  %value_phi2 = phi i8 [ 1, %L8.L12_crit_edge ], [ %19, %L24 ]
  %value_phi3 = phi i64 [ %.7, %L8.L12_crit_edge ], [ %value_phi4, %L24 ]
  %19 = xor i8 %value_phi2, 1
  %20 = icmp eq i64 %value_phi3, %11
  %21 = zext i1 %20 to i8
  %22 = trunc i8 %21 to i1
  %23 = xor i1 %22, true
  %24 = add i64 %value_phi3, 1
  %value_phi4 = select i1 %23, i64 %24, i64 undef
  %value_phi5 = select i1 %23, i8 0, i8 1
  %25 = xor i8 %value_phi5, 1
  %26 = trunc i8 %25 to i1
  %27 = xor i1 %26, true
  br i1 %27, label %L25, label %L24

L24:                                              ; preds = %L12
  br label %L12

L25:                                              ; preds = %L12, %top
  %value_phi6 = phi i8 [ 1, %top ], [ %19, %L12 ]
  ret i8 %value_phi6
}

julia>

The easiest way to get unoptimized IR in Julia is @code_llvm optimize=false foo(args...).

julia> function isEven(n::Int)
                  out = true
                  for i in 0:n-1
                      out = !out
                  end
                  return out
              end
isEven (generic function with 1 method)

julia> @code_llvm optimize=false debuginfo=:none isEven(5)

define i8 @julia_isEven_17528(i64) {
top:
  %1 = call %jl_value_t*** @julia.ptls_states()
  %2 = bitcast %jl_value_t*** %1 to %jl_value_t addrspace(10)**
  %3 = getelementptr inbounds %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %2, i64 4
  %4 = bitcast %jl_value_t addrspace(10)** %3 to i64**
  %5 = load i64*, i64** %4
  %6 = sub i64 %0, 1
  %7 = icmp sle i64 0, %6
  %8 = zext i1 %7 to i8
  %9 = trunc i8 %8 to i1
  %10 = xor i1 %9, true
  %11 = select i1 %10, i64 -1, i64 %6
  %12 = icmp slt i64 %11, 0
  %13 = zext i1 %12 to i8
  %14 = trunc i8 %13 to i1
  %15 = xor i1 %14, true
  br i1 %15, label %L7, label %L6

L6:                                               ; preds = %top
  br label %L8

L7:                                               ; preds = %top
  br label %L8

L8:                                               ; preds = %L7, %L6
  %value_phi = phi i8 [ 1, %L6 ], [ 0, %L7 ]
  %value_phi1 = phi i64 [ 0, %L7 ], [ undef, %L6 ]
  %16 = xor i8 %value_phi, 1
  %17 = trunc i8 %16 to i1
  %18 = xor i1 %17, true
  br i1 %18, label %L8.L25_crit_edge, label %L8.L12_crit_edge

L8.L25_crit_edge:                                 ; preds = %L8
  br label %L25

L8.L12_crit_edge:                                 ; preds = %L8
  br label %L12

L12:                                              ; preds = %L8.L12_crit_edge, %L24
  %value_phi2 = phi i8 [ 1, %L8.L12_crit_edge ], [ %19, %L24 ]
  %value_phi3 = phi i64 [ %value_phi1, %L8.L12_crit_edge ], [ %value_phi4, %L24 ]
  %19 = xor i8 %value_phi2, 1
  %20 = icmp eq i64 %value_phi3, %11
  %21 = zext i1 %20 to i8
  %22 = trunc i8 %21 to i1
  %23 = xor i1 %22, true
  br i1 %23, label %L18, label %L17

L17:                                              ; preds = %L12
  br label %L20

L18:                                              ; preds = %L12
  %24 = add i64 %value_phi3, 1
  br label %L20

L20:                                              ; preds = %L18, %L17
  %value_phi4 = phi i64 [ %24, %L18 ], [ undef, %L17 ]
  %value_phi5 = phi i8 [ 1, %L17 ], [ 0, %L18 ]
  %25 = xor i8 %value_phi5, 1
  %26 = trunc i8 %25 to i1
  %27 = xor i1 %26, true
  br i1 %27, label %L20.L25_crit_edge, label %L24

L20.L25_crit_edge:                                ; preds = %L20
  br label %L25

L24:                                              ; preds = %L20
  br label %L12

L25:                                              ; preds = %L8.L25_crit_edge, %L20.L25_crit_edge
  %value_phi6 = phi i8 [ %19, %L20.L25_crit_edge ], [ 1, %L8.L25_crit_edge ]
  ret i8 %value_phi6
}
2 Likes

Correct. (and see @Elrod for how to generate the correct julia one). We do have a few of our own passes that needs to be run before LLVM can fully optimize things but I think in this case those shouldn’t really matter (you’ll get some junk at the beginning of your function after optimization but that shouldn’t really affect he transformation of the loop).

You should run these through opt (with cmdline argument that I don’t remember = = …) to see if things are optimized.

For the C++ one, before you run it, make sure to remove the optnone flag. It’ll turn off all optimizations on that function.

Thanks for the tips!

For the C++ one, before you run it, make sure to remove the optnone flag. It’ll turn off all optimizations on that function.

The with and without -O0 seemed to generate the same IR on godbolt.org.

Too lazy to figure out what the default optimization flag is (iirc it is O0 but I’m not sure). In any case just search for optnone in the ir and you’ll see it. It’s in the one paste above as well.