Looping over a float range is faster than looping over an int range?

I came across a micro-benchmark online with some weird performance characteristics.

Here is the function (it sums the Leibniz series for π), benchmarked on macOS with Julia 1.0.0:

function g(N)
    s = 0.
    for k = 1:N+1
        s = s + (-1)^(k+1) / (2*k-1)   # k-th Leibniz term: ±1/(2k-1)
    end
    x = 4*s
    return x
end

and the benchmark results:

julia> using BenchmarkTools

julia> N = 10^6
1000000

julia> n = 1e6
1.0e6

julia> @btime g($N)
  43.572 ms (0 allocations: 0 bytes)
3.1415936535887745

julia> @btime g($n)
  12.775 ms (0 allocations: 0 bytes)
3.1415936535887745

The code appears to be type-stable for both float and int input, yet the @code_warntype and @code_native outputs are both much shorter for Int than for Float64. So why is the performance ‘reversed’?
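
For reference, this is the kind of check I mean (a minimal sketch; the InteractiveUtils import is only needed outside the REPL, where @code_warntype is not in scope):

using InteractiveUtils   # provides @code_warntype / @code_native in scripts
@code_warntype g(10^6)   # Int input: every variable should infer to a concrete type
@code_warntype g(1e6)    # Float64 input: likewise, no Any or Union in the output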

When I benchmark the core loop, I’m no wiser:

julia> k = 5
5

julia> k_ = 5.0
5.0

julia> s = 0.1
0.1

julia> @btime $s + (-1)^($k+1) / (2*$k-1)
  5.342 ns (0 allocations: 0 bytes)
0.2111111111111111

julia> @btime $s + (-1)^($k_+1) / (2*$k_-1)
  10.071 ns (0 allocations: 0 bytes)
0.2111111111111111

So the core calculation is faster with Ints than with Float64s, but with the whole function it’s the other way around.

What’s going on here?

I get similar performance for both here (Linux, built from source). Can you post the output of @code_llvm? Are you using the official binaries or a build from source?

Sure.

Int version:

julia> @code_llvm g(10^6)

; Function g
; Location: untitled-950019a7864aac9608a43902d7c30748:221
define double @julia_g_43138(i64) {
top:
; Location: untitled-950019a7864aac9608a43902d7c30748:222
; Function +; {
; Location: int.jl:53
  %1 = add i64 %0, 1
;}
; Function Colon; {
; Location: range.jl:5
; Function Type; {
; Location: range.jl:255
; Function unitrange_last; {
; Location: range.jl:260
; Function >=; {
; Location: operators.jl:333
; Function <=; {
; Location: int.jl:428
  %2 = icmp sgt i64 %1, 0
;}}}}}
  br i1 %2, label %L9.L14_crit_edge, label %L36

L9.L14_crit_edge:                                 ; preds = %top
  br label %L14

L14:                                              ; preds = %L14, %L9.L14_crit_edge
  %value_phi3 = phi double [ 0.000000e+00, %L9.L14_crit_edge ], [ %10, %L14 ]
  %value_phi4 = phi i64 [ 1, %L9.L14_crit_edge ], [ %3, %L14 ]
; Location: untitled-950019a7864aac9608a43902d7c30748:223
; Function +; {
; Location: int.jl:53
  %3 = add nuw i64 %value_phi4, 1
;}
; Function ^; {
; Location: intfuncs.jl:220
  %4 = call i64 @julia_power_by_squaring_25633(i64 -1, i64 %3)
;}
; Function *; {
; Location: int.jl:54
  %5 = shl i64 %value_phi4, 1
;}
; Function -; {
; Location: int.jl:52
  %6 = add i64 %5, -1
;}
; Function /; {
; Location: int.jl:59
; Function float; {
; Location: float.jl:269
; Function Type; {
; Location: float.jl:254
; Function Type; {
; Location: float.jl:60
  %7 = sitofp i64 %4 to double
  %8 = sitofp i64 %6 to double
;}}}
; Function /; {
; Location: float.jl:401
  %9 = fdiv double %7, %8
;}}
; Function +; {
; Location: float.jl:395
  %10 = fadd double %value_phi3, %9
;}
; Function iterate; {
; Location: range.jl:575
; Function ==; {
; Location: promotion.jl:425
  %11 = icmp eq i64 %value_phi4, %1
;}}
  br i1 %11, label %L36.loopexit, label %L14

L36.loopexit:                                     ; preds = %L14
; Location: untitled-950019a7864aac9608a43902d7c30748:225
; Function *; {
; Location: promotion.jl:314
; Function *; {
; Location: float.jl:399
  %phitmp = fmul double %10, 4.000000e+00
  br label %L36

L36:                                              ; preds = %L36.loopexit, %top
  %value_phi9 = phi double [ 0.000000e+00, %top ], [ %phitmp, %L36.loopexit ]
;}}
; Location: untitled-950019a7864aac9608a43902d7c30748:226
  ret double %value_phi9
}

Float version:

julia> @code_llvm g(1e6)

; Function g
; Location: untitled-950019a7864aac9608a43902d7c30748:221
define double @julia_g_43161(double) {
top:
  %1 = alloca { { double, double }, { double, double }, i64, i64 }, align 8
; Location: untitled-950019a7864aac9608a43902d7c30748:222
; Function +; {
; Location: promotion.jl:313
; Function +; {
; Location: float.jl:395
  %2 = fadd double %0, 1.000000e+00
;}}
; Function Colon; {
; Location: range.jl:3
; Function Colon; {
; Location: range.jl:14
  call void @julia_Colon_41837({ { double, double }, { double, double }, i64, i64 }* noalias nocapture nonnull sret %1, double 1.000000e+00, double 1.000000e+00, double %2)
;}}
; Function iterate; {
; Location: range.jl:567
; Function iterate; {
; Location: range.jl:567
; Function length; {
; Location: range.jl:498
; Function getproperty; {
; Location: sysimg.jl:18
  %3 = getelementptr inbounds { { double, double }, { double, double }, i64, i64 }, { { double, double }, { double, double }, i64, i64 }* %1, i64 0, i32 2
;}}
; Function <; {
; Location: int.jl:49
  %4 = load i64, i64* %3, align 8
  %5 = icmp sgt i64 %4, 0
;}
  br i1 %5, label %L40.L42_crit_edge, label %L101

L40.L42_crit_edge:                                ; preds = %top
; Location: range.jl:568
; Function unsafe_getindex; {
; Location: twiceprecision.jl:445
; Function getproperty; {
; Location: sysimg.jl:18
  %6 = getelementptr inbounds { { double, double }, { double, double }, i64, i64 }, { { double, double }, { double, double }, i64, i64 }* %1, i64 0, i32 3
;}
; Function -; {
; Location: int.jl:52
  %7 = load i64, i64* %6, align 8
  %8 = sub i64 1, %7
;}
; Location: twiceprecision.jl:446
; Function getproperty; {
; Location: sysimg.jl:18
  %9 = getelementptr inbounds { { double, double }, { double, double }, i64, i64 }, { { double, double }, { double, double }, i64, i64 }* %1, i64 0, i32 1, i32 0
;}
; Function *; {
; Location: promotion.jl:314
; Function promote; {
; Location: promotion.jl:284
; Function _promote; {
; Location: promotion.jl:261
; Function convert; {
; Location: number.jl:7
; Function Type; {
; Location: float.jl:60
  %10 = sitofp i64 %8 to double
;}}}}
; Function *; {
; Location: float.jl:399
  %11 = load double, double* %9, align 8
  %12 = fmul double %11, %10
;}}
; Function getproperty; {
; Location: sysimg.jl:18
  %13 = getelementptr inbounds { { double, double }, { double, double }, i64, i64 }, { { double, double }, { double, double }, i64, i64 }* %1, i64 0, i32 1, i32 1
;}
; Function *; {
; Location: promotion.jl:314
; Function *; {
; Location: float.jl:399
  %14 = load double, double* %13, align 8
  %15 = fmul double %14, %10
;}}
; Location: twiceprecision.jl:447
; Function getproperty; {
; Location: sysimg.jl:18
  %16 = getelementptr inbounds { { double, double }, { double, double }, i64, i64 }, { { double, double }, { double, double }, i64, i64 }* %1, i64 0, i32 0, i32 0
;}
; Function add12; {
; Location: twiceprecision.jl:84
; Function abs; {
; Location: float.jl:519
  %17 = call double @llvm.fabs.f64(double %12)
  %18 = load double, double* %16, align 8
  %19 = call double @llvm.fabs.f64(double %18)
;}
; Function >; {
; Location: operators.jl:286
; Function <; {
; Location: float.jl:452
  %20 = fcmp uge double %19, %17
;}}
  %.fca.0.insert = insertvalue [2 x double] undef, double %18, 0
  %.fca.1.insert = insertvalue [2 x double] %.fca.0.insert, double %12, 1
  %.fca.0.insert13 = insertvalue [2 x double] undef, double %12, 0
  %.fca.1.insert14 = insertvalue [2 x double] %.fca.0.insert13, double %18, 1
  %21 = select i1 %20, [2 x double] %.fca.1.insert, [2 x double] %.fca.1.insert14
  %.fca.0.extract = extractvalue [2 x double] %21, 0
  %.fca.1.extract = extractvalue [2 x double] %21, 1
; Location: twiceprecision.jl:85
; Function canonicalize2; {
; Location: twiceprecision.jl:49
; Function +; {
; Location: float.jl:395
  %22 = fadd double %.fca.0.extract, %.fca.1.extract
;}
; Location: twiceprecision.jl:50
; Function -; {
; Location: float.jl:397
  %23 = fsub double %.fca.0.extract, %22
;}
; Function +; {
; Location: float.jl:395
  %24 = fadd double %.fca.1.extract, %23
;}}}
; Location: twiceprecision.jl:448
; Function getproperty; {
; Location: sysimg.jl:18
  %25 = getelementptr inbounds { { double, double }, { double, double }, i64, i64 }, { { double, double }, { double, double }, i64, i64 }* %1, i64 0, i32 0, i32 1
;}
; Function +; {
; Location: float.jl:395
  %26 = load double, double* %25, align 8
  %27 = fadd double %15, %26
  %28 = fadd double %27, %24
  %29 = fadd double %22, %28
;}}}}
; Location: untitled-950019a7864aac9608a43902d7c30748:223
; Function +; {
; Location: promotion.jl:313
; Function +; {
; Location: float.jl:395
  %30 = fadd double %29, 1.000000e+00
;}}
; Function ^; {
; Location: promotion.jl:345
; Function ^; {
; Location: math.jl:767
  %31 = call double @llvm.pow.f64(double -1.000000e+00, double %30)
; Location: math.jl:768
; Function isnan; {
; Location: float.jl:526
; Function !=; {
; Location: float.jl:450
  %32 = fcmp ord double %31, 0.000000e+00
;}}
; Function +; {
; Location: float.jl:395
  %33 = fadd double %30, -1.000000e+00
;}
; Function isnan; {
; Location: float.jl:526
; Function !=; {
; Location: float.jl:450
  %34 = fcmp uno double %33, 0.000000e+00
;}}
  %35 = or i1 %34, %32
  br i1 %35, label %L57.lr.ph, label %L53

L57.lr.ph:                                        ; preds = %L40.L42_crit_edge
  br label %L57

L53:                                              ; preds = %L100, %L40.L42_crit_edge
; Location: math.jl:769
  call void @julia_throw_exp_domainerror_42104(double -1.000000e+00)
  call void @llvm.trap()
  unreachable

L57:                                              ; preds = %L57.lr.ph, %L100
  %36 = phi double [ %31, %L57.lr.ph ], [ %57, %L100 ]
  %value_phi533 = phi i64 [ 2, %L57.lr.ph ], [ %55, %L100 ]
  %value_phi432 = phi double [ %29, %L57.lr.ph ], [ %54, %L100 ]
  %value_phi331 = phi double [ 0.000000e+00, %L57.lr.ph ], [ %40, %L100 ]
;}}
; Function *; {
; Location: promotion.jl:314
; Function *; {
; Location: float.jl:399
  %37 = fmul double %value_phi432, 2.000000e+00
;}}
; Function -; {
; Location: promotion.jl:315
; Function -; {
; Location: float.jl:397
  %38 = fadd double %37, -1.000000e+00
;}}
; Function /; {
; Location: float.jl:401
  %39 = fdiv double %36, %38
;}
; Function +; {
; Location: float.jl:395
  %40 = fadd double %value_phi331, %39
;}
; Function iterate; {
; Location: range.jl:567
; Function <; {
; Location: int.jl:49
  %41 = icmp slt i64 %4, %value_phi533
;}
  br i1 %41, label %L101.loopexit, label %L100

L100:                                             ; preds = %L57
; Location: range.jl:568
; Function unsafe_getindex; {
; Location: twiceprecision.jl:445
; Function -; {
; Location: int.jl:52
  %42 = sub i64 %value_phi533, %7
;}
; Location: twiceprecision.jl:446
; Function *; {
; Location: promotion.jl:314
; Function promote; {
; Location: promotion.jl:284
; Function _promote; {
; Location: promotion.jl:261
; Function convert; {
; Location: number.jl:7
; Function Type; {
; Location: float.jl:60
  %43 = sitofp i64 %42 to double
;}}}}
; Function *; {
; Location: float.jl:399
  %44 = fmul double %11, %43
  %45 = fmul double %14, %43
;}}
; Location: twiceprecision.jl:447
; Function add12; {
; Location: twiceprecision.jl:84
; Function abs; {
; Location: float.jl:519
  %46 = call double @llvm.fabs.f64(double %44)
;}
; Function >; {
; Location: operators.jl:286
; Function <; {
; Location: float.jl:452
  %47 = fcmp uge double %19, %46
;}}
  %.fca.1.insert24 = insertvalue [2 x double] %.fca.0.insert, double %44, 1
  %.fca.0.insert27 = insertvalue [2 x double] undef, double %44, 0
  %.fca.1.insert28 = insertvalue [2 x double] %.fca.0.insert27, double %18, 1
  %48 = select i1 %47, [2 x double] %.fca.1.insert24, [2 x double] %.fca.1.insert28
  %.fca.0.extract17 = extractvalue [2 x double] %48, 0
  %.fca.1.extract18 = extractvalue [2 x double] %48, 1
; Location: twiceprecision.jl:85
; Function canonicalize2; {
; Location: twiceprecision.jl:49
; Function +; {
; Location: float.jl:395
  %49 = fadd double %.fca.0.extract17, %.fca.1.extract18
;}
; Location: twiceprecision.jl:50
; Function -; {
; Location: float.jl:397
  %50 = fsub double %.fca.0.extract17, %49
;}
; Function +; {
; Location: float.jl:395
  %51 = fadd double %.fca.1.extract18, %50
;}}}
; Location: twiceprecision.jl:448
; Function +; {
; Location: float.jl:395
  %52 = fadd double %45, %26
  %53 = fadd double %52, %51
  %54 = fadd double %49, %53
;}}
; Function +; {
; Location: int.jl:53
  %55 = add i64 %value_phi533, 1
;}}
; Function +; {
; Location: promotion.jl:313
; Function +; {
; Location: float.jl:395
  %56 = fadd double %54, 1.000000e+00
;}}
; Function ^; {
; Location: promotion.jl:345
; Function ^; {
; Location: math.jl:767
  %57 = call double @llvm.pow.f64(double -1.000000e+00, double %56)
; Location: math.jl:768
; Function isnan; {
; Location: float.jl:526
; Function !=; {
; Location: float.jl:450
  %58 = fcmp ord double %57, 0.000000e+00
;}}
; Function +; {
; Location: float.jl:395
  %59 = fadd double %56, -1.000000e+00
;}
; Function isnan; {
; Location: float.jl:526
; Function !=; {
; Location: float.jl:450
  %60 = fcmp uno double %59, 0.000000e+00
;}}
  %61 = or i1 %60, %58
  br i1 %61, label %L57, label %L53

L101.loopexit:                                    ; preds = %L57
;}}
; Location: untitled-950019a7864aac9608a43902d7c30748:225
; Function *; {
; Location: promotion.jl:314
; Function *; {
; Location: float.jl:399
  %phitmp = fmul double %40, 4.000000e+00
  br label %L101

L101:                                             ; preds = %L101.loopexit, %top
  %value_phi9 = phi double [ 0.000000e+00, %top ], [ %phitmp, %L101.loopexit ]
;}}
; Location: untitled-950019a7864aac9608a43902d7c30748:226
  ret double %value_phi9
}

Switching the literals from ints to floats

function g(N)
    s = 0.0
    for k = 1:N+1
        s += (-1.0)^(k+1.0) / (2.0*k-1.0)
    end
    x = 4*s
    return x
end

makes the Int version perform slightly better than the Float version:

julia> @btime g($N)
  8.903 ms (0 allocations: 0 bytes)
3.1415936535887745

julia> @btime g($n)
  10.327 ms (0 allocations: 0 bytes)
3.1415936535887745

Edit:
Actually, only the (-1)^(k+1) operation is relevant here. When -1 is an integer literal and k+1 an integer exponent, ^ dispatches to power_by_squaring, which is much slower here than the LLVM pow intrinsic.
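
The two code paths are easy to time directly (a sketch; Base.power_by_squaring is the internal function you can see being called in the @code_llvm dump above):

using BenchmarkTools
n, x = 10^6, 1e6
@btime Base.power_by_squaring(-1, $n)   # the integer ^ path: loops over exponent bits
@btime (-1.0)^$x                        # the float ^ path: a single llvm.pow call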

Yes, I see the same effect. When I change the literals to floats, runtime drops from 43ms to 8ms.

But this really goes against my instincts, which are to use integer literals as much as I can, for readability and generality, and normally also for performance. I mean, since when are float powers faster than integer powers?!

Actually, I think I have the culprit!

Integer powers are slow for large exponents:

julia> k = 5
5

julia> k_ = 5.0
5.0

julia> K = 10^6
1000000

julia> K_ = 1e6
1.0e6

julia> @btime (-1)^$k
  5.042 ns (0 allocations: 0 bytes)
-1

julia> @btime (-1)^$k_
  8.638 ns (0 allocations: 0 bytes)
-1.0

julia> @btime (-1)^$K
  14.653 ns (0 allocations: 0 bytes)
1

julia> @btime (-1)^$K_
  8.894 ns (0 allocations: 0 bytes)
1.0
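
That matches the algorithms: power_by_squaring does on the order of log2(exponent) multiplications, so its cost grows with the exponent, while llvm.pow costs the same for any exponent. A simplified sketch of the squaring idea (the real Base.power_by_squaring handles more cases):

function pow_sketch(x::Int, n::Int)   # assumes n >= 0
    r = one(x)
    while n > 0
        isodd(n) && (r *= x)   # fold the current exponent bit into the result
        x *= x                 # square the base for the next bit
        n >>= 1                # next bit; ~log2(n) iterations in total
    end
    return r
end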

So, the inner loop should really look like this:

s += (-1)^iseven(k) / (2*k-1)

Using ifelse(iseven(k), -1, 1) is equally fast.
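
Putting that together, a complete version of g with the cheap sign computation might look like this (a sketch; the name g_sign is mine):

function g_sign(N)
    s = 0.0
    for k in 1:N+1
        # the exponent is a Bool (0 or 1), so this power is trivially cheap
        s += (-1)^iseven(k) / (2*k - 1)
    end
    return 4 * s
end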

An update: here is the fastest reasonable implementation I have found:

function g(N)
    s, a = 0.0, -1
    for k in 1:N+1
        a = -a              # flip the sign instead of computing (-1)^(k+1)
        s += a / (2*k-1)
    end
    return 4 * s
end
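
A quick check that the refactor preserves the result (the terms and summation order are unchanged, so the value matches the earlier runs exactly):

julia> g(10^6)
3.1415936535887745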

Intriguingly, the performance of this and of its Matlab equivalent is identical, which caught me by surprise. Of course, no one here can say what’s going on in the Matlab runtime, but does this mean that the Matlab JIT just works super well on simple cases like this, or is there some performance left on the table for Julia here?

It’s just unexpected to me that Matlab should equal Julia on purely scalar for-loops like this.

With your function, I get:

Int
  120.176 ns (0 allocations: 0 bytes)
  1.200 μs (0 allocations: 0 bytes)
  12.000 μs (0 allocations: 0 bytes)
  119.000 μs (0 allocations: 0 bytes)
  1.180 ms (0 allocations: 0 bytes)
Float
  507.937 ns (0 allocations: 0 bytes)
  3.500 μs (0 allocations: 0 bytes)
  32.000 μs (0 allocations: 0 bytes)
  312.000 μs (0 allocations: 0 bytes)
  3.116 ms (0 allocations: 0 bytes)

for N = 10, 1_000, 10_000, 100_000, and 1_000_000, respectively.

Those numbers for Float can be improved rather easily (if not by much for large N) by making every literal a Float in a method just for Floats:

function g(N::T) where T <: AbstractFloat
    s, a = 0.0, -1.0
    for k in 1.0:N+1.0
        a = -a
        s += a / (2.0*k-1.0)
    end
    return 4.0 * s
end

julia> for i in [10.0, 1_000.0, 10_000.0, 100_000.0, 1_000_000.0]
       @btime g($i)
       end
  221.992 ns (0 allocations: 0 bytes)
  3.375 μs (0 allocations: 0 bytes)
  31.000 μs (0 allocations: 0 bytes)
  299.000 μs (0 allocations: 0 bytes)
  2.984 ms (0 allocations: 0 bytes)

If we’re allowed to use multiple dispatch to our advantage, we can even do something as crazy as this (note the cast to Int in the Float case):

function k(N::T) where T <: Integer
    return 4.0 * sum((iseven(n) ? -1.0 : 1.0) / (2.0 * Float64(n) - 1.0) for n=1:N+1)
end

function k(N::T) where T <: AbstractFloat
    return 4.0 * sum((iseven(n) ? -1.0 : 1.0) / (2.0 * Float64(n) - 1.0) for n=1:Int(N)+1)
end
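
For reference, the kInt/kFloat timings below came from a harness along these lines (a sketch, not my verbatim script):

using BenchmarkTools
for N in (10, 1_000, 10_000, 100_000, 1_000_000)
    @btime k($N)             # kInt: hits the Integer method
    @btime k($(Float64(N)))  # kFloat: hits the AbstractFloat method
end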

This brings the times down to the following:

kInt
  124.444 ns (0 allocations: 0 bytes)
  1.300 μs (0 allocations: 0 bytes)
  12.000 μs (0 allocations: 0 bytes)
  119.000 μs (0 allocations: 0 bytes)
  1.180 ms (0 allocations: 0 bytes)
kFloat
  126.540 ns (0 allocations: 0 bytes)
  1.300 μs (0 allocations: 0 bytes)
  12.000 μs (0 allocations: 0 bytes)
  119.000 μs (0 allocations: 0 bytes)
  1.180 ms (0 allocations: 0 bytes)

As you can see, there’s a tiny penalty for the N = 10 case, but other than that we’re pretty much on par with the original function, even for floats! I’ve also tried writing this function over several lines with broadcasting (just add a . to every mathematical operator), but that instantly allocates the arrays to be broadcast, so there’s no gain in that.

I did not manage to improve the numbers for Integers, but I may well be overlooking something obvious.

If using multiple dispatch is allowed here, it seems best to keep the general loop for Integers and have the Float method dispatch to it (that is, cast to Int and call the Int method).
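
That layering is a two-liner on top of the sign-flipping loop from earlier in the thread (a sketch; I am reusing the name g):

function g(N::Integer)
    s, a = 0.0, -1
    for k in 1:N+1
        a = -a             # flip the sign instead of computing a power
        s += a / (2*k - 1)
    end
    return 4 * s
end

g(N::AbstractFloat) = g(Int(N))   # delegate: cast to Int, reuse the Integer method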

Matlab’s JIT has gotten much better for simple scalar loops since we first published Julia’s microbenchmarks.
