Chairmarks.jl

I think we have some very good reasons to believe that f is slower than g: f needs to do more work than g, as evidenced by the native code and LLVM IR generated for the respective functions.

Here are some macrobenchmarks I constructed that I think are more representative than your benchmark, because they eliminate the reduction you’re doing, which can make the differences less apparent:

julia> function macro_benchmark_5!(out, f)
           # Store f(UInt128(j), i) into out[i, j] for every index pair of `out`.
           # The second axis (j) is the outer loop and the first axis (i) the inner one.
           for j ∈ axes(out, 2)
               x = UInt128(j)
               for i ∈ axes(out, 1)
                   out[i, j] = f(x, i)
               end
           end
       end;

julia> function macro_benchmark_5_noinline!(out, f)
           # Same loop nest as macro_benchmark_5!: out[i, j] receives f(UInt128(j), i).
           # The only difference is the call-site @noinline on the call to f.
           for j ∈ axes(out, 2)
               x = UInt128(j)
               for i ∈ axes(out, 1)
                   # Call-site @noinline asks the compiler not to inline this call.
                   out[i, j] = @noinline f(x, i)
               end
           end
       end;

julia> function macro_benchmark_6!(f, N)
           # Evaluate f(UInt128(j), i) for every j and i in 1:N; nothing is stored.
           for j ∈ 1:N
               x = UInt128(j)
               for i ∈ 1:N
                   # The result is otherwise unused; Base.donotdelete keeps the
                   # compiler from eliminating the computation as dead code.
                   Base.donotdelete(f(x, i))
               end
           end
       end;

julia> function macro_benchmark_6_noinline!(f, N)
           # Same loop nest as macro_benchmark_6!: f(UInt128(j), i) for j, i in 1:N,
           # with nothing stored. The call to f additionally carries @noinline.
           for j ∈ 1:N
               x = UInt128(j)
               for i ∈ 1:N
                   # Call-site @noinline asks the compiler not to inline this call;
                   # Base.donotdelete keeps the unused result from being eliminated.
                   Base.donotdelete(@noinline f(x, i))
               end
           end
       end;

and here are the results I see:

julia> f(x, n) = x << n;  # left shift of x by n, shift count used as given

julia> g(x, n) = x << (n & 63);  # left shift with the count masked to its low 6 bits

julia> let
           # Driver: time each benchmark variant with f first, then with g.
           # `out` holds 10^8 UInt128 elements (16 bytes each) and is left uninitialized;
           # the macro_benchmark_5 variants overwrite every element.
           out = Matrix{UInt128}(undef, 10_000, 10_000)
           # Store results into `out`, call to f/g left to the compiler's inlining choice.
           @time macro_benchmark_5!(out, f)
           @time macro_benchmark_5!(out, g)
           println()
           # Store results into `out`, call marked @noinline.
           @time macro_benchmark_5_noinline!(out, f)
           @time macro_benchmark_5_noinline!(out, g)
           println()
           # No stores; results passed to Base.donotdelete.
           @time macro_benchmark_6!(f, 10_000)
           @time macro_benchmark_6!(g, 10_000)
           println()
           # No stores; results passed to Base.donotdelete, call marked @noinline.
           @time macro_benchmark_6_noinline!(f, 10_000)
           @time macro_benchmark_6_noinline!(g, 10_000)
       end
  0.196680 seconds
  0.116391 seconds

  0.581224 seconds
  0.497155 seconds

  0.130048 seconds
  0.129944 seconds

  0.281116 seconds
  0.216197 seconds

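demonstrating a performance difference between the two in three of the four benchmarks (the inlined `Base.donotdelete` variant, macro_benchmark_6!, shows essentially no difference).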

Here’s my versioninfo():

julia> versioninfo()
Julia Version 1.10.2
Commit bd47eca2c8a (2024-03-01 10:14 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 12 × AMD Ryzen 5 5600X 6-Core Processor
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, znver3)
Threads: 6 default, 0 interactive, 3 GC (on 12 virtual cores)
Environment:
  JULIA_NUM_THREADS = 6
1 Like