I think we have some very good reasons to think that `f` is a slower function than `g` (namely that `f` needs to do more work than `g`, as evidenced by the native code and LLVM IR for the respective functions).
Here are some macrobenchmarks I constructed that I think are more representative than your benchmark, because they eliminate the reduction you’re doing, which can make the differences less apparent:
julia> function macro_benchmark_5!(out, f)
# Benchmark kernel: overwrite every element of the two-index array `out` with `f(x, i)`.
# Each result is stored individually; nothing is reduced or accumulated.
for j ∈ axes(out, 2)
# One 128-bit operand per column; it stays fixed across the whole inner loop.
x = UInt128(j)
# The inner loop runs over the first dimension, matching Julia's column-major layout.
# The row index `i` doubles as the second argument passed to `f`.
for i ∈ axes(out, 1)
out[i, j] = f(x, i)
end
end
end;
julia> function macro_benchmark_5_noinline!(out, f)
# Benchmark kernel: overwrite every element of the two-index array `out` with `f(x, i)`,
# with the call to `f` marked `@noinline` at the call site.
for j ∈ axes(out, 2)
# One 128-bit operand per column; it stays fixed across the whole inner loop.
x = UInt128(j)
# The inner loop runs over the first dimension, matching Julia's column-major layout.
for i ∈ axes(out, 1)
# Call-site `@noinline` asks the compiler not to inline this particular call to `f`.
out[i, j] = @noinline f(x, i)
end
end
end;
julia> function macro_benchmark_6!(f, N)
# Benchmark kernel: call `f(x, i)` once for every pair (j, i) in 1:N × 1:N without
# storing the results in an array.
# NOTE(review): the `!` suffix is kept for naming symmetry with the other kernels,
# presumably; no argument is visibly mutated here.
for j ∈ 1:N
# One 128-bit operand per outer iteration; it stays fixed across the inner loop.
x = UInt128(j)
for i ∈ 1:N
# `Base.donotdelete` marks the result as used so the call to `f` is not removed
# as dead code.
Base.donotdelete(f(x, i))
end
end
end;
julia> function macro_benchmark_6_noinline!(f, N)
# Benchmark kernel: call `f(x, i)` once for every pair (j, i) in 1:N × 1:N without
# storing the results in an array, with the call to `f` marked `@noinline`.
for j ∈ 1:N
# One 128-bit operand per outer iteration; it stays fixed across the inner loop.
x = UInt128(j)
for i ∈ 1:N
# Call-site `@noinline` asks the compiler not to inline this call to `f`;
# `Base.donotdelete` marks the result as used so the call is not removed as dead code.
Base.donotdelete(@noinline f(x, i))
end
end
end;
and here are the results I see:
julia> f(x, n) = x << n;  # left shift by `n`, with no restriction on the shift count
julia> g(x, n) = x << (n & 63);  # left shift by the low 6 bits of `n`, so the count is always in 0:63
julia> let
# Driver: time each benchmark kernel with `f` and then with `g`, in four groups
# separated by a blank line.
# NOTE(review): `@time` on the first call of a kernel with a new function argument
# includes compilation time; confirm a warm-up run was done before these numbers were taken.
# Uninitialized 10_000 × 10_000 UInt128 matrix (10^8 elements of 16 bytes each, about 1.6 GB);
# the macro_benchmark_5 kernels overwrite every element.
out = Matrix{UInt128}(undef, 10_000, 10_000)
# Group 1: results stored into `out`, calls to `f`/`g` free to inline.
@time macro_benchmark_5!(out, f)
@time macro_benchmark_5!(out, g)
println()
# Group 2: results stored into `out`, calls marked `@noinline`.
@time macro_benchmark_5_noinline!(out, f)
@time macro_benchmark_5_noinline!(out, g)
println()
# Group 3: results passed to `Base.donotdelete`, calls free to inline.
@time macro_benchmark_6!(f, 10_000)
@time macro_benchmark_6!(g, 10_000)
println()
# Group 4: results passed to `Base.donotdelete`, calls marked `@noinline`.
@time macro_benchmark_6_noinline!(f, 10_000)
@time macro_benchmark_6_noinline!(g, 10_000)
end
0.196680 seconds
0.116391 seconds
0.581224 seconds
0.497155 seconds
0.130048 seconds
0.129944 seconds
0.281116 seconds
0.216197 seconds
consistently demonstrating a performance difference between the two.
Here’s my `versioninfo()`:
julia> versioninfo()
Julia Version 1.10.2
Commit bd47eca2c8a (2024-03-01 10:14 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 12 × AMD Ryzen 5 5600X 6-Core Processor
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-15.0.7 (ORCJIT, znver3)
Threads: 6 default, 0 interactive, 3 GC (on 12 virtual cores)
Environment:
JULIA_NUM_THREADS = 6