I think there’s a benefit to having both. Having your macro return a branch means that the body of the function is ‘bigger’, and as a result Julia’s optimizer is more likely to give up on it early.
Here’s what I mean by bigger:
julia> macro usethreads_branch(multithreaded, expr::Expr)
           ex = quote
               if $multithreaded
                   Threads.@threads $expr
               else
                   $expr
               end
           end
           esc(ex)
       end
@usethreads_branch (macro with 1 method)
julia> macro usethreads_nobranch(multithreaded::Bool, expr::Expr)
           if multithreaded
               return esc(:(Threads.@threads $expr))
           else
               return esc(:($expr))
           end
       end
@usethreads_nobranch (macro with 1 method)
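You can see the difference directly with @macroexpand: with a literal false, the nobranch version should expand to nothing but the plain loop, while the branch version still carries the dead threaded branch inside an if false block. A quick sketch you can paste into the REPL (output omitted here; @macroexpand only expands the macros, so v doesn’t need to exist yet):

# Compare the two expansions: the nobranch macro drops the dead branch at
# expansion time, while the branch macro keeps both arms in its output.
@macroexpand @usethreads_nobranch false for i ∈ eachindex(v)
    v[i] = v[i] + 1
end

@macroexpand @usethreads_branch false for i ∈ eachindex(v)
    v[i] = v[i] + 1
end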
julia> function f1!(v)
           @usethreads_branch false for i ∈ eachindex(v)
               v[i] = v[i] + 1
           end
       end
f1! (generic function with 1 method)
julia> function f2!(v)
           @usethreads_nobranch false for i ∈ eachindex(v)
               v[i] = v[i] + 1
           end
       end
f2! (generic function with 1 method)
julia> @code_lowered f1!([1,2,3])
CodeInfo(
1 ── Core.NewvarNode(:(threadsfor_fun))
│ Core.NewvarNode(:(@_4))
└─── goto #9 if not false
2 ── range = Main.eachindex(v)
│ %5 = Main.:(var"#67#threadsfor_fun#13")
│ %6 = Core.typeof(v)
│ %7 = Core.typeof(range)
│ %8 = Core.apply_type(%5, %6, %7)
│ threadsfor_fun = %new(%8, v, range)
│ %10 = Base.Threads.threadid()
│ %11 = %10 != 1
└─── goto #4 if not %11
3 ── @_7 = %11
└─── goto #5
4 ── %15 = $(Expr(:foreigncall, :(:jl_in_threaded_region), Int32, svec(), 0, :(:ccall)))
└─── @_7 = %15 != 0
5 ┄─ goto #7 if not @_7
6 ── %18 = Base.invokelatest
│ %19 = threadsfor_fun
│ (%18)(%19, true)
└─── goto #8
7 ── Base.Threads.threading_run(threadsfor_fun)
8 ┄─ %23 = Base.Threads.nothing
└─── return %23
9 ── %25 = Main.eachindex(v)
│ @_4 = Base.iterate(%25)
│ %27 = @_4 === nothing
│ %28 = Base.not_int(%27)
└─── goto #12 if not %28
10 ┄ %30 = @_4
│ i = Core.getfield(%30, 1)
│ %32 = Core.getfield(%30, 2)
│ %33 = Base.getindex(v, i)
│ %34 = %33 + 1
│ Base.setindex!(v, %34, i)
│ @_4 = Base.iterate(%25, %32)
│ %37 = @_4 === nothing
│ %38 = Base.not_int(%37)
└─── goto #12 if not %38
11 ─ goto #10
12 ┄ return
)
julia> @code_lowered f2!([1,2,3])
CodeInfo(
1 ─ %1 = Main.eachindex(v)
│ @_3 = Base.iterate(%1)
│ %3 = @_3 === nothing
│ %4 = Base.not_int(%3)
└── goto #4 if not %4
2 ┄ %6 = @_3
│ i = Core.getfield(%6, 1)
│ %8 = Core.getfield(%6, 2)
│ %9 = Base.getindex(v, i)
│ %10 = %9 + 1
│ Base.setindex!(v, %10, i)
│ @_3 = Base.iterate(%1, %8)
│ %13 = @_3 === nothing
│ %14 = Base.not_int(%13)
└── goto #4 if not %14
3 ─ goto #2
4 ┄ return
)
Notice that the lowered code for f2! is much smaller than that for f1!. In this simple case it should have no performance impact, but I believe there will be cases with more complicated functions where this causes a substantial performance degradation, because it will prevent inlining or cause the optimizer to give up.
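If you want to check whether the extra lowered code actually costs anything for a particular function, the simplest check is to benchmark the two versions directly. A minimal sketch, assuming BenchmarkTools.jl is installed (for the trivial loop above I’d expect both to time essentially the same):

using BenchmarkTools

v = rand(10_000)
@btime f1!($v)  # macro returned an `if false` branch; the dead code survives lowering
@btime f2!($v)  # branch resolved at macro-expansion time; lowered code is just the loop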
My understanding is that Julia’s compiler essentially applies a cost model while it tries to apply optimizations, and if a function is too expensive to optimize it will give up, meaning that a very big function can sometimes be slower than micro-benchmarks of its constituent parts would suggest.
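On recent Julia versions you can also peek at the cost the inliner assigns to each lowered statement with Base.print_statement_costs. This is an internal developer tool (its availability and output format may vary between versions), but it gives a rough sense of how much ‘bigger’ the branch version looks to the optimizer:

# Rough diagnostic: print the inlining cost model's estimate for each statement.
# (Internal tool; treat the exact numbers and format as version-dependent.)
Base.print_statement_costs(stdout, f1!, (Vector{Float64},))
Base.print_statement_costs(stdout, f2!, (Vector{Float64},))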