I just implemented a custom adjoint for ThreadsX.sum by replacing the map and sum calls in the regular rrule with ThreadsX versions. I am able to achieve a speedup of a factor of 6.5 using 10 threads on an Apple M1 Pro in my use case (and no segfault). I am attaching the implementation. Thank you for the helpful pointer @mcabbot!
using ThreadsX
using ChainRules
using ChainRules: RuleConfig, HasReverseMode, rrule_via_ad, ProjectTo, NoTangent, unthunk
function ChainRules.rrule(
config::RuleConfig{>:HasReverseMode}, ::typeof(ThreadsX.sum), f, xs::AbstractArray)
fx_and_pullbacks = ThreadsX.map(x->rrule_via_ad(config, f, x), xs)
y = ThreadsX.sum(first, fx_and_pullbacks)
pullbacks = ThreadsX.map(last, fx_and_pullbacks)
project = ProjectTo(xs)
function sum_pullback(ȳ)
call(f, x) = f(x)
# if dims is :, then need only left-handed only broadcast
# broadcast_ȳ = dims isa Colon ? (ȳ,) : ȳ
broadcast_ȳ = ȳ
f̄_and_x̄s = ThreadsX.map(f->f(ȳ), pullbacks)
# no point thunking as most of work is in f̄_and_x̄s which we need to compute for both
f̄ = if fieldcount(typeof(f)) === 0 # Then don't need to worry about derivative wrt f
NoTangent()
else
ThreadsX.sum(first, f̄_and_x̄s)
end
x̄s = ThreadsX.map(unthunk ∘ last, f̄_and_x̄s) # project does not support receiving InplaceableThunks
return NoTangent(), f̄, project(x̄s)
end
return y, sum_pullback
end