Reducing Channel Lock Contention with Batches

mrufsvold · November 4, 2024, 2:06am

This has been asked before, but it looks like it didn’t go anywhere. Channels are very powerful, but the lock overhead makes it slow for iterating. Bundling items into batches before take!/put! would help with this overhead.

I wrote up a proof of concept by stealing heavily from Base:

function Base.append!(c::Channel{T}, vec::AbstractArray) where {T}
    current_idx = firstindex(vec)
    final_idx = lastindex(vec)
    final_idx_plus_one = final_idx + 1

    elements_to_add = length(vec)
    # Increment channel n_avail eagerly (before push!) to count data in the
    # buffer as well as offers from tasks which are blocked in wait().
    Base._increment_n_avail(c, elements_to_add)
    while current_idx <= final_idx
        lock(c)
        did_buffer = false
        try
            while length(c.data) == c.sz_max
                Base.check_channel_state(c)
                wait(c.cond_put)
            end
            # Grab a chunk of items that will fit in the channel's buffer
            available_space = c.sz_max - length(c.data)
            next_idx = min(final_idx_plus_one, current_idx + available_space)
            chunk = Iterators.map(x -> convert(T, x), view(vec, current_idx:(next_idx-1)))

            Base.check_channel_state(c)
            append!(c.data, chunk)
            # We successfully added chunk, so decrement our elements to add in case of
            # errors
            elements_to_add -= next_idx - current_idx
            did_buffer = true
            notify(c.cond_take, nothing, true, false)
            # notify all, since some of the waiters may be on a "fetch" call.
            next_idx > final_idx && break
            current_idx = next_idx
        finally
            # Decrement the available items if this task had an exception before pushing the
            # item to the buffer (e.g., during `wait(c.cond_put)`):
            did_buffer || Base._increment_n_avail(c, -elements_to_add)
            unlock(c)
        end
    end
    return c
end


function take_batch!(c::Channel{T}, n) where {T}
    if c.sz_max < n
        throw(ArgumentError("Batch size, $n, is too large for a channel with buffer length $(c.sz_max)"))
    end
    lock(c)
    try
        while isempty(c.data)
            Base.check_channel_state(c)
            wait(c.cond_take)
        end

        take_n = min(n, length(c.data))
        ret = Vector{T}(undef, take_n)
        @inbounds for i in eachindex(ret)
            ret[i] = c.data[i]
        end
        foreach(_ -> popfirst!(c.data), 1:take_n)
        Base._increment_n_avail(c, -take_n)
        notify(c.cond_put, nothing, false, false) # notify only one, since only one slot has become available for a put!.
        return ret
    finally
        unlock(c)
    end
end

Here is a benchmark which indicates to me that this direction has promise:

julia> function bench_basic(item_n, buffer_len)
           items = collect(1:item_n)
           ch = Channel{Int}(buffer_len)
           task_n = Threads.nthreads()
           res = Vector{Int}(undef, item_n * task_n)

           for _ in 1:task_n
               Threads.@spawn begin
                   for j in items
                       put!(ch, j)
                   end
               end
           end

           @sync for i in Base.OneTo(task_n)
               Threads.@spawn let offset = (i - 1) * item_n
                   for j in Base.OneTo(item_n)
                       x = take!(ch)
                       res[offset+j] = x
                   end
               end
           end
           res
       end
bench_basic (generic function with 1 method)

julia> function bench_batch(item_n, buffer_len)
           items = collect(1:item_n)
           ch = Channel{Int}(buffer_len)
           task_n = Threads.nthreads()
           res = Vector{Int}(undef, item_n * task_n)

           ch = Channel{Int}(buffer_len)
           for _ in 1:task_n
               Threads.@spawn begin
                   i = 1
                   while i <= item_n
                       append!(ch, @view items[i:min(i + buffer_len - 1, item_n)])
                       i += buffer_len
                   end
               end
           end

           @sync for i in Base.OneTo(task_n)
               Threads.@spawn let offset = (i - 1) * item_n
                   batch = take_batch!(ch, buffer_len)
                   batch_len = length(batch)
                   batch_i = 1
                   for j in Base.OneTo(item_n)
                       if batch_i > batch_len
                           batch = take_batch!(ch, buffer_len)
                           batch_i = 1
                           batch_len = length(batch)
                       end
                       x = batch[batch_i]
                       res[offset+j] = x
                       batch_i += 1
                   end
               end

           end
           res
       end
bench_batch (generic function with 1 method)

julia> @b bench_basic(10000, 10)
186.438 ms (248 allocs: 724.531 KiB)

julia> @b bench_batch(10000, 10)
62.537 ms (171511 allocs: 4.534 MiB)

julia> @b bench_basic(10000, 100)
104.146 ms (104 allocs: 721.078 KiB)

julia> @b bench_batch(10000, 100)
6.494 ms (157111 allocs: 3.792 MiB)

julia> @b bench_basic(10000, 1000)
101.432 ms (106 allocs: 767.453 KiB)

julia> @b bench_batch(10000, 1000)
3.913 ms (194872 allocs: 4.364 MiB)

Edit: a previous version of this post had incorrect benchmarks. Still, we see that as the buffer size of the channel increases, we get better and better performance.

I have a rough draft for how to do this with unbuffered channels, but it’s still WIP.

Sukera · November 5, 2024, 6:41am

Won’t this deadlock? You’re waiting on the condition variable while holding the lock.

mrufsvold · November 5, 2024, 8:22am

This is exactly how take/put are implemented in Base. I thought the same thing, but if you run the code, you’ll see that, empirically, it doesn’t deadlock. I’m hoping someone with a better understanding of Reentrant locks can help make sense of it.

mrufsvold · November 5, 2024, 12:59pm

FWIW, my guess is that if you wait on a condition that was created with the lock that you have locked, then the scheduler knows you can’t be mutating any of the resources related to the lock, so it’s safe to switch tasks?

mrufsvold · November 6, 2024, 11:59pm

I have a PR up. I have decent test coverage, but I’d love help making sure I haven’t missed an edge case.

github.com/JuliaLang/julia

Batch Channel Operations

JuliaLang:master ← mrufsvold:mr/batch_channel_ops

opened 05:15PM - 06 Nov 24 UTC

mrufsvold

+354 -1

Add `append!(ch, iter)` and `take!(ch, n)` functions that operate on channels in… batches. When using a buffered channel, this significantly reduces the overhead of acquiring the lock of reach `put!/take!`. Here is some benchmarking code: ```julia function bench_basic(item_n, buffer_len) items = collect(1:item_n) ch = Channel{Int}(buffer_len) task_n = Threads.nthreads() res = Vector{Int}(undef, item_n * task_n) for _ in 1:task_n Threads.@spawn begin for j in items put!(ch, j) end end end @sync for i in Base.OneTo(task_n) Threads.@spawn let offset = (i - 1) * item_n for j in Base.OneTo(item_n) x = take!(ch) res[offset+j] = x end end end res end function bench_batch(item_n, buffer_len, batch_size=buffer_len > 0 ? buffer_len : 100) items = collect(1:item_n) ch = Channel{Int}(buffer_len) task_n = Threads.nthreads() res = Vector{Int}(undef, item_n * task_n) for _ in 1:task_n Threads.@spawn begin i = 1 while i <= item_n chunk = @view items[i:min(i + batch_size - 1, item_n)] append!(ch, chunk) i += batch_size end end end @sync for i in Base.OneTo(task_n) Threads.@spawn let offset = (i - 1) * item_n buff = Vector{Int}(undef, batch_size) batch = take!(ch, batch_size, buff) batch_len = length(batch) batch_i = 1 for j in Base.OneTo(item_n) if batch_i > batch_len batch = take!(ch, batch_size, buff) batch_i = 1 batch_len = length(batch) end x = batch[batch_i] res[offset+j] = x batch_i += 1 end end end res end ``` Here are results at different buffer sizes: ```julia julia> using Chairmarks julia> @be bench_basic(10000, 0) Benchmark: 1 sample with 1 evaluation 129.346 ms (76015 allocs: 1.851 MiB) julia> @be bench_batch(10000, 0) Benchmark: 1 sample with 1 evaluation 128.819 ms (76031 allocs: 1.858 MiB) julia> GC.gc() julia> @be bench_basic(10000, 10) Benchmark: 2 samples with 1 evaluation 71.047 ms (104 allocs: 709.797 KiB) 110.748 ms (104 allocs: 709.797 KiB) julia> @be bench_batch(10000, 10) Benchmark: 4 samples with 1 evaluation 23.534 ms (119 allocs: 710.703 KiB) 24.859 ms (119 allocs: 710.703 KiB) 25.958 ms (119 allocs: 710.703 KiB) 29.972 ms (119 allocs: 710.703 KiB) julia> GC.gc() julia> @be bench_basic(10000, 100) Benchmark: 3 samples with 1 evaluation 34.429 ms (107 allocs: 716.844 KiB) 37.284 ms (107 allocs: 716.844 KiB) 37.844 ms (107 allocs: 716.844 KiB) julia> @be bench_batch(10000, 100) Benchmark: 21 samples with 1 evaluation min 3.267 ms (119 allocs: 717.594 KiB) median 3.525 ms (119 allocs: 717.594 KiB) mean 4.724 ms (119 allocs: 717.594 KiB) max 15.609 ms (119 allocs: 717.594 KiB) julia> GC.gc() julia> @be bench_basic(10000, 1000) Benchmark: 2 samples with 1 evaluation 49.205 ms (111 allocs: 763.156 KiB) 113.279 ms (111 allocs: 763.156 KiB) julia> @be bench_batch(10000, 1000) Benchmark: 44 samples with 1 evaluation min 1.658 ms (128 allocs: 780.312 KiB) median 1.889 ms (128 allocs: 780.312 KiB) mean 2.168 ms (128 allocs: 780.312 KiB) max 7.967 ms (128 allocs: 780.312 KiB) ```

Topic		Replies	Views
Improving Channel throughput: batching Internals & Design suggestions	3	657	December 24, 2021
Two question on Channel operation General Usage question	4	458	June 3, 2020
Why do take! and put! for channels allocate memory? Performance memory-allocation , channel	1	826	February 18, 2022
Why `lock` does NOT work for full channel? General Usage question	10	853	August 4, 2021
Confusion about tasks and Channels General Usage question	1	855	August 15, 2017

Reducing Channel Lock Contention with Batches

Related topics