FLoops @init allocate only once

Thanks for all that. I finally got the idea of the channels.

On the other hand, this pattern appears to have quite a large overhead relative to using threadid() (if there is nothing wrong with that code). I mentioned to @Salmon in private that I have this very simple package, ChunkSplitters.jl, meant exactly to help avoid the threadid() pattern without that kind of overhead. It is something simple, almost a manual splitting of the loop into the desired number of chunks. With it, his code is written as:

using ChunkSplitters
"""
    test_chunks(arr, Buffer)

Call `useBuff!` for every `(row, column)` of `arr`, threading over chunks of the row
indices. The rows are split with `chunks` into `length(Buffer)` pieces, and each piece
works with the buffer stored at its own chunk number, so no `threadid()` lookup is
needed. Returns `arr`.
"""
function test_chunks(arr,Buffer)
    nchunks = length(Buffer)
    Threads.@threads for (rows, chunk_id) in chunks(axes(arr,1), nchunks)
        # One scratch buffer per chunk: chunk numbers index `Buffer` directly.
        scratch = Buffer[chunk_id]
        for row in rows
            for col in axes(arr,2)
                useBuff!(arr,row,col,scratch)
            end
        end
    end
    return arr
end

I am still looking for a more widely agreed-upon alternative to that, if there is one. I get:

julia> timeall()
@threads with threadid():
  138.663 μs (48 allocations: 4.17 KiB)
chunks from ChunkSplitters:
  139.187 μs (48 allocations: 4.33 KiB)
channels:
  162.981 μs (57 allocations: 4.48 KiB)

With this code:

code
using FLoops
using ChunkSplitters
using BenchmarkTools
using Test

"""
    useBuff!(arr, i, j, Buff)

Toy workload that exercises a scratch buffer: reset `Buff` to zero, add to every entry
its own index, then store `Buff[i]` in `arr[i, j]`. Returns the stored value.
"""
function useBuff!(arr,i,j,Buff)
    # Clear whatever a previous call left in the scratch buffer.
    fill!(Buff, 0)
    for idx in eachindex(Buff)
        Buff[idx] = Buff[idx] + idx
    end
    val = Buff[i]
    arr[i,j] = val
    return val
end

"""
    test_threadid(arr, Buffer)

Call `useBuff!` for every `(i, j)` of `arr`, threading over the rows and selecting the
scratch buffer with `Buffer[Threads.threadid()]`. Returns `arr`.

The loop uses the `:static` schedule so that each iteration stays on one thread for its
whole duration; with the default schedule a task may migrate between threads, which makes
indexing by `threadid()` unreliable. `Buffer` must contain an entry for every thread id
that runs the loop. Note that `@threads :static` cannot be nested inside another
`@threads` loop.
"""
function test_threadid(arr, Buffer)
    # `:static` keeps `threadid()` constant within an iteration, so the buffer picked
    # here is not shared with another iteration running at the same time.
    Threads.@threads :static for i in axes(arr,1)
        Buff = Buffer[Threads.threadid()]
        for j in axes(arr,2)
            useBuff!(arr,i,j,Buff)
        end
    end
    return arr
end

"""
    test_chunks(arr, Buffer)

Call `useBuff!` for every `(row, column)` of `arr`, threading over chunks of the row
indices. The rows are split with `chunks` into `length(Buffer)` pieces, and each piece
works with the buffer stored at its own chunk number, so no `threadid()` lookup is
needed. Returns `arr`.
"""
function test_chunks(arr,Buffer)
    nchunks = length(Buffer)
    Threads.@threads for (rows, chunk_id) in chunks(axes(arr,1), nchunks)
        # One scratch buffer per chunk: chunk numbers index `Buffer` directly.
        scratch = Buffer[chunk_id]
        for row in rows
            for col in axes(arr,2)
                useBuff!(arr,row,col,scratch)
            end
        end
    end
    return arr
end

"""
    test_channel(arr, channel)

Call `useBuff!` for every `(i, j)` of `arr`, threading over the rows. Each iteration
borrows a scratch buffer from `channel` with `take!` and gives it back with `put!` once
the row is done. Returns `arr`.

`channel` is expected to be pre-filled with buffers by the caller; `take!` blocks while
the channel is empty.
"""
function test_channel(arr,channel)
    Threads.@threads for i in axes(arr,1)
        Buff = take!(channel)
        try
            for j in axes(arr,2)
                useBuff!(arr,i,j,Buff)
            end
        finally
            # Always return the buffer: if it were lost on an error, the channel would
            # hold one buffer fewer and later `take!` calls could block forever.
            put!(channel, Buff)
        end
    end
    return arr
end

"""
    timeall()

Check that the three buffer strategies (`test_threadid`, `test_chunks`, `test_channel`)
produce the same result on a `200 × 20` array, then benchmark each with `@btime` and print
the timings. Returns `nothing`.
"""
function timeall()
    N = 200
    arr = zeros(N,20)
    # One scratch buffer per thread; each has length N because `useBuff!` reads `Buff[i]`
    # for row indices up to N.
    Buffer = [zeros(N) for _ in 1:Threads.nthreads()]

    # The channel holds the very same buffers, so all strategies reuse one allocation set.
    channel = Channel{eltype(Buffer)}(length(Buffer))
    for buff in Buffer
        put!(channel, buff)
    end

    # Correctness check on independent copies before timing anything.
    arr0 = copy(arr)
    test_threadid(arr0, Buffer)
    arr1 = copy(arr)
    test_chunks(arr1, Buffer)
    arr2 = copy(arr)
    test_channel(arr2, channel)
    @test isapprox(arr0,arr1)
    # Compare against the channel result as well (previously arr1 was tested twice).
    @test isapprox(arr0,arr2)

    println("@threads with threadid():")
    @btime test_threadid($arr,$Buffer)
    println("chunks from ChunkSplitters:")
    @btime test_chunks($arr,$Buffer)
    println("channels:")
    @btime test_channel($arr,$channel)

    nothing
end

EDIT: There is an overhead, but not as large. I was calling take!/put! on the channel in the inner loop. Thus, it can be a nice alternative if the inner operations on each thread are not very fast relative to that overhead. And it is a nice style overall.

1 Like