Hi,

Does anyone know if there are any tricks to using FLoops thread-safe buffers for better performance? I am trying to adapt my code which makes use of Buffers such as `Buffers[Threads.threadid()]`

to the recommended approach of using `FLoops.@init`

.

However I see drastically increased allocations.

I think this is because `@init`

allocates once every time before the parallelized loop runs. If the function that contains this loop is used often it needs to re-allocate the buffers every time.

Is there an elegant way around this?

In case that helps, here is a MWE:

## Summary

```
using FLoops
function useBuff!(arr,i,j,Buff)
Buff .= 0
for k in eachindex(Buff)
Buff[k] += k
end
arr[i,j] = Buff[i]
end
function testBuff(arr,Buffer)
@sync for i in axes(arr,1)
for j in axes(arr,2)
Threads.@spawn begin
Buff = Buffer[Threads.threadid()]
useBuff!(arr,i,j,Buff)
end
end
end
return arr
end
function testFloopsBuff(arr,Buffer)
@floop for i in axes(arr,1), j in axes(arr,2)
Buff = Buffer[Threads.threadid()]
useBuff!(arr,i,j,Buff)
end
return arr
end
function testFloopsNoBuff(arr)
@floop ThreadedEx(basesize = 1) for i in axes(arr,1), j in axes(arr,2)
@init Buff = zeros(size(arr,1))
useBuff!(arr,i,j,Buff)
end
return arr
end
function timeall()
N = 200
arr = zeros(N,20)
@time begin
Buffer = [zeros(N) for _ in 1:Threads.nthreads()]
x = 0.
for i in 1:300
x += maximum(testBuff(arr,Buffer))
end
x
end
@time begin
Buffer = [zeros(N) for _ in 1:Threads.nthreads()]
x = 0.
for i in 1:300
x += maximum(testFloopsBuff(arr,Buffer))
end
x
end
@time begin
x = 0.
for i in 1:300
x += maximum(testFloopsNoBuff(arr))
end
x
end
end
timeall()
timeall()
```