Moving the threading to the outermost loop removes the data races, and also runs twice as fast (on my laptop):
"""
    bar!(S, lev)

For each index `k` along the third axis of `S`, average the ratios
`S[j, i, k, 1] / S[j, i, k, 2]` over all `(j, i)` for which
`S[j, i, k, 1] > 0.05` and `S[j, i, k, 2] > 0`, and store the result in `lev[k]`.

The loop over `k` is run with `Threads.@threads`; each iteration keeps its own
accumulators and writes only to its own `lev[k]`. When no element of a slice
qualifies, the stored value is `0.0 / 0` (`NaN`).

Returns `lev`.
"""
function bar!(S, lev)
    Threads.@threads for k in axes(S, 3)
        total = 0.0
        hits = 0
        # Innermost index runs over the first axis of S (column-major order).
        for col in axes(S, 2), row in axes(S, 1)
            num = S[row, col, k, 1]
            num > 0.05 || continue
            # Only read the denominator once the numerator has qualified.
            den = S[row, col, k, 2]
            den > 0 || continue
            total += num / den
            hits += 1
        end
        lev[k] = total / hits
    end
    return lev
end
Edit: Sorry! Bad typo in original code!!
The speedup from moving the `@threads` macro call is actually >12x.