I’ve written some code that works pretty well in isolation. A minimal working example is below; essentially, I’m copying values from a set of input arrays into another set of output arrays.

*Edit:* I’m going to simplify this question to: is there anything I should be doing or thinking about with regard to parallelization in this example? Do repeated calls to the parallel segment generate overhead, and if so, is there anything I can do about it?

Here’s the MWE

```
using Random, BenchmarkTools
# ---------------------------------------------------------------------------
# Benchmark setup: build `totM` random input matrices, `totM` zero-filled
# output matrices, and a shuffled list of output coordinates that is then cut
# into one slice per input matrix (`storeC`).
# ---------------------------------------------------------------------------
totM = 10 # number of input arrays (and of output arrays)
xsize = 2000 # upper bound on the first dimension of each array
ysize = 2000 # upper bound on the second dimension of each array
a = [rand(1:xsize) for i = 1:totM] # random first-dimension sizes, one per input array
b = [rand(1:ysize) for i = 1:totM] # random second-dimension sizes, one per input array
input = [rand(a[i],b[i]) for i = 1:totM] # input[i] is an a[i]-by-b[i] matrix of random Float64
# `a` and `b` are shuffled independently, so output[i] generally has a different
# shape than input[i], and the total element counts of `input` and `output`
# need not match.
sa = shuffle(a) # first-dimension sizes of the output arrays
sb = shuffle(b) # second-dimension sizes of the output arrays
output = [zeros(sa[i],sb[i]) for i = 1:totM]
# destinations[w] holds one [w, i, j] triple for every element (i, j) of output[w].
destinations = [[[w,i,j] for i = 1:sa[w] for j = 1:sb[w]] for w = 1:totM]# could also be a vector
coords = shuffle(vcat(destinations...)) # all output coordinates, flattened and randomly permuted
totEl = [sum(w->length(output[w]),1:length(output))] # one-element vector: total number of output elements
# storeC[g] is the slice of `coords` assigned to input[g].
storeC = Array{Array{Array{Int64,1},1},1}(undef,totM)
for g = 1:totM
# offset = number of elements in input[1], ..., input[g-1]
offset = 0
for w = 1:g-1
offset += length(input[w])
end
# max(...) keeps the first slice from starting at index 0.
# NOTE(review): for g >= 2 this starts the slice at `offset`, which is also the
# last index of the previous slice whenever that slice was not clamped by
# `totEl`; `offset + 1` looks like the intended start -- confirm.
start = max(offset,1)
endpt = offset + length(input[g])
# Clamp to the number of available coordinates; if start > stop the range is
# empty and storeC[g] ends up empty.
stop = min(endpt,totEl[1])
storeC[g] = coords[start:stop] # list of [w, i, j] destinations for the data of input[g]
end
# totM minus the number of non-empty entries of storeC, i.e. the number of
# empty entries. Not referenced again in the visible code.
maxM = totM - sum([length(storeC[g]) > 0 for g = 1:length(storeC)])
"""
    serial(totM, storeC, output, input)

Scatter the elements of `input[1:totM]` into `output` on the calling thread.

For each `g`, the element of `input[g]` with column-major linear index `z` is
written to `output[w][i, j]`, where `[w, i, j] = storeC[g][z]`. Elements whose
index `z` exceeds `length(storeC[g])` have no destination and are skipped.

# Arguments
- `totM`: number of leading entries of `input` / `storeC` to process.
- `storeC`: `storeC[g][z]` is the `[w, i, j]` destination of element `z` of `input[g]`.
- `output`: destination matrices, modified in place.
- `input`: source matrices, read only.

# Returns
`nothing`.
"""
function serial(totM::Int64,storeC::Array{Array{Array{Int64,1},1},1},output::Array{Array{Float64,2},1},input::Array{Array{Float64,2},1})
    # NOTE(review): running this outer loop on several threads is only safe if
    # no destination appears in more than one storeC[g].
    for g = 1:totM
        # Hoist the per-`g` lookups out of the element loops.
        src = input[g]
        dests = storeC[g]
        nrows, ncols = size(src)
        ndest = length(dests)
        for y = 1:ncols
            for x = 1:nrows
                # Column-major linear index of (x, y) within `src`.
                z = x + nrows * (y - 1)
                # `<=`, not `<`: the last destination entry is valid as well.
                if z <= ndest
                    dest = dests[z]
                    output[dest[1]][dest[2], dest[3]] = src[x, y]
                end
            end
        end
    end
    return nothing
end
"""
    parallely(totM, storeC, output, input)

Scatter the elements of `input[1:totM]` into `output`, threading over the
columns of each input matrix.

The arrays are processed one after another; for each `input[g]` the columns
are distributed over the available threads with `Threads.@threads`. The element
of `input[g]` with column-major linear index `z` is written to `output[w][i, j]`,
where `[w, i, j] = storeC[g][z]`. Elements whose index `z` exceeds
`length(storeC[g])` have no destination and are skipped.

The entries of a single `storeC[g]` must be pairwise distinct; otherwise two
threads may write to the same output element concurrently.

# Arguments
- `totM`: number of leading entries of `input` / `storeC` to process.
- `storeC`: `storeC[g][z]` is the `[w, i, j]` destination of element `z` of `input[g]`.
- `output`: destination matrices, modified in place.
- `input`: source matrices, read only.

# Returns
`nothing`.
"""
function parallely(totM::Int64,storeC::Array{Array{Array{Int64,1},1},1},output::Array{Array{Float64,2},1},input::Array{Array{Float64,2},1})
    for g = 1:totM
        # Hoist the per-`g` lookups so the threaded body only touches locals.
        src = input[g]
        dests = storeC[g]
        nrows, ncols = size(src)
        ndest = length(dests)
        # One threaded region is entered per `g`, i.e. `totM` times per call.
        Threads.@threads for y = 1:ncols
            for x = 1:nrows
                # Column-major linear index of (x, y) within `src`.
                z = x + nrows * (y - 1)
                # `<=`, not `<`: the last destination entry is valid as well.
                if z <= ndest
                    dest = dests[z]
                    output[dest[1]][dest[2], dest[3]] = src[x, y]
                end
            end
        end
    end
    return nothing
end
# Interpolate the global arguments with `$` so that BenchmarkTools measures the
# calls themselves rather than access to non-constant globals. The timings in
# the trailing comments are the ones originally reported for the
# un-interpolated calls.
@btime serial($totM, $storeC, $output, $input) # 2.382 s (0 allocations: 0 bytes)
@btime parallely($totM, $storeC, $output, $input) # 666.258 ms (213 allocations: 16.66 KiB)
```