The following code cuts down allocations:

```
function sim2(M,R,S,q,numIters,numGrids)
tmp1 = zeros(size(q,2), size(S,2))
tmp2 = zeros(numGrids, numGrids)
tmp3 = zeros(M)
tmp4 = zeros(M)
tmp5 = zeros(size(M,1), size(q,2))
for i = 1:numIters
At_mul_B!(tmp1, q, S)
tmp2 .= R .+ reshape(tmp1, numGrids, numGrids)
tmp3 .= M .- tmp2
tmp4 .= M .+ tmp2
A_mul_B!(tmp5, tmp4, q)
q = tmp3 \ tmp5
end
return q
end
#258.348244 seconds (100.01 k allocations: 782.930 MiB, 0.03% gc time)
```

The bottleneck is the system solve `q = tmp3 \ tmp5`

because it solves the system and allocates for the output every iteration. If you pre-factorize tmp3, you can use the inplace version `A_ldiv_B!`

. The following code is significantly faster but it re-arranges the lines. It is obviously a cheat because it only factorizes once, but if you can cheat like this in your actual code then go for it!

```
function sim3(M,R,S,q,numIters,numGrids)
tmp1 = zeros(size(q,2), size(S,2))
tmp2 = zeros(numGrids, numGrids)
tmp3 = zeros(M)
tmp4 = zeros(M)
tmp5 = zeros(size(M,1), size(q,2))
At_mul_B!(tmp1, q, S)
tmp2 .= R .+ reshape(tmp1, numGrids, numGrids)
tmp3 .= M .- tmp2
tmp4 .= M .+ tmp2
fact = lufact(tmp4)
for i = 1:numIters-1
A_ldiv_B!(q, fact, tmp5)
At_mul_B!(tmp1, q, S)
tmp2 .= R .+ reshape(tmp1, numGrids, numGrids)
tmp3 .= M .- tmp2
tmp4 .= M .+ tmp2
A_mul_B!(tmp5, tmp4, q)
end
A_ldiv_B!(q, fact, tmp5)
return q
end
# 10.253734 seconds (30.02 k allocations: 1.452 MiB)
```

Try these on your machine, and check the timings.