I think the best way is to use an example. Say we want to calculate the Gram matrix `x'x`

for very large (long) matrices where the calculation may not fit into memory. We can carry out the calculation in blocks, and we can also parallelize it using threads; the code is below. For some reason I'm now getting around 5.5× faster with the same data load as before:

```
using Base.Threads
using LinearAlgebra.BLAS: set_num_threads
nthreads()
# 12
# Generate the block matrices
function makeNestedMatrix(n::Integer = 1000, p::Integer = 100, chunk = 1000)
x = Array{Array{Float64, 2}}(undef, n)
for i in 1:n
x[i] = rand(Float64, chunk, p)
end
return x
end
function threadedGram(x::Array{Array{Float64, 2}})
p = size(x[1])[2]
n = length(x)
z = zeros(Float64, p, p)
@threads for i in 1:n
z .+= x[i]' * x[i]
end
return z
end
function serialGram(x::Array{Array{Float64, 2}})
p = size(x[1])[2]
n = length(x)
z = zeros(Float64, p, p)
for i in 1:n
z .+= x[i]' * x[i]
end
return z
end
x = makeNestedMatrix(10_000, 100, 1000);
set_num_threads(1)
@time serialGram(x);
# 4.238534 seconds (20.01 k allocations: 763.779 MiB, 21.82% gc time)
@time threadedGram(x);
# 0.765528 seconds (20.10 k allocations: 763.788 MiB, 0.58% gc time)
# Check with native matrix multiplication
# Generate the dense matrix for comparison
y = zeros(size(x[1])[1]*length(x), size(x[1])[2]);
chunk = size(x[1])[1];
for i in 1:length(x)
y[((i - 1)*chunk + 1):(i*chunk),:] .= x[i]
end
set_num_threads(12)
@time y' * y;
# 3.229251 seconds (7 allocations: 78.375 KiB)
```