Help with parallel loop for eigenvalue calculations

In my Stokes solver I need to compute the eigenvalues and eigenvectors of several thousands of small 2x2 matrices. The ij components of this matrices are stored in n-by-6 matrices (F_ij). I’d like to parallelise this code to run in shared and distributed memories but I failed to efficiently do so. The serial code would be like this:

using LinearAlgebra
using StaticArrays
using SharedArrays

n = Int(500e3)
Fxx, Fxz, Fzx, Fzz = rand(n,6),rand(n,6),rand(n,6),rand(n,6)
function fse_par(Fxx, Fxz, Fzx, Fzz)
n   = size(Fxx, 1)
a1s = Array{Float64}(undef,n, 6)
a2s = Array{Float64}(undef,n, 6)
vx1 = Array{Float64}(undef,n, 6)
vy1 = Array{Float64}(undef,n, 6)
vx2 = Array{Float64}(undef,n, 6)
vy2 = Array{Float64}(undef,n, 6)
F   = zeros(MMatrix{2,2})
L   = zeros(MMatrix{2,2})
@inbounds for j = 1:6
       for i = 1:n
        F[1, 1] = Fxx[i, j]
        F[1, 2] = Fxz[i, j]
        F[2, 1] = Fzx[i, j]
        F[2, 2] = Fzz[i, j]
        L      .= F * F'
        # -- Numerical solution for eigenvectors
        evect       = float(eigvecs(L))
        vx1[i, j]   = evect[1, 2]
        vy1[i, j]   = evect[2, 2]
        vx2[i, j]   = evect[1, 1]
        vy2[i, j]   = evect[2, 1]

        # -- Analytical solution for eigenvalues
        α           = (L[1, 1] + L[2, 2]) / 2
        β           = sqrt(((L[1, 1] - L[2, 2])^2) / 4 + L[1, 2] * L[2, 1])
        a1s[i, j]   = sqrt(α + β)
        a2s[i, j]   = sqrt(α - β)

    end
end
return [a1s, a2s, vx1, vx2, vy1, vy2]
end

I have managed to parallelise it using multi-threading as:

function main_par(Fxx, Fxz, Fzx, Fzz)
    n   = size(Fxx, 1)
    a1s = Array{Float64}(undef, n, 6)
    a2s = Array{Float64}(undef, n, 6)
    vx1 = Array{Float64}(undef, n, 6)
    vy1 = Array{Float64}(undef, n, 6)
    vx2 = Array{Float64}(undef, n, 6)
    vy2 = Array{Float64}(undef, n, 6)

    # -- Thread safe arrays
    nt  = Threads.nthreads()
    FF  = [@MMatrix zeros(2,2) for i in 1:nt]
    LL  = [@MMatrix zeros(2, 2) for i in 1:nt]
    D   = [@MMatrix zeros(2, 2) for i in 1:nt]
    α   = [0.0 for i in 1:nt]
    β   = [0.0 for i in 1:nt]

    @inbounds Threads.@threads for u in vec(CartesianIndices((n,6)))
            i,j = u.I

            # -- Build F and L matrices
            FF[Threads.threadid()][1,1] = Fxx[i,j]
            FF[Threads.threadid()][1,2] = Fxz[i,j]
            FF[Threads.threadid()][2,1] = Fzx[i,j]
            FF[Threads.threadid()][2,2] = Fzz[i,j]
            LL[Threads.threadid()]     .= FF[Threads.threadid()]*FF[Threads.threadid()]'

            # # -- Compute eigenvectors
            D[Threads.threadid()]      .= eigvecs(LL[Threads.threadid()])
            vx1[i,j] = D[Threads.threadid()][1,1]
            vx2[i,j] = D[Threads.threadid()][2,1]
            vy1[i,j] = D[Threads.threadid()][1,2]
            vy2[i,j] = D[Threads.threadid()][2,2]

            # -- Analytical solution for eigenvalues
            α[Threads.threadid()] = (LL[Threads.threadid()][1, 1] + LL[Threads.threadid()][2, 2]) / 2
            β[Threads.threadid()] = sqrt(((LL[Threads.threadid()][1, 1] - LL[Threads.threadid()][2, 2])^2) / 4 +
                LL[Threads.threadid()][1, 2] * LL[Threads.threadid()][2, 1])
            a1s[i, j]   = sqrt(α[Threads.threadid()] + β[Threads.threadid()])
            a2s[i, j]   = sqrt(α[Threads.threadid()] - β[Threads.threadid()])

    end

    return [a1s, a2s, vx1, vx2, vy1, vy2] 

end 

However, I don’t really see a speed up. Is there any obvious mistake or optimisation I fail to see?