Threads parallelization on different nested loops

Hi,

I am learning Julia and I was testing how to parallelize a nested loop with the @threads macro, to compute the product between matrices. This is the code I am trying to parallelize:

function dot_prod!(A, B, C, i, j)
   c = 0 #local object
   for k = 1:size(A, 2)
       c += A[i,k] * B[k,j]
   end
   C[i,j] = c
end
   
function mat_prod!(A, B, C)
   for i = 1:size(A, 1)
       for j = 1:size(B, 2)
           dot_prod!(A, B, C, i, j)
       end
   end
end

I have used the @threads macro at three different levels in the code. This is the first version:

function dot_prod!(A, B, C, i, j)
    c = zeros(Threads.nthreads())
    for k = 1:size(A, 2)
        c[Threads.threadid()] += A[i,k] * B[k,j]
    end
    C[i,j] = sum(c)
end

function mat_prod!(A, B, C)
    Threads.@threads for i = 1:size(A, 1)
        for j = 1:size(B, 2)
            dot_prod!(A, B, C, i, j)
        end
    end
end

The second version:

function dot_prod!(A, B, C, i, j)
    c = zeros(Threads.nthreads())
    for k = 1:size(A, 2)
        c[Threads.threadid()] += A[i,k] * B[k,j]
    end
    C[i,j] = sum(c)
end

function mat_prod!(A, B, C)
    for i = 1:size(A, 1)
        Threads.@threads for j = 1:size(B, 2)
            dot_prod!(A, B, C, i, j)
        end
    end
end

Ant the third version:

function dot_prod!(A, B, C, i, j)
    c = zeros(Threads.nthreads())
    Threads.@threads for k = 1:size(A, 2)
        c[Threads.threadid()] += A[i,k] * B[k,j]
    end
    C[i,j] = sum(c)
end

function mat_prod!(A, B, C)
    for i = 1:size(A, 1)
        for j = 1:size(B, 2)
            dot_prod!(A, B, C, i, j)
        end
    end
end

Now, versions 1 and 3 have similar performance, while version 2, the one in which the middle cycle is parallelized, performs much better (half the time of the other two). I would be grateful if someone could explain to me the reason for that behavior.

Thank you!