Hello! I’m studying parallel programming in Julia, so i decided to write some basics matrix multiplication without using Linear Algebra package, because it’s already multi-threaded.

```
using Base.Threads
using BenchmarkTools
function multiplyMatrices_oneThreadLoop(A::Matrix{Float64}, B::Matrix{Float64}, N::Int64)
C = zeros(N, N)
Threads.@threads for i in 1:N
for j in 1:N
for k in 1:N
C[i, j]= A[i, k] * B[k, j]
end
end
end
return C
end
function multiplyMatrices_spawnExample(A::Matrix{Float64}, B::Matrix{Float64}, N::Int64)
C = zeros(N, N)
@sync Threads.@spawn for i in 1:N
for j in 1:N
for k in 1:N
C[i, j]= A[i, k] * B[k, j]
end
end
end
return C
end
function multiplyMatrices_default(A::Matrix{Float64}, B::Matrix{Float64}, N::Int64)
C = zeros(N,N)
for i in 1:N
for j in 1:N
for k in 1:N
C[i, j]= A[i, k] * B[k, j]
end
end
end
return C
end
N = 5000
A = rand(N, N);
B = rand(N, N);
println("multi-threaded loop 1st run")
@btime multiplyMatrices_oneThreadLoop(A, B, N)
println("using sync spawn 1st run")
@btime multiplyMatrices_spawnExample(A,B,N)
println("default multiplication 1st run")
@btime multiplyMatrices_default(A, B, N)
println("multi-threaded loop 2nd run")
@btime multiplyMatrices_oneThreadLoop(A, B, N)
println("using sync spawn 2nd run")
@btime multiplyMatrices_spawnExample(A,B,N)
println("default multiplication 2nd run")
@btime multiplyMatrices_default(A, B, N)
println("multi-threaded loop 3rd run")
@btime multiplyMatrices_oneThreadLoop(A, B, N)
println("using sync spawn 3rd run")
@btime multiplyMatrices_spawnExample(A,B,N)
println("default multiplication 3rd run")
@btime multiplyMatrices_default(A, B, N)
```

when i run this code with `julia -t 8`

, i saw that

- Performance of thread.@spawn function is slower than default multiplication
- All functions have higher performance on first run than on repeated runs, why it happens?

Where i did mistakes and how to fix them?