Hi there!

First, let me show you my example.

# Benchmark: multiply the same 3000x3000 worth of data as a block matrix
# partitioned 1x1, 2x2, 3x3 and 4x4, and compare median timings.
using LinearAlgebra, Statistics, Printf

# Pin BLAS to one thread so the four cases compare serial work only.
BLAS.set_num_threads(1)

# Each element of a block_mat# is itself a dense matrix; every block_mat#
# holds the same total number of scalars (a 3000x3000 matrix overall).
block_mat1 = Matrix{Matrix{Float64}}(undef, 1, 1)
block_mat2 = Matrix{Matrix{Float64}}(undef, 2, 2)
block_mat3 = Matrix{Matrix{Float64}}(undef, 3, 3)
block_mat4 = Matrix{Matrix{Float64}}(undef, 4, 4)

block_mat1[1, 1] = ones(3000, 3000)
for i = 1:2, j = 1:2
    block_mat2[i, j] = ones(1500, 1500)
end
for i = 1:3, j = 1:3
    block_mat3[i, j] = ones(1000, 1000)
end
for i = 1:4, j = 1:4
    block_mat4[i, j] = ones(750, 750)
end

# Per-repetition wall-clock timings for each partitioning.
seconds_mat1 = Float64[]
seconds_mat2 = Float64[]
seconds_mat3 = Float64[]
seconds_mat4 = Float64[]

# Repeat the products several times; the median below filters out the
# first-call compilation and warm-up noise.
for _ = 1:10
    push!(seconds_mat1, @elapsed block_mat1 * block_mat1)
    push!(seconds_mat2, @elapsed block_mat2 * block_mat2)
    push!(seconds_mat3, @elapsed block_mat3 * block_mat3)
    push!(seconds_mat4, @elapsed block_mat4 * block_mat4)
end

@printf("%.6f\t%.6f\t%.6f\t%.6f\n", median(seconds_mat1), median(seconds_mat2), median(seconds_mat3), median(seconds_mat4))

All `block_mat#` matrices hold the same total number of elements, and the multiplications require the same number of arithmetic operations.

But the execution times of mat1 and mat4 are especially slow, and if you partition into more than 4 blocks per row and column, you will also see slow execution times.

In other words, both no partitioning at all and more than 16 partitions lead to slow execution times.

But…

"""
    mul(a::Matrix, b::Matrix)

Multiply `a` and `b` with a naive triple loop and return the product as a
new matrix. The result container is untyped (`Matrix{Any}`, matching the
original `Matrix(m, n)` behavior), so it works for any element types that
support `*` and `+` — including block matrices whose elements are
themselves matrices.

Throws `DimensionMismatch` when the inner dimensions disagree.
Requires at least one inner-dimension column/row (`size(a, 2) >= 1`).
"""
function mul(a::Matrix, b::Matrix)
    # Validate at the API boundary instead of using @assert, which may be
    # disabled at higher optimization levels.
    size(a, 2) == size(b, 1) ||
        throw(DimensionMismatch("a has $(size(a, 2)) columns, b has $(size(b, 1)) rows"))
    m = size(a, 1)
    n = size(b, 2)
    d = size(a, 2)
    # Untyped result preserves the original contract (Array{Any,2}).
    res = Matrix{Any}(undef, m, n)
    for i = 1:m, j = 1:n
        # Seed the accumulator with the first term so no undefined cell is
        # ever read, then fold in the remaining terms locally — avoiding a
        # getindex/setindex! on the Any-typed result each inner iteration.
        acc = a[i, 1] * b[1, j]
        for k = 2:d
            acc += a[i, k] * b[k, j]
        end
        res[i, j] = acc
    end
    return res
end

The above `mul` function is my custom matrix multiplication function for two `Matrix` arguments.

If you use the `mul` function instead of the `*` operator, you will get a proper execution time.

Why does this happen? Is it a bug? I have also tried adding type annotations for the matrices, but that failed to give a proper execution time.