Matrix Element-wise operations slow down in a certain size interval

Hi there.
Please take a look at my code below.

# Benchmark of element-wise matrix operations over a range of square matrix
# sizes. For every size, four operations are timed 20 times each:
#   1. matrix .+ matrix   ("bin add")
#   2. matrix .* matrix   ("bin mul")
#   3. matrix .+ scalar   ("sca add")
#   4. matrix *  scalar   ("sca mul")
# and the median and best (minimum-time) throughput are printed in Mflops.

# On Julia >= 0.7 `BLAS`, `@printf` and `median` live in standard libraries
# that must be loaded explicitly; on 0.6 they are still part of Base.
@static if VERSION >= v"0.7.0-DEV"
    using LinearAlgebra, Printf, Statistics
end

BLAS.set_num_threads(1)
G = 10^9  # not used below; kept as in the original post
M = 10^6  # divisor that converts operations per second into Mflops

# NOTE: each operation prints two numbers per row (median Mflops, max Mflops),
# so every header label below covers two tab-separated values.
@printf("sz\tbin add(Mflops) bin mul(Mflops) sca add(Mflops) sca mul(Mflops)\n")

for sz in [100, 200, 500, 1000, 1500, 2000, 2500, 3000, 4000, 8000, 12000]
    times1 = Vector{Float64}()
    times2 = Vector{Float64}()
    times3 = Vector{Float64}()
    times4 = Vector{Float64}()

    for c in 1:20
        # Fresh inputs on every repetition.
        mat1 = ones(sz, sz)
        mat2 = ones(sz, sz)
        mat3 = ones(sz, sz)
        mat4 = ones(sz, sz)
        local r1, r2, r3, r4

        s1 = @elapsed r1 = mat1 .+ mat2
        s2 = @elapsed r2 = mat3 .* mat4
        # `.+` instead of `+`: array + scalar without the dot is an error on
        # Julia >= 0.7, while the dotted form works on every version.
        s3 = @elapsed r3 = mat1 .+ 11110
        s4 = @elapsed r4 = mat2 * 22222

        # Sanity checks on the results of the timed operations.
        @assert r1[1,1] == 2
        @assert r2[1,1] == 1
        @assert r3[1,1] == 11111
        @assert r4[1,1] == 22222

        push!(times1, s1)
        push!(times2, s2)
        push!(times3, s3)
        push!(times4, s4)
    end # c

    median_time1 = median(times1)
    min_time1 = minimum(times1)
    median_time2 = median(times2)
    min_time2 = minimum(times2)
    median_time3 = median(times3)
    min_time3 = minimum(times3)
    median_time4 = median(times4)
    min_time4 = minimum(times4)

    # One floating-point operation per matrix element.
    ops = sz * sz
    median_Mflops1 = ops / (median_time1 * M)
    max_Mflops1 = ops / (min_time1 * M)
    median_Mflops2 = ops / (median_time2 * M)
    max_Mflops2 = ops / (min_time2 * M)
    median_Mflops3 = ops / (median_time3 * M)
    max_Mflops3 = ops / (min_time3 * M)
    median_Mflops4 = ops / (median_time4 * M)
    max_Mflops4 = ops / (min_time4 * M)

    @printf("%d\t", sz)
    @printf("%.2f\t%.2f\t", median_Mflops1, max_Mflops1)
    @printf("%.2f\t%.2f\t", median_Mflops2, max_Mflops2)
    @printf("%.2f\t%.2f\t", median_Mflops3, max_Mflops3)
    @printf("%.2f\t%.2f\n", median_Mflops4, max_Mflops4)
end # sz

To sum up, the element-wise operations slow down when each dimension is between roughly 2000 and 4000.
The speed recovers for sizes above 4000.
As far as I know, operations are fast at small sizes because the data fits in the cache.
But for large sizes, I would expect the speed to stay roughly constant.
My computer has 32 GB of RAM, so I don't think this is a problem of running out of memory,
because a 3000-by-3000 Float64 matrix is only 72 MB.
Does anyone know what causes this?

I partly solved this problem by assigning 0 to variables that are no longer needed and calling gc() before the computation that is being timed.
But the size range from 2500 to 4000 is still slower than the other ranges, even those with sizes larger than 4000.