Hi there.

Please see my code below.

# Benchmark script: measures throughput (Mflops) of element-wise binary
# add/multiply and scalar add/multiply on dense square matrices of ones,
# printing median and best-case rates for each matrix size.

using Printf
using Statistics
using LinearAlgebra

BLAS.set_num_threads(1)  # pin BLAS to one thread so timings are single-core

const M = 10^6  # flops-to-Mflops divisor

"""
    run_benchmark(sizes, reps) -> Vector{Vector{Float64}}

Time four operations (`A .+ B`, `A .* B`, `A .+ scalar`, `A * scalar`) on
`sz`-by-`sz` matrices of ones for each `sz` in `sizes`, repeating each
measurement `reps` times. Prints a tab-separated table of median and
best-case (minimum-time) Mflops per size, and returns one row per size:
`[sz, med1, max1, med2, max2, med3, max3, med4, max4]`.
"""
function run_benchmark(sizes=[100, 200, 500, 1000, 1500, 2000, 2500, 3000, 4000, 8000, 12000],
                       reps=20)
    @printf("sz\tbin add(Mflops) bin mul(Mflops) sca add(Mflops) sca mul(Mflops)\n")
    rows = Vector{Vector{Float64}}()
    for sz in sizes
        times1 = Float64[]
        times2 = Float64[]
        times3 = Float64[]
        times4 = Float64[]
        for _ in 1:reps
            # Fresh matrices each repetition so no result is reused warm.
            mat1 = ones(sz, sz)
            mat2 = ones(sz, sz)
            mat3 = ones(sz, sz)
            mat4 = ones(sz, sz)
            s1 = @elapsed r1 = mat1 .+ mat2
            s2 = @elapsed r2 = mat3 .* mat4
            # Fix: matrix + scalar requires broadcasting (`.+`) in Julia 1.x;
            # the assertion below (expects 11111) shows broadcast was intended.
            s3 = @elapsed r3 = mat1 .+ 11110
            s4 = @elapsed r4 = mat2 * 22222
            # Sanity checks that each operation actually produced its result.
            @assert r1[1, 1] == 2
            @assert r2[1, 1] == 1
            @assert r3[1, 1] == 11111
            @assert r4[1, 1] == 22222
            push!(times1, s1)
            push!(times2, s2)
            push!(times3, s3)
            push!(times4, s4)
        end
        # One floating-point op per element (fix: original had a dangling `*`
        # in `ops = sz*sz*`).
        ops = sz * sz
        # Fix: original was missing the `*` between the time and `M`
        # (e.g. `median_time1M`) and had stray trailing `*` on several lines.
        median_Mflops1 = ops / (median(times1) * M)
        max_Mflops1 = ops / (minimum(times1) * M)
        median_Mflops2 = ops / (median(times2) * M)
        max_Mflops2 = ops / (minimum(times2) * M)
        median_Mflops3 = ops / (median(times3) * M)
        max_Mflops3 = ops / (minimum(times3) * M)
        median_Mflops4 = ops / (median(times4) * M)
        max_Mflops4 = ops / (minimum(times4) * M)
        @printf("%d\t", sz)
        @printf("%.2f\t%.2f\t", median_Mflops1, max_Mflops1)
        @printf("%.2f\t%.2f\t", median_Mflops2, max_Mflops2)
        @printf("%.2f\t%.2f\t", median_Mflops3, max_Mflops3)
        @printf("%.2f\t%.2f\n", median_Mflops4, max_Mflops4)
        push!(rows, Float64[sz,
                            median_Mflops1, max_Mflops1,
                            median_Mflops2, max_Mflops2,
                            median_Mflops3, max_Mflops3,
                            median_Mflops4, max_Mflops4])
    end
    return rows
end

# Run the full benchmark (original sizes and repetition count) when this
# file is executed directly as a script.
if abspath(PROGRAM_FILE) == @__FILE__
    run_benchmark()
end

To sum up, the operations slow down for matrix sizes from roughly 2000 to 4000 per dimension.

And the speed recovers for sizes above about 4000.

As far as I know, operations are fast at small sizes because the data fits in cache.

But at large sizes, I would expect the speed to stay roughly constant.

My computer has 32 GB of RAM, so I don't think memory capacity is the problem.

A 3000-by-3000 Float64 matrix is only 72 MB.

Does anyone know what causes this behavior?