Using LinuxPerf:

```
julia> using LinuxPerf
julia> function matrix_0_1_rows(M)
# Set every element of M to 1, with the row index i in the outer loop and
# the column index j in the inner loop, so consecutive stores differ in j.
# Julia arrays are column-major, so for a plain Matrix (as used below) the
# inner loop does not walk contiguous memory.
m, n = size(M)
for i in 1:m
for j in 1:n
M[i,j] = 1
end
end
end
matrix_0_1_rows (generic function with 1 method)
julia> function matrix_0_1_cols(M)
# Set every element of M to 1, with the column index j in the outer loop and
# the row index i in the inner loop, so consecutive stores differ in i.
# Julia arrays are column-major, so for a plain Matrix (as used below) the
# inner loop walks contiguous memory.
m, n = size(M)
for j in 1:n
for i in 1:m
M[i,j] = 1
end
end
end
matrix_0_1_cols (generic function with 1 method)
julia> foreachf(f::F, N, args::Vararg{<:Any,A}) where {F,A} = foreach(_ -> f(args...), Base.OneTo(N)) # call f(args...) N times; results are discarded
foreachf (generic function with 1 method)
julia> M = zeros(64,64);
julia> @pstats "cpu-cycles,(instructions,branch-instructions,branch-misses),(task-clock,context-switches,cpu-migrations,page-faults),(L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses),(dTLB-load-misses,dTLB-loads)" begin
foreachf(matrix_0_1_rows, 1_000_000, M)
end
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
╶ cpu-cycles 8.74e+09 50.0% # 4.3 cycles per ns
┌ instructions 1.81e+10 75.0% # 2.1 insns per cycle
│ branch-instructions 4.42e+09 75.0% # 24.4% of instructions
└ branch-misses 1.21e+08 75.0% # 2.7% of branch instructions
┌ task-clock 2.04e+09 100.0% # 2.0 s
│ context-switches 0.00e+00 100.0%
│ cpu-migrations 0.00e+00 100.0%
└ page-faults 0.00e+00 100.0%
┌ L1-dcache-load-misses 1.95e+08 25.0% # 103.7% of dcache loads
│ L1-dcache-loads 1.88e+08 25.0%
└ L1-icache-load-misses 7.59e+04 25.0%
┌ dTLB-load-misses 1.36e+02 25.0% # 0.0% of dTLB loads
└ dTLB-loads 1.88e+08 25.0%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
julia> @pstats "cpu-cycles,(instructions,branch-instructions,branch-misses),(task-clock,context-switches,cpu-migrations,page-faults),(L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses),(dTLB-load-misses,dTLB-loads)" begin
foreachf(matrix_0_1_cols, 1_000_000, M)
end
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
╶ cpu-cycles 1.76e+09 49.8% # 4.3 cycles per ns
┌ instructions 2.35e+09 74.9% # 1.3 insns per cycle
│ branch-instructions 3.97e+08 74.9% # 16.9% of instructions
└ branch-misses 1.01e+06 74.9% # 0.3% of branch instructions
┌ task-clock 4.10e+08 100.0% # 410.2 ms
│ context-switches 0.00e+00 100.0%
│ cpu-migrations 0.00e+00 100.0%
└ page-faults 0.00e+00 100.0%
┌ L1-dcache-load-misses 2.93e+08 25.1% # 149.1% of dcache loads
│ L1-dcache-loads 1.96e+08 25.1%
└ L1-icache-load-misses 1.58e+04 25.1%
┌ dTLB-load-misses 1.89e+02 24.9% # 0.0% of dTLB loads
└ dTLB-loads 1.96e+08 24.9%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
julia> 2.35e9 / 1.81e10
0.1298342541436464
```

This code calls the function `1_000_000` times and gives us a summary of details like the number of instructions executed.

Note that the L1 data-cache miss rate was reported as >= 100% of loads, because I was using 64x64 matrices and this CPU has a 32 KiB L1d cache (and `64^2*sizeof(Float64) == 32*(2^10)`), so the matrix by itself occupies the entire L1d cache.

Importantly, thanks to the SIMD instructions, the `cols` version required many times fewer instructions: just 0.13 times as many as the `rows` version.

However, all those cache misses meant it didn’t achieve nearly as many instructions per clock cycle. It spent a lot of time waiting for memory.

With 32x32, the matrix easily fits in cache, so the instructions per clock are much better. However, 32 isn’t enough for the SIMD version to stretch its legs (the SIMD loop iterates just once per outer-loop iteration on this computer), so the advantage in the number of instructions needed is smaller. Here I also increased the number of repetitions to 10 million:

```
julia> M = zeros(32,32);
julia> @pstats "cpu-cycles,(instructions,branch-instructions,branch-misses),(task-clock,context-switches,cpu-migrations,page-faults),(L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses),(dTLB-load-misses,dTLB-loads)" begin
foreachf(matrix_0_1_rows, 10_000_000, M)
end
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
╶ cpu-cycles 1.47e+10 50.0% # 4.3 cycles per ns
┌ instructions 5.18e+10 75.0% # 3.5 insns per cycle
│ branch-instructions 1.22e+10 75.0% # 23.6% of instructions
└ branch-misses 1.72e+07 75.0% # 0.1% of branch instructions
┌ task-clock 3.42e+09 100.0% # 3.4 s
│ context-switches 0.00e+00 100.0%
│ cpu-migrations 0.00e+00 100.0%
└ page-faults 0.00e+00 100.0%
┌ L1-dcache-load-misses 9.43e+04 25.0% # 0.0% of dcache loads
│ L1-dcache-loads 1.56e+09 25.0%
└ L1-icache-load-misses 1.01e+05 25.0%
┌ dTLB-load-misses 9.92e+02 25.0% # 0.0% of dTLB loads
└ dTLB-loads 1.56e+09 25.0%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
julia> @pstats "cpu-cycles,(instructions,branch-instructions,branch-misses),(task-clock,context-switches,cpu-migrations,page-faults),(L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses),(dTLB-load-misses,dTLB-loads)" begin
foreachf(matrix_0_1_cols, 10_000_000, M)
end
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
╶ cpu-cycles 3.01e+09 49.9% # 4.3 cycles per ns
┌ instructions 1.15e+10 75.0% # 3.8 insns per cycle
│ branch-instructions 2.00e+09 75.0% # 17.4% of instructions
└ branch-misses 1.91e+04 75.0% # 0.0% of branch instructions
┌ task-clock 7.03e+08 100.0% # 703.0 ms
│ context-switches 0.00e+00 100.0%
│ cpu-migrations 0.00e+00 100.0%
└ page-faults 0.00e+00 100.0%
┌ L1-dcache-load-misses 2.01e+04 25.0% # 0.0% of dcache loads
│ L1-dcache-loads 1.55e+09 25.0%
└ L1-icache-load-misses 2.87e+04 25.0%
┌ dTLB-load-misses 2.50e+03 24.9% # 0.0% of dTLB loads
└ dTLB-loads 1.55e+09 24.9%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
```

But the relatively short inner loop means there’s a lot of overhead in entering and starting that loop compared to using linear indexing:

```
julia> function matrix_0_1_linear(M)
# Set every element of M to 1 using a single loop over eachindex(M)
# instead of nested row/column loops.
for i in eachindex(M)
M[i] = 1
end
end
matrix_0_1_linear (generic function with 1 method)
julia> @pstats "cpu-cycles,(instructions,branch-instructions,branch-misses),(task-clock,context-switches,cpu-migrations,page-faults),(L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses),(dTLB-load-misses,dTLB-loads)" begin
foreachf(matrix_0_1_linear, 10_000_000, M)
end
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
╶ cpu-cycles 2.40e+09 49.9% # 4.3 cycles per ns
┌ instructions 6.56e+09 75.0% # 2.7 insns per cycle
│ branch-instructions 1.12e+09 75.0% # 17.1% of instructions
└ branch-misses 8.86e+04 75.0% # 0.0% of branch instructions
┌ task-clock 5.62e+08 100.0% # 561.6 ms
│ context-switches 0.00e+00 100.0%
│ cpu-migrations 0.00e+00 100.0%
└ page-faults 0.00e+00 100.0%
┌ L1-dcache-load-misses 2.67e+04 25.0% # 0.0% of dcache loads
│ L1-dcache-loads 1.33e+09 25.0%
└ L1-icache-load-misses 3.37e+04 25.0%
┌ dTLB-load-misses 4.67e+04 24.9% # 0.0% of dTLB loads
└ dTLB-loads 1.33e+09 24.9%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
```