In a quest to learn good HPC principles, I am trying to understand how to implement cache friendly loops in Julia.

I found this code loop_blocking that compares performance of loop blocking techniques between c and Rust.

I tried implementing the same thing in Julia, but I’m confused by the performance difference I’m getting. I’m writing here to seek help to understand the performance of this code.

```
module LoopBlocking
const MAX::Int64 = 8192
function Fill(A::Array{Int64}, B::Array{Int64})
count::Int64 = 0
@inbounds for i = 1:MAX
for j = 1:MAX
A[i, j] = count
B[i, j] = -count
count += 1
end
end
end
function Add(A::Array{Int64}, B::Array{Int64})
for i = 1:MAX, j = 1:MAX
@inbounds A[i, j] += B[j, i]
end
return nothing
end
function Add_2(A::Array{Int64}, B::Array{Int64})
@inbounds A .+= transpose(B)
return nothing
end
const BLOCK_SIZE::Int64 = 32
function Add_1(A::Array{Int64}, B::Array{Int64})
for i = 1:BLOCK_SIZE:MAX, j = 1:BLOCK_SIZE:MAX
for ii = i:i+BLOCK_SIZE-1, jj = j:j+BLOCK_SIZE-1
@inbounds A[ii, jj] += B[jj, ii]
end
end
return nothing
end
function Add_3(A::Array{Int64}, B::Array{Int64})
for i = 1:BLOCK_SIZE:MAX, j = 1:BLOCK_SIZE:MAX
@inbounds @views A[i:i+BLOCK_SIZE-1, j:j+BLOCK_SIZE-1] .+= transpose(B[j:j+BLOCK_SIZE-1, i:i+BLOCK_SIZE-1])
end
return nothing
end
function main()
A::Array{Int64} = Array{Int64}(undef, MAX, MAX)
B::Array{Int64} = Array{Int64}(undef, MAX, MAX)
Fill(A, B)
Add(A, B)
println("Simple loop")
@time Fill(A, B)
@time Add(A, B)
Correct = copy(A)
Fill(A, B)
Add_1(A, B)
println("Loop blocking")
@time Fill(A, B)
@time Add_1(A, B)
@assert A == Correct
Fill(A, B)
Add_2(A, B)
println("Built-in broadcasting and transpose")
@time Fill(A, B)
@time Add_2(A, B)
@assert A == Correct
Fill(A, B)
Add_3(A, B)
println("Loop blocking with views")
@time Fill(A, B)
@time Add_3(A, B)
@assert A == Correct
end
end
```

These are the results I get

```
Simple loop
1.253246 seconds
0.841699 seconds
Loop blocking
1.238872 seconds
0.458186 seconds
Built-in broadcasting and transpose
1.165040 seconds
0.461195 seconds
Loop blocking with views
1.266697 seconds
0.286722 seconds
```

And here is typical c performance

```
MAX: 8192
BLOCK_SIZE: 32
transpose_0: 528ms
transpose_1: 216ms
```

It seems that broadcasting leads to different performance than simple loops and is the closes to c performance.

Can someone help me understand why simple for loops cannot achieve the same performance as broadcasting?

P.S.: I’m not entirely sure the last function is correct, but the assert seems to work.