In a quest to learn good HPC principles, I am trying to understand how to implement cache friendly loops in Julia.
I found this code loop_blocking that compares performance of loop blocking techniques between c and Rust.
I tried implementing the same thing in Julia, but I’m confused by the performance difference I’m getting. I’m writing here to seek help to understand the performance of this code.
module LoopBlocking
const MAX::Int64 = 8192
function Fill(A::Array{Int64}, B::Array{Int64})
count::Int64 = 0
@inbounds for i = 1:MAX
for j = 1:MAX
A[i, j] = count
B[i, j] = -count
count += 1
end
end
end
function Add(A::Array{Int64}, B::Array{Int64})
for i = 1:MAX, j = 1:MAX
@inbounds A[i, j] += B[j, i]
end
return nothing
end
function Add_2(A::Array{Int64}, B::Array{Int64})
@inbounds A .+= transpose(B)
return nothing
end
const BLOCK_SIZE::Int64 = 32
function Add_1(A::Array{Int64}, B::Array{Int64})
for i = 1:BLOCK_SIZE:MAX, j = 1:BLOCK_SIZE:MAX
for ii = i:i+BLOCK_SIZE-1, jj = j:j+BLOCK_SIZE-1
@inbounds A[ii, jj] += B[jj, ii]
end
end
return nothing
end
function Add_3(A::Array{Int64}, B::Array{Int64})
for i = 1:BLOCK_SIZE:MAX, j = 1:BLOCK_SIZE:MAX
@inbounds @views A[i:i+BLOCK_SIZE-1, j:j+BLOCK_SIZE-1] .+= transpose(B[j:j+BLOCK_SIZE-1, i:i+BLOCK_SIZE-1])
end
return nothing
end
function main()
A::Array{Int64} = Array{Int64}(undef, MAX, MAX)
B::Array{Int64} = Array{Int64}(undef, MAX, MAX)
Fill(A, B)
Add(A, B)
println("Simple loop")
@time Fill(A, B)
@time Add(A, B)
Correct = copy(A)
Fill(A, B)
Add_1(A, B)
println("Loop blocking")
@time Fill(A, B)
@time Add_1(A, B)
@assert A == Correct
Fill(A, B)
Add_2(A, B)
println("Built-in broadcasting and transpose")
@time Fill(A, B)
@time Add_2(A, B)
@assert A == Correct
Fill(A, B)
Add_3(A, B)
println("Loop blocking with views")
@time Fill(A, B)
@time Add_3(A, B)
@assert A == Correct
end
end
These are the results I get
Simple loop
1.253246 seconds
0.841699 seconds
Loop blocking
1.238872 seconds
0.458186 seconds
Built-in broadcasting and transpose
1.165040 seconds
0.461195 seconds
Loop blocking with views
1.266697 seconds
0.286722 seconds
And here is typical c performance
MAX: 8192
BLOCK_SIZE: 32
transpose_0: 528ms
transpose_1: 216ms
It seems that broadcasting leads to different performance than simple loops and is the closes to c performance.
Can someone help me understand why simple for loops cannot achieve the same performance as broadcasting?
P.S.: I’m not entirely sure the last function is correct, but the assert seems to work.