using Tables
using CSV
using BenchmarkTools
function process1(a::Matrix{Int})
    # Sum all entries with a row-by-row traversal: the inner loop walks
    # across a row. Julia's Matrix is column-major (columns contiguous),
    # so this access pattern is deliberately cache-unfriendly.
    total = 0
    for row in axes(a, 1), col in axes(a, 2)
        total += a[row, col]
    end
    return total
end
function process2(a::Matrix{Int})
    # Sum all entries with a column-by-column traversal: the inner index
    # runs down a column, matching Julia's column-major storage layout.
    total = 0
    for col in axes(a, 2), row in axes(a, 1)
        total += a[row, col]
    end
    return total
end
function process3(a::Matrix{Int})
    # Sum all entries one row-slice at a time. Note each `a[row, :]`
    # allocates a fresh 1-D copy of the row (slicing copies in Julia).
    total = 0
    for row in axes(a, 1)
        rowvals = a[row, :]  # copies the row
        total += sum(rowvals)
    end
    return total
end
function process4(a::Matrix{Int})
    # Sum all entries one column-slice at a time. Note each `a[:, col]`
    # allocates a fresh 1-D copy of the column (slicing copies in Julia).
    total = 0
    for col in axes(a, 2)
        colvals = a[:, col]  # copies the column
        total += sum(colvals)
    end
    return total
end
function process5(a::Matrix{Int})
    # Sum everything in one call. Note `a[:, :]` materializes a full
    # copy of the matrix before `sum` runs.
    return sum(a[:, :])
end
function test(f)
    # Load the CSV fixture as an Int matrix and apply `f` to it.
    # NOTE: the load/convert cost is included in every benchmark of `f`.
    tbl = CSV.File("src/pandas_zeros.csv", type=Int)
    m = Tables.matrix(tbl)
    return f(convert(Matrix{Int}, m))
end
# Sanity-check that every implementation agrees with process1, then
# benchmark each one (timings include the CSV load inside `test`).
x = test(process1)
for f in (process2, process3, process4, process5)
    @assert test(f) == x
end
@btime test($process1)
@btime test($process2)
@btime test($process3)
@btime test($process4)
@btime test($process5)

This reports the following on Julia 1.6.3:

32.214 ms (7698 allocations: 77.07 MiB)
29.601 ms (7698 allocations: 77.07 MiB)
40.952 ms (107698 allocations: 124.37 MiB)
34.759 ms (7798 allocations: 115.22 MiB)
39.082 ms (7700 allocations: 115.22 MiB)

Is this expected behaviour or suspicious (or am I doing it wrong)?

NB: I’m using pandas_zeros.csv from the CSV.jl test set.

It will be a lot easier to see what is going on if you benchmark the summation computation separately from importing the CSV file and converting it to a Tables.matrix.

process1 is slower because it has worse cache locality (worse spatial locality = less-consecutive access = poor cache-line utilization), because Matrix is column major (columns are contiguous).

These are much slower because a slice like sum(a[i, :]) creates a copy in Julia, so you are allocating little 1d arrays over and over. You can speed things up by using a view, e.g. by putting @views in front of your function declaration. If I do that for all three of these functions, then the allocations go away:

and process4 is faster than process3 because of the above-mentioned spatial locality, while process5 is fastest because it calls an optimized sum routine on the whole array at once.

(Note also that the ::Matrix{Int} argument-type declaration is irrelevant for performance. You would get exactly the same performance if the argument type were not declared at all.)

I should have imagined that, sorry. And thanks very much for the help.

@views function process3(a::Matrix{Int})
    # `@views` turns every slice in this body into a non-allocating view,
    # so `a[row, :]` no longer copies the row.
    total = 0
    for row in axes(a, 1)
        total += sum(a[row, :])
    end
    return total
end
function process4(a::Matrix{Int})
    # `eachcol` yields a non-allocating view of each column, which also
    # matches the column-major storage order.
    total = 0
    for col in eachcol(a)
        total += sum(col)
    end
    return total
end
# Summing the whole array needs no slice at all: `sum(a)` visits every
# element directly, equivalent to `sum(@view a[:, :])` without the wrapper.
process5(a::Matrix{Int}) = sum(a)