This topic is a follow-up to this one: Improve performance of matrix computation, where @foobar_lv2 proposed an improved version of the computation of a matrix from three other matrices x, μ, and σ of sizes D × N, D × K, and D × K, respectively.
While integrating this function into my project, I ran into a very strange behaviour when the number D is defined twice by reading the sizes of the matrices. The following MWE shows what goes wrong: the only change between `f1` and `f2` is that `_, N = size(x)` is replaced with `D, N = size(x)`, so that `D` is assigned twice. This multiplies the overall computing time by a factor of several thousand, regardless of the number of threads used.
N = 100
D = 1000
K = 5
x = rand(D, N)
μ = rand(D, K)
σ = rand(D, K)
x0 = x[1:2,1:2]
μ0 = μ[1:2,1:2]
σ0 = σ[1:2,1:2]
function f1(x::Matrix, μ::Matrix, σ::Matrix)
    D, K = size(μ)
    _, N = size(x)  # D is assigned only once
    p = zeros(N, K)
    @fastmath Threads.@threads for k = 1:K
        @inbounds for n = 1:N
            pnk = 0.0
            @simd for d = 1:D
                pnk -= (x[d,n] - μ[d,k]) * (x[d,n] - μ[d,k])
            end
            p[n,k] = pnk
        end
    end
    return p
end
function f2(x::Matrix, μ::Matrix, σ::Matrix)
    D, K = size(μ)
    D, N = size(x)  # the only change: D is assigned a second time
    p = zeros(N, K)
    @fastmath Threads.@threads for k = 1:K
        @inbounds for n = 1:N
            pnk = 0.0
            @simd for d = 1:D
                pnk -= (x[d,n] - μ[d,k]) * (x[d,n] - μ[d,k])
            end
            p[n,k] = pnk
        end
    end
    return p
end
p10 = f1(x0, μ0, σ0)  # warm-up calls on small slices so that @time below
p20 = f2(x0, μ0, σ0)  # does not measure compilation
function print_name(name, space = 3)
    print(name)
    for i = 1:space-length(name) print(" ") end
end
print_name("f1"); @time p1 = f1(x, μ, σ)
print_name("f2"); @time p2 = f2(x, μ, σ)
On my computer, with `JULIA_NUM_THREADS=1`, the previous code gives:
f1 0.000192 seconds (87 allocations: 10.607 KiB)
f2 0.356826 seconds (4.49 M allocations: 68.570 MiB, 53.96% gc time)
and with `JULIA_NUM_THREADS=4`:
f1 0.000124 seconds (87 allocations: 10.607 KiB)
f2 0.203320 seconds (2.25 M allocations: 34.356 MiB, 28.90% gc time)
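In case it helps with the diagnosis, here is how I would double-check the timings and look at the inferred types in both functions. This is only a minimal sketch, assuming BenchmarkTools.jl is installed (`@code_warntype` comes from InteractiveUtils, which the REPL loads by default):
using BenchmarkTools

# Repeated, warmed-up timings; the $ interpolation avoids measuring
# access to the global variables
@btime f1($x, $μ, $σ);
@btime f2($x, $μ, $σ);

# Print the inferred types inside each function body
@code_warntype f1(x, μ, σ)
@code_warntype f2(x, μ, σ)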
Does anyone know why this happens? Many thanks.