Sorry, forgot the code:
julia> O1 = Vector{T}(undef, N_vectors); O2 = similar(O1);
julia> random_ddata_p = permutedims(random_ddata, (3, 1, 2));
julia> random_dvector_p = permutedims(random_dvector, (2, 1));
julia> N_vectors
5000
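(The names T, random_ddata, random_dvector, multiplies, and the foreachf helper used with @pstats below come from earlier in the thread. For anyone reproducing this standalone, here is a rough sketch of what I am assuming they look like; in particular, the 3x3 Float32 shapes are illustrative rather than a quote of the original setup, and these definitions would of course need to come before the lines above:)

using LoopVectorization, BenchmarkTools, LinuxPerf

T = Float32
N_vectors = 5000
random_ddata   = rand(T, 3, 3, N_vectors)   # one small matrix per index i
random_dvector = rand(T, 3, N_vectors)      # one small vector per index i

# reference implementation: one quadratic form per slice
multiplies(M, V, N::Int) = [V[:, i]' * (M[:, :, i] * V[:, i]) for i in 1:N]

# call f(args...) repeatedly so the perf counters integrate over many runs
function foreachf(f::F, N::Int, args::Vararg{Any,A}) where {F,A}
    for _ in 1:N
        f(args...)
    end
    return nothing
end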
julia> function multiplies_turbo1!(O, M, V, N::Int)
           T = eltype(V)
           # unlikely simd does anything here
           @turbo for i in 1:N
               #@inbounds Lvec = V[:,i]
               #@inbounds O[i] = Lvec' * (M[:,:,i] * Lvec)
               Oi = zero(eltype(O))
               for j in axes(V, 1), k in axes(V, 1)
                   Oi += V[j, i] * M[k, j, i] * V[k, i]
               end
               O[i] = Oi
           end
           return O
       end
multiplies_turbo1! (generic function with 1 method)
julia> function multiplies_turbo2!(O, M, V, N::Int)
           T = eltype(V)
           # unlikely simd does anything here
           @turbo for i in 1:N
               #@inbounds Lvec = V[:,i]
               #@inbounds O[i] = Lvec' * (M[:,:,i] * Lvec)
               Oi = zero(eltype(O))
               for j in axes(V, 2), k in axes(V, 2)
                   Oi += V[i, j] * M[i, k, j] * V[i, k]
               end
               O[i] = Oi
           end
           return O
       end
multiplies_turbo2! (generic function with 1 method)
julia> multiplies_turbo1!(O1,random_ddata,random_dvector,N_vectors) ≈ multiplies_turbo2!(O2,random_ddata_p,random_dvector_p,N_vectors) ≈ multiplies(random_ddata,random_dvector,N_vectors)
true
julia> @btime multiplies_turbo1!($O1,$random_ddata,$random_dvector,$N_vectors);
8.087 μs (0 allocations: 0 bytes)
julia> @btime multiplies_turbo2!($O2,$random_ddata_p,$random_dvector_p,$N_vectors);
2.031 μs (0 allocations: 0 bytes)
Note that using the permuted memory layout makes it about 4x faster. LinuxPerf gives a clue as to why:
julia> @pstats "cpu-cycles,(instructions,branch-instructions,branch-misses),(task-clock,context-switches,cpu-migrations,page-faults),(L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses),(dTLB-load-misses,dTLB-loads),(iTLB-load-misses,iTLB-loads),(cache-misses,cache-references)" begin
foreachf(multiplies_turbo1!, 500_000, O1, random_ddata, random_dvector, N_vectors)
end
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
╶ cpu-cycles               1.72e+10   33.3%  #  4.0 cycles per ns
┌ instructions             2.51e+10   33.3%  #  1.5 insns per cycle
│ branch-instructions      8.35e+07   33.3%  #  0.3% of insns
└ branch-misses            7.17e+05   33.3%  #  0.9% of branch insns
┌ task-clock               4.31e+09  100.0%  #  4.3 s
│ context-switches         0.00e+00  100.0%
│ cpu-migrations           0.00e+00  100.0%
└ page-faults              0.00e+00  100.0%
┌ L1-dcache-load-misses    2.04e+09   16.7%  # 19.8% of dcache loads
│ L1-dcache-loads          1.03e+10   16.7%
└ L1-icache-load-misses    3.93e+05   16.7%
┌ dTLB-load-misses         1.80e+01   16.7%  #  0.0% of dTLB loads
└ dTLB-loads               1.03e+10   16.7%
┌ iTLB-load-misses         1.44e+03   33.3%  # 13.0% of iTLB loads
└ iTLB-loads               1.10e+04   33.3%
┌ cache-misses             9.36e+03   33.3%  #  5.5% of cache refs
└ cache-references         1.70e+05   33.3%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
julia> @pstats "cpu-cycles,(instructions,branch-instructions,branch-misses),(task-clock,context-switches,cpu-migrations,page-faults),(L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses),(dTLB-load-misses,dTLB-loads),(iTLB-load-misses,iTLB-loads),(cache-misses,cache-references)" begin
foreachf(multiplies_turbo2!, 500_000, O2, random_ddata_p, random_dvector_p, N_vectors)
end
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
╶ cpu-cycles               4.20e+09   33.3%  #  4.0 cycles per ns
┌ instructions             4.57e+09   33.4%  #  1.1 insns per cycle
│ branch-instructions      2.49e+07   33.4%  #  0.5% of insns
└ branch-misses            3.72e+03   33.4%  #  0.0% of branch insns
┌ task-clock               1.05e+09  100.0%  #  1.1 s
│ context-switches         0.00e+00  100.0%
│ cpu-migrations           0.00e+00  100.0%
└ page-faults              0.00e+00  100.0%
┌ L1-dcache-load-misses    2.04e+09   16.7%  # 81.7% of dcache loads
│ L1-dcache-loads          2.50e+09   16.7%
└ L1-icache-load-misses    5.42e+04   16.7%
┌ dTLB-load-misses         5.99e+01   16.7%  #  0.0% of dTLB loads
└ dTLB-loads               2.50e+09   16.7%
┌ iTLB-load-misses         2.73e+02   33.3%  #  7.3% of iTLB loads
└ iTLB-loads               3.74e+03   33.3%
┌ cache-misses             1.25e+03   33.2%  #  6.2% of cache refs
└ cache-references         2.02e+04   33.2%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
julia> 2.51e10 / 4.57e9
5.49234135667396
Permuting the dimensions like I did reduces the number of instructions needed by a factor of about 5.5.
Note that I decreased N_vectors to just 5000. With this:
julia> sizeof(T) * (length(random_ddata) + length(random_dvector) + length(O2)) / (1 << 20)
0.247955322265625
The combined size of the arrays is about 0.25 MiB, which is small enough to fit in my CPU's 1 MiB L2 cache.
With the arrays this small, the computation isn't as memory-bottlenecked.
The IPC is still low, though, barely more than 1 in the SIMD (permuted) case, and more than 80% of the L1 dcache loads miss.
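(If you want to check the cache sizes on your own machine, one option, assuming you have CpuId.jl installed, is:)

using CpuId
cachesize()   # tuple of data-cache sizes in bytes, smallest (L1d) first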
Decreasing N_vectors down to 500, so that the arrays fit in the L1 cache...
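(Again just a sketch, with the same assumed shapes as above:)

N_vectors = 500
random_ddata     = rand(T, 3, 3, N_vectors)
random_dvector   = rand(T, 3, N_vectors)
random_ddata_p   = permutedims(random_ddata, (3, 1, 2))
random_dvector_p = permutedims(random_dvector, (2, 1))
O1 = Vector{T}(undef, N_vectors); O2 = similar(O1);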
julia> @btime multiplies_turbo1!($O1,$random_ddata,$random_dvector,$N_vectors);
838.746 ns (0 allocations: 0 bytes)
julia> @btime multiplies_turbo2!($O2,$random_ddata_p,$random_dvector_p,$N_vectors);
144.273 ns (0 allocations: 0 bytes)
Now v2 (with the permuted arrays) is about 14x faster than before, which works out to about 1.4x faster per element once you account for the 10x reduction in N_vectors.
The advantage of going to a smaller size seems to be non-existent for the already slower v1, which was bottlenecked on the backend at 1.5 IPC even when the arrays fit in the L2 cache (due to needing a lot of shuffle instructions, which are limited to 1/clock on my desktop). It stayed at 1.5 IPC with the arrays in the L1 cache.
The permuted version, on the other hand, sped up from 1.1 IPC to 1.8 IPC. Again, it also needed many times fewer instructions to get the job done.
julia> @pstats "cpu-cycles,(instructions,branch-instructions,branch-misses),(task-clock,context-switches,cpu-migrations,page-faults),(L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses),(dTLB-load-misses,dTLB-loads),(iTLB-load-misses,iTLB-loads),(cache-misses,cache-references)" begin
foreachf(multiplies_turbo1!, 5_000_000, O1, random_ddata, random_dvector, N_vectors)
end
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
╶ cpu-cycles               1.76e+10   33.3%  #  4.0 cycles per ns
┌ instructions             2.69e+10   33.3%  #  1.5 insns per cycle
│ branch-instructions      1.55e+08   33.3%  #  0.6% of insns
└ branch-misses            1.08e+04   33.3%  #  0.0% of branch insns
┌ task-clock               4.40e+09  100.0%  #  4.4 s
│ context-switches         0.00e+00  100.0%
│ cpu-migrations           0.00e+00  100.0%
└ page-faults              0.00e+00  100.0%
┌ L1-dcache-load-misses    8.66e+08   16.7%  #  7.9% of dcache loads
│ L1-dcache-loads          1.09e+10   16.7%
└ L1-icache-load-misses    8.03e+05   16.7%
┌ dTLB-load-misses         7.20e+01   16.7%  #  0.0% of dTLB loads
└ dTLB-loads               1.10e+10   16.7%
┌ iTLB-load-misses         2.49e+04   33.3%  # 133.2% of iTLB loads
└ iTLB-loads               1.87e+04   33.3%
┌ cache-misses             2.38e+03   33.3%  # 20.1% of cache refs
└ cache-references         1.19e+04   33.3%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
julia> @pstats "cpu-cycles,(instructions,branch-instructions,branch-misses),(task-clock,context-switches,cpu-migrations,page-faults),(L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses),(dTLB-load-misses,dTLB-loads),(iTLB-load-misses,iTLB-loads),(cache-misses,cache-references)" begin
foreachf(multiplies_turbo2!, 5_000_000, O2, random_ddata_p, random_dvector_p, N_vectors)
end
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
╶ cpu-cycles               3.03e+09   33.2%  #  4.0 cycles per ns
┌ instructions             5.58e+09   33.4%  #  1.8 insns per cycle
│ branch-instructions      9.47e+07   33.4%  #  1.7% of insns
└ branch-misses            2.85e+03   33.4%  #  0.0% of branch insns
┌ task-clock               7.59e+08  100.0%  # 759.3 ms
│ context-switches         0.00e+00  100.0%
│ cpu-migrations           0.00e+00  100.0%
└ page-faults              0.00e+00  100.0%
┌ L1-dcache-load-misses    1.46e+08   16.7%  #  5.2% of dcache loads
│ L1-dcache-loads          2.80e+09   16.7%
└ L1-icache-load-misses    5.86e+04   16.7%
┌ dTLB-load-misses         2.99e+01   16.7%  #  0.0% of dTLB loads
└ dTLB-loads               2.81e+09   16.7%
┌ iTLB-load-misses         1.74e+03   33.3%  # 67.7% of iTLB loads
└ iTLB-loads               2.57e+03   33.3%
┌ cache-misses             4.01e+02   33.2%  # 11.8% of cache refs
└ cache-references         3.41e+03   33.2%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
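(For comparison with the 5.5x figure above, the instruction counts for the L1-resident runs show a similar gap:)

julia> round(2.69e10 / 5.58e9, digits = 2)
4.82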
So, no, you were not looping in the correct order.
Which order is "correct" is a little more complicated than "columns are fastest".
In computing the quadratic form Lvec' * Mmat * Lvec, each operation depends on the previous: we cannot parallelize a single quadratic form.
However, we are doing a huge number of these quadratic forms.
Thus, to speed up the code, the obvious answer is to perform many of them in parallel via SIMD instructions.
That means we want it to be fast to load multiple Lvecs and Mmats at a time from memory.
We want each particular element, e.g. Lvec[i], of the 1st, 2nd, 3rd... Lvec to be next to each other in memory.
We do not want Lvec[i] to be next to Lvec[i+1], as we don't get to use those two elements in parallel anyway.
In v1, Lvec[i] is next to Lvec[i+1], even though we can't use those two numbers in parallel. That is why v1 has to do a huge amount of work to permute the data on the fly into v2's order (and it may be doing that inefficiently).
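(To make the layout point concrete, here is a toy illustration, assuming length-3 Float32 vectors and 8 SIMD lanes; the actual lane count depends on your hardware and element type:)

V   = rand(Float32, 3, 8)      # v1 layout: column i is the i-th Lvec
V_p = permutedims(V, (2, 1))   # v2 layout: 8x3, one row per Lvec

# Column-major memory order of V:   Lvec_1[1], Lvec_1[2], Lvec_1[3], Lvec_2[1], ...
# Column-major memory order of V_p: Lvec_1[1], Lvec_2[1], ..., Lvec_8[1], Lvec_1[2], ...
# So a single contiguous SIMD load from V_p fetches the same component of 8
# different Lvecs, which is exactly what 8 lanes need to evaluate 8 quadratic
# forms at once; loading from V instead requires gathers/shuffles.
V_p[:, 1] == V[1, :]   # true: component 1 of all 8 Lvecs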