Repeated contractions - Vector-of-StaticArrays vs HybridArrays.jl, discrepancy in performance between Float64 and Float32

Sorry, forgot the code:

julia> O1 = Vector{T}(undef, N_vectors); O2 = similar(O1);

julia> random_ddata_p = permutedims(random_ddata, (3, 1, 2));

julia> random_dvector_p = permutedims(random_dvector, (2, 1));

julia> N_vectors
5000
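
(The arrays random_ddata and random_dvector are defined earlier in the thread. For context, they are presumably built along the lines of the sketch below, with T = Float32 and 3x3 matrices, an assumption consistent with the ~0.25 MiB size computed further down:)

T = Float32                               # assumed element type (matches the size calculation below)
random_ddata = rand(T, 3, 3, N_vectors)   # M[:, :, i] is the i-th 3x3 matrix
random_dvector = rand(T, 3, N_vectors)    # V[:, i] is the i-th 3-vector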

julia> function multiplies_turbo1!(O,M,V,N::Int)
           T = eltype(V)
           #unlikely simd does anything here
           @turbo for i in 1:N
               #@inbounds Lvec = V[:,i]
               #@inbounds O[i] = Lvec' * (M[:,:,i] * Lvec)
               Oi = zero(eltype(O))
               for j in axes(V,1), k in axes(V,1)
                   Oi += V[j,i]*M[k,j,i]*V[k,i]
               end
               O[i] = Oi
           end
           return O
       end
multiplies_turbo1! (generic function with 1 method)

julia> function multiplies_turbo2!(O,M,V,N::Int)
           T = eltype(V)
           #unlikely simd does anything here
           @turbo for i in 1:N
               #@inbounds Lvec = V[:,i]
               #@inbounds O[i] = Lvec' * (M[:,:,i] * Lvec)
               Oi = zero(eltype(O))
               for j in axes(V,2), k in axes(V,2)
                   Oi += V[i,j]*M[i,k,j]*V[i,k]
               end
               O[i] = Oi
           end
           return O
       end
multiplies_turbo2! (generic function with 1 method)

julia> multiplies_turbo1!(O1,random_ddata,random_dvector,N_vectors) ≈ multiplies_turbo2!(O2,random_ddata_p,random_dvector_p,N_vectors) ≈ multiplies(random_ddata,random_dvector,N_vectors)
true

julia> @btime multiplies_turbo1!($O1,$random_ddata,$random_dvector,$N_vectors);
  8.087 μs (0 allocations: 0 bytes)

julia> @btime multiplies_turbo2!($O2,$random_ddata_p,$random_dvector_p,$N_vectors);
  2.031 μs (0 allocations: 0 bytes)

Note that using the permuted memory layout makes it about 4x faster. LinuxPerf gives a clue as to why:
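
(The foreachf used below is a helper from earlier in the thread; it is presumably just something like this minimal sketch, which calls the function repeatedly so the counters accumulate over enough work:)

function foreachf(f::F, N::Int, args::Vararg{Any,K}) where {F,K}
    for _ in 1:N
        f(args...)   # repeat the call N times so the perf counters integrate over enough work
    end
    return nothing
end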

julia> @pstats "cpu-cycles,(instructions,branch-instructions,branch-misses),(task-clock,context-switches,cpu-migrations,page-faults),(L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses),(dTLB-load-misses,dTLB-loads),(iTLB-load-misses,iTLB-loads),(cache-misses,cache-references)" begin
           foreachf(multiplies_turbo1!, 500_000, O1, random_ddata, random_dvector, N_vectors)
       end
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
╶ cpu-cycles               1.72e+10   33.3%  #  4.0 cycles per ns
┌ instructions             2.51e+10   33.3%  #  1.5 insns per cycle
│ branch-instructions      8.35e+07   33.3%  #  0.3% of insns
└ branch-misses            7.17e+05   33.3%  #  0.9% of branch insns
┌ task-clock               4.31e+09  100.0%  #  4.3 s
│ context-switches         0.00e+00  100.0%
│ cpu-migrations           0.00e+00  100.0%
└ page-faults              0.00e+00  100.0%
┌ L1-dcache-load-misses    2.04e+09   16.7%  # 19.8% of dcache loads
│ L1-dcache-loads          1.03e+10   16.7%
└ L1-icache-load-misses    3.93e+05   16.7%
┌ dTLB-load-misses         1.80e+01   16.7%  #  0.0% of dTLB loads
└ dTLB-loads               1.03e+10   16.7%
┌ iTLB-load-misses         1.44e+03   33.3%  # 13.0% of iTLB loads
└ iTLB-loads               1.10e+04   33.3%
┌ cache-misses             9.36e+03   33.3%  #  5.5% of cache refs
└ cache-references         1.70e+05   33.3%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

julia> @pstats "cpu-cycles,(instructions,branch-instructions,branch-misses),(task-clock,context-switches,cpu-migrations,page-faults),(L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses),(dTLB-load-misses,dTLB-loads),(iTLB-load-misses,iTLB-loads),(cache-misses,cache-references)" begin
           foreachf(multiplies_turbo2!, 500_000, O2, random_ddata_p, random_dvector_p, N_vectors)
       end
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
╶ cpu-cycles               4.20e+09   33.3%  #  4.0 cycles per ns
┌ instructions             4.57e+09   33.4%  #  1.1 insns per cycle
│ branch-instructions      2.49e+07   33.4%  #  0.5% of insns
└ branch-misses            3.72e+03   33.4%  #  0.0% of branch insns
┌ task-clock               1.05e+09  100.0%  #  1.1 s
│ context-switches         0.00e+00  100.0%
│ cpu-migrations           0.00e+00  100.0%
└ page-faults              0.00e+00  100.0%
┌ L1-dcache-load-misses    2.04e+09   16.7%  # 81.7% of dcache loads
│ L1-dcache-loads          2.50e+09   16.7%
└ L1-icache-load-misses    5.42e+04   16.7%
┌ dTLB-load-misses         5.99e+01   16.7%  #  0.0% of dTLB loads
└ dTLB-loads               2.50e+09   16.7%
┌ iTLB-load-misses         2.73e+02   33.3%  #  7.3% of iTLB loads
└ iTLB-loads               3.74e+03   33.3%
┌ cache-misses             1.25e+03   33.2%  #  6.2% of cache refs
└ cache-references         2.02e+04   33.2%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

julia> 2.51e10 / 4.57e9
5.49234135667396

Permuting the dimensions like I did reduces the number of instructions needed by a factor of about 5.5.

Note that I decreased N_vectors to just 5000. With this:

julia> sizeof(T) * (length(random_ddata) + length(random_dvector) + length(O2)) / (1 << 20)
0.247955322265625

The combined size of the arrays is about 0.25 MiB, which is small enough to fit in my CPU’s 1 MiB L2 cache.
With the arrays this small, the loop isn’t as memory-bottlenecked.
Still, the IPC is low, barely more than 1 for the permuted (SIMD-friendly) version, and more than 80% of its L1 dcache loads miss.

Decreasing N_vectors down to 500, so that the arrays fit in the L1 cache…
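
(Same size estimate as above, assuming T = Float32 and 3x3 matrices:)

sizeof(Float32) * (3*3 + 3 + 1) * 500 / 1024   # ≈ 25.4 KiB, comfortably inside a typical 32 KiB L1d cache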

julia> @btime multiplies_turbo1!($O1,$random_ddata,$random_dvector,$N_vectors);
  838.746 ns (0 allocations: 0 bytes)

julia> @btime multiplies_turbo2!($O2,$random_ddata_p,$random_dvector_p,$N_vectors);
  144.273 ns (0 allocations: 0 bytes)

Now v2 (with the permuted arrays) is 14x faster than before, i.e. 1.4x faster after accounting for the 10x reduction in the number of elements.
The advantage of going to a smaller size seems to be non-existent for the already slower v1, which was bottlenecked on the backend at 1.5 IPC even when the arrays fit in the L2 cache (due to needing a lot of shuffle instructions, which are limited to 1/clock on my desktop); it stayed at 1.5 IPC with the arrays in the L1 cache.
The permuted version, on the other hand, sped up from 1.1 IPC to 1.8 IPC, and again it needed many times fewer instructions to get the job done.

julia> @pstats "cpu-cycles,(instructions,branch-instructions,branch-misses),(task-clock,context-switches,cpu-migrations,page-faults),(L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses),(dTLB-load-misses,dTLB-loads),(iTLB-load-misses,iTLB-loads),(cache-misses,cache-references)" begin
           foreachf(multiplies_turbo1!, 5_000_000, O1, random_ddata, random_dvector, N_vectors)
       end
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
╶ cpu-cycles               1.76e+10   33.3%  #  4.0 cycles per ns
┌ instructions             2.69e+10   33.3%  #  1.5 insns per cycle
│ branch-instructions      1.55e+08   33.3%  #  0.6% of insns
└ branch-misses            1.08e+04   33.3%  #  0.0% of branch insns
┌ task-clock               4.40e+09  100.0%  #  4.4 s
│ context-switches         0.00e+00  100.0%
│ cpu-migrations           0.00e+00  100.0%
└ page-faults              0.00e+00  100.0%
┌ L1-dcache-load-misses    8.66e+08   16.7%  #  7.9% of dcache loads
│ L1-dcache-loads          1.09e+10   16.7%
└ L1-icache-load-misses    8.03e+05   16.7%
┌ dTLB-load-misses         7.20e+01   16.7%  #  0.0% of dTLB loads
└ dTLB-loads               1.10e+10   16.7%
┌ iTLB-load-misses         2.49e+04   33.3%  # 133.2% of iTLB loads
└ iTLB-loads               1.87e+04   33.3%
┌ cache-misses             2.38e+03   33.3%  # 20.1% of cache refs
└ cache-references         1.19e+04   33.3%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

julia> @pstats "cpu-cycles,(instructions,branch-instructions,branch-misses),(task-clock,context-switches,cpu-migrations,page-faults),(L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses),(dTLB-load-misses,dTLB-loads),(iTLB-load-misses,iTLB-loads),(cache-misses,cache-references)" begin
           foreachf(multiplies_turbo2!, 5_000_000, O2, random_ddata_p, random_dvector_p, N_vectors)
       end
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
╶ cpu-cycles               3.03e+09   33.2%  #  4.0 cycles per ns
┌ instructions             5.58e+09   33.4%  #  1.8 insns per cycle
│ branch-instructions      9.47e+07   33.4%  #  1.7% of insns
└ branch-misses            2.85e+03   33.4%  #  0.0% of branch insns
┌ task-clock               7.59e+08  100.0%  # 759.3 ms
│ context-switches         0.00e+00  100.0%
│ cpu-migrations           0.00e+00  100.0%
└ page-faults              0.00e+00  100.0%
┌ L1-dcache-load-misses    1.46e+08   16.7%  #  5.2% of dcache loads
│ L1-dcache-loads          2.80e+09   16.7%
└ L1-icache-load-misses    5.86e+04   16.7%
┌ dTLB-load-misses         2.99e+01   16.7%  #  0.0% of dTLB loads
└ dTLB-loads               2.81e+09   16.7%
┌ iTLB-load-misses         1.74e+03   33.3%  # 67.7% of iTLB loads
└ iTLB-loads               2.57e+03   33.3%
┌ cache-misses             4.01e+02   33.2%  # 11.8% of cache refs
└ cache-references         3.41e+03   33.2%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

So, no, you were not looping in the correct order.
Which order is “correct” is a little more complicated than “columns are fastest”.

In computing the quadratic form Lvec' * Mmat * Lvec, each operation depends on the previous.
We cannot parallelize the quadratic form.
However, we are doing a huge number of these quadratic forms.
Thus, to speed up the code, the obvious answer is to perform these in parallel via SIMD instructions.
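
(For reference, here is a plain non-@turbo version of the same kernel, a sketch of what multiplies from earlier in the thread presumably amounts to; the name multiplies_ref! is just for illustration:)

function multiplies_ref!(O, M, V, N::Int)
    for i in 1:N
        # one quadratic form V[:, i]' * M[:, :, i] * V[:, i]:
        # within a single i the multiply–add chain is serial,
        # so any SIMD speedup must come from processing several values of i at once
        Oi = zero(eltype(O))
        for j in axes(V, 1), k in axes(V, 1)
            Oi += V[j, i] * M[k, j, i] * V[k, i]
        end
        O[i] = Oi
    end
    return O
end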

Thus, we want to be able to quickly load multiple Lvecs and Mmats at a time from memory.
That means we want a given element, e.g. Lvec[i], of the 1st, 2nd, 3rd… Lvecs to sit next to each other in memory.
We do not want Lvec[i] to be next to Lvec[i+1], as we don’t get to use those two elements in parallel anyway.

In v1, Lvec[i] is next to Lvec[i+1], even though we can’t use these two numbers in parallel. That is why it does a huge amount of work to permute them on the fly into v2’s order (and maybe it does that inefficiently).
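
(Concretely, in the permuted layout the vector index is the fastest-moving one, so the same component of consecutive vectors is contiguous in memory; assuming 3-vectors as above:)

stride(random_dvector_p, 1)   # 1: V_p[i, 1] and V_p[i+1, 1] are adjacent, so one SIMD load grabs many i at once
stride(random_dvector, 2)     # 3: in the original layout, V[1, i] and V[1, i+1] are a whole vector apart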
