Hello!
I have made a toy script here,
Summary
Toy script example
using StaticArrays
using LoopVectorization
using StructArrays
using BenchmarkTools
"""
    DimensionalData{D, T}

Container for `D` component vectors with element type `T`.

# Fields
- `vectors`: tuple holding the `D` component vectors.
- `V`: `StructArray` with element type `SVector{D, T}` constructed from `vectors`.

# Constructors
- `DimensionalData(v1, v2, ...)`: build from existing `Vector{T}`s; `D` is the
  number of vectors passed.
- `DimensionalData{D, T}(len)`: build `D` zero-filled vectors of length `len`.
"""
struct DimensionalData{D, T}
    vectors::Tuple{Vararg{Vector{T}, D}}
    # NOTE(review): recent StructArrays releases give `StructArray` a fourth
    # (index-type) parameter; if so this annotation is not fully concrete.
    # Confirm against the installed version before tightening it.
    V::StructArray{SVector{D, T}, 1, Tuple{Vararg{Vector{T}, D}}}

    # General constructor for vectors. Taking `D` from the `Vararg` length makes
    # it a static parameter, so the resulting type is inferable from the call.
    function DimensionalData(vectors::Vararg{Vector{T}, D}) where {T, D}
        V = StructArray{SVector{D, T}}(vectors)
        return new{D, T}(vectors, V)
    end

    # Constructor for initializing with all zeros, adapting to dimension D.
    function DimensionalData{D, T}(len::Int) where {D, T}
        vectors = ntuple(_ -> zeros(T, len), Val(D)) # D independent zero vectors
        V = StructArray{SVector{D, T}}(vectors)
        return new{D, T}(vectors, V)
    end
end
"""
    updateV!(result::DimensionalData, data::DimensionalData, I, J)

Plain-loop reference kernel. For every component `d` of `result` and every
position `iter` common to `I` and `J`, store
`data.vectors[d][I[iter]] - data.vectors[d][J[iter]]` in `result.vectors[d][iter]`.

# Arguments
- `result`: destination; its component vectors are overwritten in place.
- `data`: source values, indexed through the entries of `I` and `J`.
- `I`, `J`: index collections traversed together via `eachindex(I, J)`.

Returns `nothing`.
"""
function updateV!(result::DimensionalData, data::DimensionalData, I, J)
    for d in eachindex(result.vectors)
        # Look the component vectors up once per dimension, not once per element.
        dst = result.vectors[d]
        src = data.vectors[d]
        for iter in eachindex(I, J)
            i, j = I[iter], J[iter]
            dst[iter] = src[i] - src[j]
        end
    end
    return nothing
end
"""
    updateVT!(result::DimensionalData, data::DimensionalData, I, J)

`@tturbo` variant of the difference kernel: loops over the components of
`result` and, per component, runs a `@tturbo` loop that stores
`data.vectors[d][I[iter]] - data.vectors[d][J[iter]]` in `result.vectors[d][iter]`.

# Arguments
- `result`: destination; its component vectors are overwritten in place.
- `data`: source values, indexed through the entries of `I` and `J`.
- `I`, `J`: index collections traversed together via `eachindex(I, J)`.

The destination range is bounds-checked up front because the `@tturbo` loop
itself is not. The *values* stored in `I` and `J` are not validated; the caller
must ensure they are valid indices into the vectors of `data`.

Returns `nothing`.
"""
function updateVT!(result::DimensionalData, data::DimensionalData, I, J)
    for d in eachindex(result.vectors)
        # Bind the component vectors to locals so the macro sees plain arrays.
        dst = result.vectors[d]
        src = data.vectors[d]
        checkbounds(dst, eachindex(I, J))
        @tturbo for iter in eachindex(I, J)
            i = I[iter]
            j = J[iter]
            dst[iter] = src[i] - src[j]
        end
    end
    return nothing
end
"""
    updateVTManual!(result::DimensionalData, data::DimensionalData, I, J)

Hand-unrolled `@tturbo` variant of the difference kernel for two components:
a single loop over `eachindex(I, J)` that writes the differences for
components 1 and 2 in the same iteration.

Only `vectors[1]` and `vectors[2]` of `result` and `data` are accessed; any
further components are left untouched.

The destination ranges are bounds-checked up front because the `@tturbo` loop
itself is not. The *values* stored in `I` and `J` are not validated; the caller
must ensure they are valid indices into the vectors of `data`.

Returns `nothing`.
"""
function updateVTManual!(result::DimensionalData, data::DimensionalData, I, J)
    dst1, dst2 = result.vectors[1], result.vectors[2]
    src1, src2 = data.vectors[1], data.vectors[2]
    checkbounds(dst1, eachindex(I, J))
    checkbounds(dst2, eachindex(I, J))
    @tturbo for iter in eachindex(I, J)
        i = I[iter]
        j = J[iter]
        dst1[iter] = src1[i] - src1[j] # difference for the first component
        dst2[iter] = src2[i] - src2[j] # difference for the second component
    end
    return nothing
end
# Benchmark the three kernels on D-dimensional data: `data` holds D random
# vectors of length N, and `P` receives NL differences per component, one for
# each (I[k], J[k]) index pair.
let
    D = 2 # keep at 2: updateVTManual! is hand-unrolled for exactly two components
    T = Float64
    N = 10000
    NL = 500000
    data = DimensionalData(ntuple(_ -> rand(T, N), D)...)
    I = rand(1:N, NL)
    J = rand(1:N, NL)
    P = DimensionalData{D, T}(NL)
    println("Naive:")
    display(@benchmark updateV!($P, $data, $I, $J))
    println("Turbo:")
    display(@benchmark updateVT!($P, $data, $I, $J))
    println("Turbo Manual Unroll:")
    display(@benchmark updateVTManual!($P, $data, $I, $J))
end
to showcase what I am trying to figure out. The issue is that I am working with data which can be 1-, 2- or 3-dimensional. The toy script simply calculates the differences between values. When I benchmark it, I see the following results:
**Naive:**
BenchmarkTools.Trial: 2290 samples with 1 evaluation.
Range (min … max): 1.544 ms … 6.255 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 1.814 ms ┊ GC (median): 0.00%
Time (mean ± σ): 2.170 ms ± 788.018 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
ββββββ
β
β
ββββββββ β βββββββ β
ββββββββββββββββββββββββββ
ββ
ββββββββββββββββββββ
β
ββββ
β
ββ
ββ
β β
1.54 ms Histogram: log(frequency) by time 4.84 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
**Turbo:**
BenchmarkTools.Trial: 2881 samples with 1 evaluation.
Range (min … max): 1.372 ms … 31.307 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 1.633 ms ┊ GC (median): 0.00%
Time (mean ± σ): 1.721 ms ± 626.874 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
βββββββ
ββββββββββββββ
ββββ
β
β
β
ββββ
β
ββ
βββββββββββββββββββββββββββββββ β
1.37 ms Histogram: frequency by time 2.54 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
**Turbo Manual Unroll:**
BenchmarkTools.Trial: 3856 samples with 1 evaluation.
Range (min … max): 984.900 μs … 2.480 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 1.219 ms ┊ GC (median): 0.00%
Time (mean ± σ): 1.288 ms ± 238.877 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
ββββββββ
β
βββ
βββ β
ββββββββββββββββββββββββββββββ
βββ
β
ββ
β
ββββββββββββββββββββββββ β
985 μs Histogram: frequency by time 2.07 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
And I see that while `@tturbo` provides a performance boost in both cases, the version that loops over the dimension `d` loses roughly 20% performance compared to the version where I manually unroll the loop.
Is there a way in which I can get the highest speed possible while not having to manually unroll the loop?
Thanks!