Hello!
I have made a toy script here,
Summary
Toy script example
using StaticArrays
using LoopVectorization
using StructArrays
using BenchmarkTools
"""
    DimensionalData{D, T}

Hold `D` component vectors with element type `T`, together with a `StructArray`
of `SVector{D, T}` built from those same vectors.

# Fields
- `vectors`: tuple of the `D` component vectors.
- `V`: `StructArray{SVector{D, T}}` constructed from `vectors`
  (presumably a non-copying wrapper; confirm against the StructArrays docs).

# Constructors
- `DimensionalData(vectors::Vector{T}...)`: wrap the given component vectors;
  `D` is the number of vectors passed.
- `DimensionalData{D, T}(len)`: create `D` zero-filled vectors of length `len`.
"""
struct DimensionalData{D, T}
    vectors::NTuple{D, Vector{T}}
    # NOTE(review): depending on the StructArrays version, `StructArray` may take a
    # further (index-type) parameter, in which case this annotation is not fully
    # concrete -- confirm before relying on it in hot code.
    V::StructArray{SVector{D, T}, 1, NTuple{D, Vector{T}}}

    # General constructor for vectors. `D` is bound as a static parameter of the
    # varargs so it is known from the argument types instead of a runtime `length`.
    function DimensionalData(vectors::Vararg{Vector{T}, D}) where {T, D}
        V = StructArray{SVector{D, T}}(vectors)
        return new{D, T}(vectors, V)
    end

    # Constructor for initializing with all zeros, adapting to dimension D.
    function DimensionalData{D, T}(len::Integer) where {D, T}
        # `Val(D)` lets `ntuple` produce a tuple whose length is known to the compiler.
        vectors = ntuple(_ -> zeros(T, len), Val(D))
        V = StructArray{SVector{D, T}}(vectors)
        return new{D, T}(vectors, V)
    end
end
"""
    updateV!(result::DimensionalData, data::DimensionalData, I, J)

Plain-loop reference kernel. For every component `d` of `result` and every
position `k` shared by `I` and `J`, store `data.vectors[d][I[k]] - data.vectors[d][J[k]]`
into `result.vectors[d][k]`. Mutates `result` in place and returns `nothing`.
"""
function updateV!(result::DimensionalData, data::DimensionalData, I, J)
    ncomponents = length(result.vectors)
    # Outer loop over components, inner loop over the index pairs.
    for d in 1:ncomponents, k in eachindex(I, J)
        a = I[k]
        b = J[k]
        result.vectors[d][k] = data.vectors[d][a] - data.vectors[d][b]
    end
end
# updateVT!(result, data, I, J)
#
# Same computation as updateV!, but the inner loop over the index pairs is wrapped
# in LoopVectorization's @tturbo macro. For each component d of `result`, writes
#     result.vectors[d][iter] = data.vectors[d][I[iter]] - data.vectors[d][J[iter]]
# for every iter in eachindex(I, J). Mutates `result` in place.
#
# Because the @tturbo loop sits inside the loop over d, the index arrays I and J are
# traversed once per component (unlike updateVTManual!, which reads each I[iter],
# J[iter] pair a single time for both components).
#
# NOTE(review): LoopVectorization documents that @turbo-style loops are not
# bounds-checked -- confirm that I and J only hold valid indices into `data`.
function updateVT!(result::DimensionalData, data::DimensionalData, I, J)
# Outer loop over the components; only the inner loop is handed to @tturbo.
for d ∈ 1:length(result.vectors)
@tturbo for iter ∈ eachindex(I,J)
i, j = I[iter], J[iter]
result.vectors[d][iter] = data.vectors[d][i] - data.vectors[d][j] # Compute the difference for the d-th dimension
end
end
end
# updateVTManual!(result, data, I, J)
#
# Hand-unrolled variant of updateVT!: a single @tturbo loop over eachindex(I, J) in
# which each I[iter], J[iter] pair is read once and used for both components.
# Mutates `result` in place.
#
# Only components 1 and 2 of `vectors` are read and written, so this version is
# specific to two-dimensional data; any further component of `result` is left
# untouched.
#
# NOTE(review): LoopVectorization documents that @turbo-style loops are not
# bounds-checked -- confirm that I and J only hold valid indices into `data`.
function updateVTManual!(result::DimensionalData, data::DimensionalData, I, J)
@tturbo for iter ∈ eachindex(I,J)
i, j = I[iter], J[iter]
result.vectors[1][iter] = data.vectors[1][i] - data.vectors[1][j] # Compute the difference for dimension 1
result.vectors[2][iter] = data.vectors[2][i] - data.vectors[2][j] # Compute the difference for dimension 2
end
end
# Benchmark the three kernels on D-dimensional data: `data` holds D random vectors
# of length N, I and J hold NL random indices into them, and P is a zero-initialised
# output with D vectors of length NL.
let
    # NOTE: updateVTManual! only handles components 1 and 2, so keep D == 2 here.
    D = 2
    T = Float64
    N = 10000
    NL = 500000
    data = DimensionalData(ntuple(_ -> rand(T, N), D)...)
    I = rand(1:N, NL)
    J = rand(1:N, NL)
    P = DimensionalData{D, T}(NL)
    println("Naive:"); display(@benchmark updateV!($P, $data, $I, $J))
    println("Turbo:"); display(@benchmark updateVT!($P, $data, $I, $J))
    println("Turbo Manual Unroll:"); display(@benchmark updateVTManual!($P, $data, $I, $J))
end
to showcase what I am trying to figure out. The issue I have is that I am working with data which can be either 1-, 2- or 3-dimensional. The toy script simply calculates the differences between values. When I benchmark it, I see the following results:
**Naive:**
BenchmarkTools.Trial: 2290 samples with 1 evaluation.
Range (min … max): 1.544 ms … 6.255 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 1.814 ms ┊ GC (median): 0.00%
Time (mean ± σ): 2.170 ms ± 788.018 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▆█▇▆▆▅▅▅▄▂▃▁▂▂▁▂ ▁ ▁▂▂▁▂▁▁ ▁
███████████████████▆█▇▇█▆▅▇▅▆████████▆██▇█▇▆▇▇▆▅▅▄▄▆▅▅▄▅▄▅▆ █
1.54 ms Histogram: log(frequency) by time 4.84 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
**Turbo:**
BenchmarkTools.Trial: 2881 samples with 1 evaluation.
Range (min … max): 1.372 ms … 31.307 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 1.633 ms ┊ GC (median): 0.00%
Time (mean ± σ): 1.721 ms ± 626.874 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▆█▆▂▁▂▂
▃████████▇▇▇▇▅▇▇▆▅▅▅▅▆▄▆▅▅▆▅▄▃▄▄▃▄▃▃▃▃▃▃▃▂▂▃▂▁▂▂▂▂▁▂▂▁▁▁▁▁▁ ▃
1.37 ms Histogram: frequency by time 2.54 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
**Turbo Manual Unroll:**
BenchmarkTools.Trial: 3856 samples with 1 evaluation.
Range (min … max): 984.900 μs … 2.480 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 1.219 ms ┊ GC (median): 0.00%
Time (mean ± σ): 1.288 ms ± 238.877 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▃▆▆█▆▇▆▅▅▄▃▅▂▃▁ ▁
▄██████████████████▇█▇█▆▆▆▇▆▆▅▆▆▅▅▄▅▅▄▄▄▄▄▃▄▃▃▃▃▃▃▃▃▂▃▂▂▁▂▂▂▂ ▅
985 μs Histogram: frequency by time 2.07 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
I see that while `@tturbo` provides a performance boost in both cases, looping over the dimension `d` instead of manually unrolling the loop costs me roughly 20 percent in performance.
Is there a way in which I can get the highest speed possible while not having to manually unroll the loop?
Thanks!