Hi!
I am trying to build a lower triangular storage type to reduce the memory footprint of some applications. In my case, when computing spherical harmonics, the coefficients are usually placed in a lower triangular matrix. This layout is useful in many applications, such as computing the geomagnetic field and the gravitational acceleration.
My current implementation is:
abstract type AbstractDataAlignment end

struct RowMajor <: AbstractDataAlignment end
struct ColumnMajor <: AbstractDataAlignment end

struct LowerTriangularStorage{Ta, Tt} <: AbstractMatrix{Tt}
    @static if VERSION >= v"1.11-"
        data::Memory{Tt}
    else
        data::Vector{Tt}
    end
    n::Int

    function LowerTriangularStorage{Ta, Tt}(n::Int) where {Ta <: AbstractDataAlignment, Tt}
        n < 1 && throw(ArgumentError("Matrix size must be positive"))

        # A lower triangular matrix of size n × n has n (n + 1) / 2 elements.
        len = (n * (n + 1)) ÷ 2

        data = @static if VERSION >= v"1.11-"
            Memory{Tt}(undef, len)
        else
            Vector{Tt}(undef, len)
        end

        return new{Ta, Tt}(data, n)
    end

    function LowerTriangularStorage{Ta}(n::Int) where Ta <: AbstractDataAlignment
        return LowerTriangularStorage{Ta, Float64}(n)
    end

    function LowerTriangularStorage{T}(n::Int) where T
        return LowerTriangularStorage{ColumnMajor, T}(n)
    end

    function LowerTriangularStorage(n::Int)
        return LowerTriangularStorage{ColumnMajor, Float64}(n)
    end
end
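With the inner constructors above, all of the following forms work (ColumnMajor and Float64 are the defaults):

julia> L = LowerTriangularStorage(3)                     # ColumnMajor, Float64
julia> L = LowerTriangularStorage{Float32}(3)            # ColumnMajor, Float32
julia> L = LowerTriangularStorage{RowMajor}(3)           # RowMajor, Float64
julia> L = LowerTriangularStorage{RowMajor, Float32}(3)  # RowMajor, Float32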
@inline Base.@propagate_inbounds function Base.getindex(
    L::LowerTriangularStorage{Ta, Tt},
    i::Int,
    j::Int
) where {Ta <: AbstractDataAlignment, Tt}
    # Indices in the upper triangle (j > i) are out of bounds for this storage.
    Base.@boundscheck (i < 1 || j < 1 || i > L.n || j > L.n || j > i) &&
        Base.throw_boundserror(L, (i, j))
    return (@inbounds L.data[_axes_to_index(L, i, j)])
end

@inline Base.@propagate_inbounds function Base.setindex!(
    L::LowerTriangularStorage{Ta, Tt},
    v,
    i::Int,
    j::Int
) where {Ta <: AbstractDataAlignment, Tt}
    Base.@boundscheck (i < 1 || j < 1 || i > L.n || j > L.n || j > i) &&
        Base.throw_boundserror(L, (i, j))
    @inbounds L.data[_axes_to_index(L, i, j)] = v
    return L
end
Base.size(L::LowerTriangularStorage) = (L.n, L.n)

function Base.zeros(::Type{T}, n::Int) where T <: LowerTriangularStorage
    L = T(n)
    L.data .= zero(eltype(L.data))
    return L
end

function Base.replace_in_print_matrix(::LowerTriangularStorage, i::Int, j::Int, s::AbstractString)
    # The lower triangle is printed as-is. Entries above the diagonal are not
    # stored: "#undef" placeholders become a "×" marker and anything else
    # becomes blanks of the same width.
    i >= j && return s
    s == "#undef" && return " × " * repeat(" ", length(s) - 3)
    return repeat(" ", length(s))
end
# Row-major packing: rows are stored sequentially, and row i holds its first i
# columns, so (i, j) maps to i (i - 1) / 2 + j.
@inline function _axes_to_index(::LowerTriangularStorage{RowMajor}, i::Integer, j::Integer)
    return (i * (i - 1)) ÷ 2 + j
end

# Column-major packing: columns are stored sequentially, and column j holds
# rows j through n. Column j starts at offset (j - 1) (2n + 2 - j) / 2 and
# (i, j) is its (i - j + 1)-th entry, which simplifies to the expression below.
@inline function _axes_to_index(L::LowerTriangularStorage{ColumnMajor}, i::Integer, j::Integer)
    return ((j - 1) * (2L.n - j) + 2i) ÷ 2
end
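As a sanity check of the two index formulas (a throwaway snippet, not part of the implementation), I fill both layouts through setindex! and compare every stored entry against a dense reference:

using Test

n  = 7
Lr = LowerTriangularStorage{RowMajor}(n)
Lc = LowerTriangularStorage{ColumnMajor}(n)
A  = zeros(n, n)

# Fill the lower triangle of all three containers with distinct values.
for j in 1:n, i in j:n
    v = 10.0i + j
    A[i, j]  = v
    Lr[i, j] = v
    Lc[i, j] = v
end

# Both layouts must reproduce the dense reference entry by entry.
@test all(Lr[i, j] == A[i, j] for j in 1:n for i in j:n)
@test all(Lc[i, j] == A[i, j] for j in 1:n for i in j:n)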
For very large matrices the gain is huge: at n = 2200, a dense Float64 matrix takes 2200² × 8 bytes ≈ 38.7 MB, while the triangular storage needs only 2200 · 2201 / 2 × 8 bytes ≈ 19.4 MB (about 50% less), and computing the derivatives of the Legendre coefficients is also roughly twice as fast:
julia> using SatelliteToolboxLegendre, BenchmarkTools
julia> P = LowerTriangularStorage{RowMajor}(2200);
julia> dP = LowerTriangularStorage{RowMajor}(2200);
julia> Pm = zeros(2200, 2200);
julia> dPm = zeros(2200, 2200);
julia> legendre!(Val(:full), P, 0.4)
julia> legendre!(Val(:full), Pm, 0.4)
julia> @benchmark dlegendre!(Val(:full), $dPm, 0.4, $Pm)
BenchmarkTools.Trial: 574 samples with 1 evaluation per sample.
 Range (min … max):  8.503 ms … 10.539 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     8.602 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   8.719 ms ± 290.209 μs ┊ GC (mean ± σ):  0.00% ± 0.00%

  8.5 ms        Histogram: log(frequency) by time        9.85 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.
julia> @benchmark dlegendre!(Val(:full), $dP, 0.4, $P)
BenchmarkTools.Trial: 1172 samples with 1 evaluation per sample.
 Range (min … max):  4.156 ms … 5.287 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     4.253 ms             ┊ GC (median):    0.00%
 Time  (mean ± σ):   4.267 ms ± 83.894 μs ┊ GC (mean ± σ):  0.00% ± 0.00%

  4.16 ms         Histogram: frequency by time         4.65 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.
However, for small matrices, the new implementation leads to a performance degradation (roughly 25% at n = 20):
julia> using SatelliteToolboxLegendre, BenchmarkTools
julia> P = LowerTriangularStorage{RowMajor}(20);
julia> dP = LowerTriangularStorage{RowMajor}(20);
julia> Pm = zeros(20, 20);
julia> dPm = zeros(20, 20);
julia> @benchmark dlegendre!(Val(:full), $dPm, 0.4, $Pm)
BenchmarkTools.Trial: 10000 samples with 285 evaluations per sample.
 Range (min … max):  282.600 ns … 514.326 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     283.042 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   284.933 ns ±   7.142 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

  283 ns        Histogram: log(frequency) by time        316 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.
julia> @benchmark dlegendre!(Val(:full), $dP, 0.4, $P)
BenchmarkTools.Trial: 10000 samples with 214 evaluations per sample.
 Range (min … max):  349.103 ns … 748.051 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     351.827 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   356.279 ns ±  13.982 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

  349 ns        Histogram: log(frequency) by time        408 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.
I am wondering if there is anything I can do to avoid this performance degradation on small matrices so that I can use this new storage type by default.
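To narrow it down, a micro-benchmark along these lines (just a sketch, using the definitions above) should show whether the per-access index arithmetic is the bottleneck: it sums the stored elements once through the 2-D indexing path and once directly over the linear buffer.

using BenchmarkTools

# Sum through getindex, so every access pays for _axes_to_index.
function sum_2d(L::LowerTriangularStorage)
    s = zero(eltype(L))
    @inbounds for j in 1:L.n, i in j:L.n
        s += L[i, j]
    end
    return s
end

# Sum straight over the packed buffer, with no index computation.
sum_linear(L::LowerTriangularStorage) = sum(L.data)

P = LowerTriangularStorage{RowMajor}(20)
P.data .= 1.0

@btime sum_2d($P)
@btime sum_linear($P)

If sum_2d is much slower than sum_linear, the multiply-and-divide in _axes_to_index is the likely source of the small-matrix overhead, and iterating linearly over the buffer in the hot loops might recover the dense-matrix performance.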