Accelerating linear algebra code and getting different results

Leticia-maria · November 23, 2023, 4:07pm

Hello, so, I have been trying accelerating this function called overlap by using broadcasting and dot notation instead of unpacking matrices elements using for loops. However, I am getting different results when I do that. I would appreciate help if someone has an alternative for this acceleration or has any clue where my math is going wrong:

1. Auxiliary functions

** overlap_2() is the accelerated function; it uses normalization_2(). distance() and doublefactorial() are the same in both cases.


abstract type AbstractBasisSet end

struct GaussianBasisSet <: AbstractBasisSet
    R
    α::Matrix{Float64}
    d::Matrix{Float64}
    ℓ::Int
    m::Int
    n::Int
end

function doublefactorial(number)
    fact = foldl(Base.:*, range(number, 1, step=-2))

    return fact
end

function normalization_2(α, ℓ, m, n)
    N = (4 .* α).^(ℓ + m + n)
    N /=
        doublefactorial(2 * ℓ - 1) * doublefactorial(2 * m - 1) * doublefactorial(2 * n - 1)
    N .*= ((2 .* α) ./ π).^(3 / 2)
    N = sqrt.(N)

    return N
end

function distance(Rᵢ, Rⱼ)
    d = (Rᵢ[1] - Rⱼ[1])^2 + (Rᵢ[2] - Rⱼ[2])^2 + (Rᵢ[3] - Rⱼ[3])^2

    return d
end

function normalization(α, ℓ, m, n)
    N = (4 * α)^(ℓ + m + n)
    N /=
        doublefactorial(2 * ℓ - 1) * doublefactorial(2 * m - 1) * doublefactorial(2 * n - 1)
    N *= ((2 * α) / π)^(3 / 2)
    N = sqrt(N)

    return N
end

2. The overlap functions

function overlap(basis)
    n = length(basis)
    S = zeros(n, n)

    for i in 1:n, j in 1:n
        basisᵢ = basis[i]
        basisⱼ = basis[j]
        
        Rᵢ = basisᵢ.R
        Rⱼ = basisⱼ.R

        dist = distance(Rᵢ, Rⱼ)

        m = length(basisᵢ.α)
        p = length(basisⱼ.α)

        for k in 1:m, l in 1:p
            αᵢ = basisᵢ.α[k]
            αⱼ = basisⱼ.α[l]

            dᵢ = basisᵢ.d[k]
            dⱼ = basisⱼ.d[l]

            ℓᵢ, mᵢ, nᵢ = basisᵢ.ℓ, basisᵢ.m, basisᵢ.n
            ℓⱼ, mⱼ, nⱼ = basisⱼ.ℓ, basisⱼ.m, basisⱼ.n
            
            a = (
                exp(-αᵢ * αⱼ * dist / (αᵢ + αⱼ)) *
                normalization(αᵢ, ℓᵢ, mᵢ, nᵢ) *
                normalization(αⱼ, ℓⱼ, mⱼ, nⱼ) *
                dᵢ *
                dⱼ )

            println("$i, $j")

            S[i, j] += (
                exp(-αᵢ * αⱼ * dist / (αᵢ + αⱼ)) *
                normalization(αᵢ, ℓᵢ, mᵢ, nᵢ) *
                normalization(αⱼ, ℓⱼ, mⱼ, nⱼ) *
                dᵢ *
                dⱼ
            )
        end
    end

    return S
end

function overlap_2(basis)
    n = length(basis)
    S = zeros(n, n)

    for i in 1:n, j in 1:n
        basisᵢ = basis[i]
        basisⱼ = basis[j]
        
        Rᵢ = basisᵢ.R
        Rⱼ = basisⱼ.R

        dist = distance(Rᵢ, Rⱼ)

        αᵢ = basisᵢ.α
        αⱼ = basisⱼ.α

        println("$i, $j")

        dᵢ = basisᵢ.d
        dⱼ = basisⱼ.d

        ℓᵢ, mᵢ, nᵢ = basisᵢ.ℓ, basisᵢ.m, basisᵢ.n
        ℓⱼ, mⱼ, nⱼ = basisⱼ.ℓ, basisⱼ.m, basisⱼ.n

        a = exp.(-αᵢ .* αⱼ .* dist ./ (αᵢ .+ αⱼ)) .*
        normalization_2(αᵢ, ℓᵢ, mᵢ, nᵢ) .*
        normalization_2(αⱼ, ℓⱼ, mⱼ, nⱼ) .*
        dᵢ .* dⱼ

        println(a)

        S[i, j] = sum(exp.(-αᵢ .* αⱼ .* dist ./ (αᵢ .+ αⱼ)) .* normalization_2(αᵢ, ℓᵢ, mᵢ, nᵢ) .* normalization_2(αⱼ, ℓⱼ, mⱼ, nⱼ) .* dᵢ .* dⱼ)
    end

    return S
end

Inputs

*** if you want to reproduce the calculations:

GaussianBasisSet[GaussianBasisSet([0.0 0.0 0.85], [3.425250914 0.6239137298 0.168855404], [0.1543289673 0.5353281423 0.4446345422], 0, 0, 0), GaussianBasisSet([0.0 0.0 -0.85], [207.015607 37.70815124 10.20529731], [0.1543289673 0.5353281423 0.4446345422], 0, 0, 0), GaussianBasisSet([0.0 0.0 -0.85], [8.24631512 1.916266291 0.6232292721], [-0.09996722919 0.3995128261 0.7001154689], 0, 0, 0), GaussianBasisSet([0.0 0.0 -0.85], [8.24631512 1.916266291 0.6232292721], [0.155916275 0.6076837186 0.3919573931], 1, 0, 0), GaussianBasisSet([0.0 0.0 -0.85], [8.24631512 1.916266291 0.6232292721], [0.155916275 0.6076837186 0.3919573931], 0, 1, 0), GaussianBasisSet([0.0 0.0 -0.85], [8.24631512 1.916266291 0.6232292721], [0.155916275 0.6076837186 0.3919573931], 0, 0, 1)]

roflmaostc · November 23, 2023, 4:33pm

Hi,

changing the code to, fixes it:

The thing is, before you have the double for loop over k and l. So you combine each α[k] with each α[l]. Is that what you are trying to do? Or do you want to do only elementwise operations?

The way you define α it looks like a vector, but you store them as a matrix. Is this intentional? As a 1xN vector?

julia> basis[1].α
1×3 Matrix{Float64}:
 3.42525  0.623914  0.168855

But if we broadcast it with the transpose the second time, we can reproduce the first result.

abstract type AbstractBasisSet end

struct GaussianBasisSet <: AbstractBasisSet
    R
    α::Matrix{Float64}
    d::Matrix{Float64}
    ℓ::Int
    m::Int
    n::Int
end

function doublefactorial(number)
    fact = foldl(Base.:*, range(number, 1, step=-2))

    return fact
end

function distance(Rᵢ, Rⱼ)
    d = (Rᵢ[1] - Rⱼ[1])^2 + (Rᵢ[2] - Rⱼ[2])^2 + (Rᵢ[3] - Rⱼ[3])^2

    return d
end

function normalization(α, ℓ, m, n)
    N = (4 * α)^(ℓ + m + n)
    N /=
        doublefactorial(2 * ℓ - 1) * doublefactorial(2 * m - 1) * doublefactorial(2 * n - 1)
    N *= ((2 * α) / π)^(3 / 2)
    N = sqrt(N)

    return N
end



function overlap(basis)
    n = length(basis)
    S = zeros(n, n)

    for i in 1:n, j in 1:n
        basisᵢ = basis[i]
        basisⱼ = basis[j]
        
        Rᵢ = basisᵢ.R
        Rⱼ = basisⱼ.R

        dist = distance(Rᵢ, Rⱼ)

        m = length(basisᵢ.α)
        p = length(basisⱼ.α)

        for k in 1:m, l in 1:p
            αᵢ = basisᵢ.α[k]
            αⱼ = basisⱼ.α[l]

            dᵢ = basisᵢ.d[k]
            dⱼ = basisⱼ.d[l]

            ℓᵢ, mᵢ, nᵢ = basisᵢ.ℓ, basisᵢ.m, basisᵢ.n
            ℓⱼ, mⱼ, nⱼ = basisⱼ.ℓ, basisⱼ.m, basisⱼ.n
            
            a = (
                exp(-αᵢ * αⱼ * dist / (αᵢ + αⱼ)) *
                normalization(αᵢ, ℓᵢ, mᵢ, nᵢ) *
                normalization(αⱼ, ℓⱼ, mⱼ, nⱼ) *
                dᵢ *
                dⱼ )


            S[i, j] += (
                exp(-αᵢ * αⱼ * dist / (αᵢ + αⱼ)) *
                normalization(αᵢ, ℓᵢ, mᵢ, nᵢ) *
                normalization(αⱼ, ℓⱼ, mⱼ, nⱼ) *
                dᵢ *
                dⱼ
            )
        end
    end

    return S
end

function overlap_2(basis)
    n = length(basis)
    S = zeros(n, n)

    for i in 1:n, j in 1:n
        basisᵢ = basis[i]
        basisⱼ = basis[j]
        
        # note the transpose for each of the j!
        Rᵢ = basisᵢ.R
        Rⱼ = basisⱼ.R'

        dist = distance(Rᵢ, Rⱼ)

        αᵢ = basisᵢ.α
        αⱼ = basisⱼ.α'


        dᵢ = basisᵢ.d
        dⱼ = basisⱼ.d'

        ℓᵢ, mᵢ, nᵢ = basisᵢ.ℓ, basisᵢ.m, basisᵢ.n
        ℓⱼ, mⱼ, nⱼ = basisⱼ.ℓ', basisⱼ.m', basisⱼ.n'

        # I used the same normalization but applied broadcasting
        a = exp.(-αᵢ .* αⱼ .* dist ./ (αᵢ .+ αⱼ)) .*
        normalization.(αᵢ, ℓᵢ, mᵢ, nᵢ) .*
        normalization.(αⱼ, ℓⱼ, mⱼ, nⱼ) .*
        dᵢ .* dⱼ


        S[i, j] = sum(exp.(-αᵢ .* αⱼ .* dist ./ (αᵢ .+ αⱼ)) .* normalization.(αᵢ, ℓᵢ, mᵢ, nᵢ) .* normalization.(αⱼ, ℓⱼ, mⱼ, nⱼ) .* dᵢ .* dⱼ)
    end

    return S
end

@show overlap(basis) ≈ overlap_2(basis)

Apart from that, for loops are perfectly fine in Julia and broadcasting usually doesn’t make it faster but more conventient. You can use LoopVectorization.jl or Tullio.jl to achieve significant speed-up because of threading, etc. though.

lmiq · November 24, 2023, 12:15am

What’s R here? Why is it not typed? That can be a major source of slowdown, depending on where it is used.

Edit: it should definitely be a SVector{3, Float64}. And then your distance function can be simply norm(x - y).

edit:

Just by defining:

struct GaussianBasisSet2 <: AbstractBasisSet
    R::SVector{3,Float64}
    α::Matrix{Float64}
    d::Matrix{Float64}
    ℓ::Int
    m::Int
    n::Int
end

You get from:

julia> @btime overlap($b)
  149.097 μs (7165 allocations: 112.30 KiB)

to:

julia> @btime overlap($b2)
  67.895 μs (1 allocation: 368 bytes)

Then you don’t seem to be using a for anything (probably remaining of some debugging). Removing that one gets:

julia> @btime overlap($b2)
  38.548 μs (1 allocation: 368 bytes)

Then you can parallelize that by iterating over the cartesian indices of S, and get (here with 4 cores):

julia> @btime overlap($b2)
  12.297 μs (42 allocations: 4.59 KiB)

This is the code:

using BenchmarkTools
using Base.Threads
using StaticArrays
using LinearAlgebra: norm

abstract type AbstractBasisSet end

struct GaussianBasisSet2 <: AbstractBasisSet
    R::SVector{3,Float64}
    α::Matrix{Float64}
    d::Matrix{Float64}
    ℓ::Int
    m::Int
    n::Int
end

function doublefactorial(number)
    fact = foldl(Base.:*, range(number, 1, step=-2))
    return fact
end

function normalization(α, ℓ, m, n)
    N = (4 * α)^(ℓ + m + n)
    N /= doublefactorial(2 * ℓ - 1) * doublefactorial(2 * m - 1) * doublefactorial(2 * n - 1)
    N *= ((2 * α) / π)^(3 / 2)
    N = sqrt(N)
    return N
end

function overlap(basis)
    n = length(basis)
    S = zeros(n, n)

    @threads for c in CartesianIndices(S)
        i, j = c[1], c[2]
        basisᵢ = basis[i]
        basisⱼ = basis[j]
        
        Rᵢ = basisᵢ.R
        Rⱼ = basisⱼ.R

        dist = norm(Rᵢ - Rⱼ)

        m = length(basisᵢ.α)
        p = length(basisⱼ.α)

        s_aux = zero(eltype(S))
        for k in 1:m, l in 1:p
            αᵢ = basisᵢ.α[k]
            αⱼ = basisⱼ.α[l]

            dᵢ = basisᵢ.d[k]
            dⱼ = basisⱼ.d[l]

            ℓᵢ, mᵢ, nᵢ = basisᵢ.ℓ, basisᵢ.m, basisᵢ.n
            ℓⱼ, mⱼ, nⱼ = basisⱼ.ℓ, basisⱼ.m, basisⱼ.n
        
            s_aux += (
                exp(-αᵢ * αⱼ * dist / (αᵢ + αⱼ)) *
                normalization(αᵢ, ℓᵢ, mᵢ, nᵢ) *
                normalization(αⱼ, ℓⱼ, mⱼ, nⱼ) *
                dᵢ *
                dⱼ
            )
        end
        S[i, j] += s_aux
    end

    return S
end

b2 = GaussianBasisSet2[
    GaussianBasisSet2([0.0 0.0 0.85], [3.425250914 0.6239137298 0.168855404], [0.1543289673 0.5353281423 0.4446345422], 0, 0, 0), 
    GaussianBasisSet2([0.0 0.0 -0.85], [207.015607 37.70815124 10.20529731], [0.1543289673 0.5353281423 0.4446345422], 0, 0, 0), 
    GaussianBasisSet2([0.0 0.0 -0.85], [8.24631512 1.916266291 0.6232292721], [-0.09996722919 0.3995128261 0.7001154689], 0, 0, 0), 
    GaussianBasisSet2([0.0 0.0 -0.85], [8.24631512 1.916266291 0.6232292721], [0.155916275 0.6076837186 0.3919573931], 1, 0, 0), 
    GaussianBasisSet2([0.0 0.0 -0.85], [8.24631512 1.916266291 0.6232292721], [0.155916275 0.6076837186 0.3919573931], 0, 1, 0), 
    GaussianBasisSet2([0.0 0.0 -0.85], [8.24631512 1.916266291 0.6232292721], [0.155916275 0.6076837186 0.3919573931], 0, 0, 1)
]

overlap(b2)

@btime overlap($b2)

You can also use Polyester: @batch instead of @threads there to get less allocations and, I think, a small additional speedup.

lmiq · November 24, 2023, 12:19am

I think you can get some additional speedup following @roflmaostc advice and using static vectors for the parameters of the basis. For that, you an use:

struct GaussianBasisSet3{N} <: AbstractBasisSet
    R::SVector{3,Float64}
    α::SVector{N,Float64}
    d::SVector{N,Float64}
    ℓ::Int
    m::Int
    n::Int
end
function GaussianBasisSet3(R::T,α::T,d::T,ℓ,m,n) where {T<:Vector{Float64}}
    N = length(R)
    return GaussianBasisSet3(
        SVector{N, Float64}(R),
        SVector{N, Float64}(α),
        SVector{N, Float64}(d),
        ℓ,m,n
    )
end

and initialize the basis with:

b3 = [
    GaussianBasisSet3(
        [0.0, 0.0, 0.85],
        [3.425250914, 0.6239137298, 0.168855404],
        [0.1543289673, 0.5353281423, 0.4446345422], 0, 0, 0
    ),
    GaussianBasisSet3(
        [0.0, 0.0, -0.85],
        [207.015607, 37.70815124, 10.20529731],
        [0.1543289673, 0.5353281423, 0.4446345422], 0, 0, 0
    ),
    GaussianBasisSet3(
        [0.0, 0.0, -0.85],
        [8.24631512, 1.916266291, 0.6232292721],
        [-0.09996722919, 0.3995128261, 0.7001154689], 0, 0, 0
    ),
    GaussianBasisSet3(
        [0.0, 0.0, -0.85],
        [8.24631512, 1.916266291, 0.6232292721],
        [0.155916275, 0.6076837186, 0.3919573931], 1, 0, 0
    ),
    GaussianBasisSet3(
        [0.0, 0.0, -0.85],
        [8.24631512, 1.916266291, 0.6232292721],
        [0.155916275, 0.6076837186, 0.3919573931], 0, 1, 0
    ),
    GaussianBasisSet3(
        [0.0, 0.0, -0.85],
        [8.24631512, 1.916266291, 0.6232292721],
        [0.155916275, 0.6076837186, 0.3919573931], 0, 0, 1
    )
]

with which I get:

julia> @btime overlap($b3)
  9.779 μs (1 allocation: 368 bytes)

But if each gaussian basis is of different size, probably you are better of using normal Vector{Float64} for the parameters, because otherwise the b3 will be an array of different types.

And with some rearragement of the operations to avoid computing things repeatedly inside the loops, you can get:

julia> @btime overlap($b2)
  7.437 μs (2 allocations: 416 bytes)

Code:

function overlap(basis)
    n = length(basis)
    S = zeros(n, n)
    @batch for c in CartesianIndices(S)
        i, j = c[1], c[2]
        basisᵢ = basis[i]
        basisⱼ = basis[j]
        Rᵢ = basisᵢ.R
        Rⱼ = basisⱼ.R
        dist = norm(Rᵢ - Rⱼ)
        m = length(basisᵢ.α)
        p = length(basisⱼ.α)
        ℓᵢ, mᵢ, nᵢ = basisᵢ.ℓ, basisᵢ.m, basisᵢ.n
        ℓⱼ, mⱼ, nⱼ = basisⱼ.ℓ, basisⱼ.m, basisⱼ.n
        s_aux = zero(eltype(S))
        for k in 1:m
            αᵢ = basisᵢ.α[k]
            dᵢ = basisᵢ.d[k]
            norm_i = normalization(αᵢ, ℓᵢ, mᵢ, nᵢ)
            for l in 1:p
                αⱼ = basisⱼ.α[l]
                dⱼ = basisⱼ.d[l]
                s_aux += (
                    exp(-αᵢ * αⱼ * dist / (αᵢ + αⱼ)) *
                    norm_i * normalization(αⱼ, ℓⱼ, mⱼ, nⱼ) *
                    dᵢ * dⱼ
                )
            end
        end
        S[i, j] += s_aux
    end
    return S
end

Leticia-maria · November 24, 2023, 3:54pm

Thank you so much! I appreciate your suggestions, the question I have is regarding to your point that I should change my distance function to norm(x -y). I have compared the cost, and my distance function is way faster than doing norm(x-y):

julia> @benchmark norm(a-b)
BenchmarkTools.Trial: 10000 samples with 923 evaluations.
 Range (min … max):  111.051 ns …  37.788 μs  ┊ GC (min … max): 0.00% … 99.58%
 Time  (median):     125.948 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   132.991 ns ± 432.356 ns  ┊ GC (mean ± σ):  4.41% ±  1.41%

         ▂▂▂▃▄▅▇█▇▇▆▄▂▂
  ▁▂▄▄▆▆▇███████████████▇▆▄▄▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▃
  111 ns           Histogram: frequency by time          173 ns <

 Memory estimate: 96 bytes, allocs estimate: 2.

julia> @benchmark distance()
distance(Rᵢ, Rⱼ) @ Main REPL[20]:1
julia> @benchmark distance(a,b)
BenchmarkTools.Trial: 10000 samples with 992 evaluations.
 Range (min … max):  40.028 ns … 137.475 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     40.197 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   40.906 ns ±   2.987 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▅█▅▂             ▄▆▇▃▁                                       ▂
  █████▃▁▁▃▁▄▃▁▁▁▁▅█████▅▆▆▅▃▆▅▆▅▄▆▇▆▇█████▆▇▆▁▁▁▃▄▁▁▄▃▃▁▁▃▇▇▄ █
  40 ns         Histogram: log(frequency) by time      44.3 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

Am I doing something wrong in my benchmarking?

lmiq · November 24, 2023, 3:58pm

You’re probably not using static arrays there, and thus allocating the intermediate vector of the difference.

But norm won’t be faster anyway, it is just about style.

Anyway, the proper way to benchmark that is probably something like this:

julia> d(x,y) = sqrt((x[1]-y[1])^2 + (x[2]-y[2])^2 + (x[3]-y[3])^2)
d (generic function with 1 method)

julia> @btime d(x,y) setup=(x=rand(SVector{3}); y = rand(SVector{3})) evals=1
  33.000 ns (0 allocations: 0 bytes)
0.47363599375524723

julia> d2(x,y) = norm(x - y)
d2 (generic function with 1 method)

julia> @btime d2(x,y) setup=(x=rand(SVector{3}); y = rand(SVector{3})) evals=1
  37.000 ns (0 allocations: 0 bytes)
0.3505746671334738

Topic		Replies	Views
Matrix multiplication Performance	6	538	November 17, 2020
Alternative permutation/reducing for loops Performance question , package , linearalgebra	7	328	September 7, 2023
Speed up applying a function to matrices Performance	8	620	August 13, 2021
Specialized matrix-matrix multiplication algorithm New to Julia question , performance , linearalgebra	5	418	July 9, 2024
Accelerating an N^2 nested loop Performance optimization	13	417	January 23, 2023

Accelerating linear algebra code and getting different results

1. Auxiliary functions

2. The overlap functions

Inputs

Related topics