Accelerating linear algebra code and getting different results

Hello! I have been trying to accelerate this function called overlap by using broadcasting and dot syntax instead of unpacking matrix elements with for loops. However, I get different results when I do that. I would appreciate help if someone has an alternative way of accelerating it, or any clue about where my math is going wrong:

1. Auxiliary functions

** overlap_2() is the accelerated function; it uses normalization_2(). distance() and doublefactorial() are the same in both cases.


abstract type AbstractBasisSet end

struct GaussianBasisSet <: AbstractBasisSet
    R
    Ī±::Matrix{Float64}
    d::Matrix{Float64}
    ā„“::Int
    m::Int
    n::Int
end

function doublefactorial(number)
    fact = foldl(Base.:*, range(number, 1, step=-2))

    return fact
end

function normalization_2(Ī±, ā„“, m, n)
    N = (4 .* Ī±).^(ā„“ + m + n)
    N /=
        doublefactorial(2 * ā„“ - 1) * doublefactorial(2 * m - 1) * doublefactorial(2 * n - 1)
    N .*= ((2 .* Ī±) ./ Ļ€).^(3 / 2)
    N = sqrt.(N)

    return N
end

function distance(Rįµ¢, Rā±¼)
    d = (Rįµ¢[1] - Rā±¼[1])^2 + (Rįµ¢[2] - Rā±¼[2])^2 + (Rįµ¢[3] - Rā±¼[3])^2

    return d
end

function normalization(Ī±, ā„“, m, n)
    N = (4 * Ī±)^(ā„“ + m + n)
    N /=
        doublefactorial(2 * ā„“ - 1) * doublefactorial(2 * m - 1) * doublefactorial(2 * n - 1)
    N *= ((2 * Ī±) / Ļ€)^(3 / 2)
    N = sqrt(N)

    return N
end

2. The overlap functions

function overlap(basis)
    n = length(basis)
    S = zeros(n, n)

    for i in 1:n, j in 1:n
        basisįµ¢ = basis[i]
        basisā±¼ = basis[j]
        
        Rįµ¢ = basisįµ¢.R
        Rā±¼ = basisā±¼.R

        dist = distance(Rįµ¢, Rā±¼)

        m = length(basisįµ¢.Ī±)
        p = length(basisā±¼.Ī±)

        for k in 1:m, l in 1:p
            Ī±įµ¢ = basisįµ¢.Ī±[k]
            Ī±ā±¼ = basisā±¼.Ī±[l]

            dįµ¢ = basisįµ¢.d[k]
            dā±¼ = basisā±¼.d[l]

            ā„“įµ¢, mįµ¢, nįµ¢ = basisįµ¢.ā„“, basisįµ¢.m, basisįµ¢.n
            ā„“ā±¼, mā±¼, nā±¼ = basisā±¼.ā„“, basisā±¼.m, basisā±¼.n
            
            a = (
                exp(-Ī±įµ¢ * Ī±ā±¼ * dist / (Ī±įµ¢ + Ī±ā±¼)) *
                normalization(Ī±įµ¢, ā„“įµ¢, mįµ¢, nįµ¢) *
                normalization(Ī±ā±¼, ā„“ā±¼, mā±¼, nā±¼) *
                dįµ¢ *
                dā±¼ )

            println("$i, $j")

            S[i, j] += (
                exp(-Ī±įµ¢ * Ī±ā±¼ * dist / (Ī±įµ¢ + Ī±ā±¼)) *
                normalization(Ī±įµ¢, ā„“įµ¢, mįµ¢, nįµ¢) *
                normalization(Ī±ā±¼, ā„“ā±¼, mā±¼, nā±¼) *
                dįµ¢ *
                dā±¼
            )
        end
    end

    return S
end

function overlap_2(basis)
    n = length(basis)
    S = zeros(n, n)

    for i in 1:n, j in 1:n
        basisįµ¢ = basis[i]
        basisā±¼ = basis[j]
        
        Rįµ¢ = basisįµ¢.R
        Rā±¼ = basisā±¼.R

        dist = distance(Rįµ¢, Rā±¼)

        Ī±įµ¢ = basisįµ¢.Ī±
        Ī±ā±¼ = basisā±¼.Ī±

        println("$i, $j")

        dįµ¢ = basisįµ¢.d
        dā±¼ = basisā±¼.d

        ā„“įµ¢, mįµ¢, nįµ¢ = basisįµ¢.ā„“, basisįµ¢.m, basisįµ¢.n
        ā„“ā±¼, mā±¼, nā±¼ = basisā±¼.ā„“, basisā±¼.m, basisā±¼.n

        a = exp.(-Ī±įµ¢ .* Ī±ā±¼ .* dist ./ (Ī±įµ¢ .+ Ī±ā±¼)) .*
        normalization_2(Ī±įµ¢, ā„“įµ¢, mįµ¢, nįµ¢) .*
        normalization_2(Ī±ā±¼, ā„“ā±¼, mā±¼, nā±¼) .*
        dįµ¢ .* dā±¼

        println(a)

        S[i, j] = sum(exp.(-Ī±įµ¢ .* Ī±ā±¼ .* dist ./ (Ī±įµ¢ .+ Ī±ā±¼)) .* normalization_2(Ī±įµ¢, ā„“įµ¢, mįµ¢, nįµ¢) .* normalization_2(Ī±ā±¼, ā„“ā±¼, mā±¼, nā±¼) .* dįµ¢ .* dā±¼)
    end

    return S
end

Inputs

*** If you want to reproduce the calculations:

basis = GaussianBasisSet[
    GaussianBasisSet([0.0 0.0 0.85], [3.425250914 0.6239137298 0.168855404], [0.1543289673 0.5353281423 0.4446345422], 0, 0, 0),
    GaussianBasisSet([0.0 0.0 -0.85], [207.015607 37.70815124 10.20529731], [0.1543289673 0.5353281423 0.4446345422], 0, 0, 0),
    GaussianBasisSet([0.0 0.0 -0.85], [8.24631512 1.916266291 0.6232292721], [-0.09996722919 0.3995128261 0.7001154689], 0, 0, 0),
    GaussianBasisSet([0.0 0.0 -0.85], [8.24631512 1.916266291 0.6232292721], [0.155916275 0.6076837186 0.3919573931], 1, 0, 0),
    GaussianBasisSet([0.0 0.0 -0.85], [8.24631512 1.916266291 0.6232292721], [0.155916275 0.6076837186 0.3919573931], 0, 1, 0),
    GaussianBasisSet([0.0 0.0 -0.85], [8.24631512 1.916266291 0.6232292721], [0.155916275 0.6076837186 0.3919573931], 0, 0, 1)
]
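For reference, the mismatch can be seen directly by comparing the two versions on this basis (the names S and Sā‚‚ are just for illustration):

S  = overlap(basis)
Sā‚‚ = overlap_2(basis)
S ā‰ˆ Sā‚‚   # the two should agree, but with the code above they do not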

Hi,

The thing is, in your original version you have the double for loop over k and l, so you combine each Ī±[k] with each Ī±[l]. Is that what you are trying to do, or do you only want elementwise operations?

The way you define Ī±, it looks like a vector, but you store it as a matrix. Is that intentional, i.e. a 1ƗN row?

julia> basis[1].Ī±
1Ɨ3 Matrix{Float64}:
 3.42525  0.623914  0.168855
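Broadcasting such a 1ƗN row against the adjoint of another 1ƗN row produces an NƗN matrix of all pairwise combinations, which is exactly what your double loop over k and l computes. A minimal sketch with made-up numbers:

Ī±įµ¢ = [1.0 2.0 3.0]        # 1Ɨ3 row, like basis[i].Ī±
Ī±ā±¼ = [10.0 20.0 30.0]     # 1Ɨ3 row, like basis[j].Ī±

Ī±įµ¢ .* Ī±ā±¼     # 1Ɨ3: only combines Ī±įµ¢[k] with Ī±ā±¼[k]
Ī±įµ¢ .* Ī±ā±¼'    # 3Ɨ3: combines every Ī±įµ¢[k] with every Ī±ā±¼[l]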

But if we broadcast against the transpose of the second factor, we can reproduce the first result. Changing the code as follows fixes it:

abstract type AbstractBasisSet end

struct GaussianBasisSet <: AbstractBasisSet
    R
    Ī±::Matrix{Float64}
    d::Matrix{Float64}
    ā„“::Int
    m::Int
    n::Int
end

function doublefactorial(number)
    fact = foldl(Base.:*, range(number, 1, step=-2))

    return fact
end

function distance(Rįµ¢, Rā±¼)
    d = (Rįµ¢[1] - Rā±¼[1])^2 + (Rįµ¢[2] - Rā±¼[2])^2 + (Rįµ¢[3] - Rā±¼[3])^2

    return d
end

function normalization(Ī±, ā„“, m, n)
    N = (4 * Ī±)^(ā„“ + m + n)
    N /=
        doublefactorial(2 * ā„“ - 1) * doublefactorial(2 * m - 1) * doublefactorial(2 * n - 1)
    N *= ((2 * Ī±) / Ļ€)^(3 / 2)
    N = sqrt(N)

    return N
end



function overlap(basis)
    n = length(basis)
    S = zeros(n, n)

    for i in 1:n, j in 1:n
        basisįµ¢ = basis[i]
        basisā±¼ = basis[j]
        
        Rįµ¢ = basisįµ¢.R
        Rā±¼ = basisā±¼.R

        dist = distance(Rįµ¢, Rā±¼)

        m = length(basisįµ¢.Ī±)
        p = length(basisā±¼.Ī±)

        for k in 1:m, l in 1:p
            Ī±įµ¢ = basisįµ¢.Ī±[k]
            Ī±ā±¼ = basisā±¼.Ī±[l]

            dįµ¢ = basisįµ¢.d[k]
            dā±¼ = basisā±¼.d[l]

            ā„“įµ¢, mįµ¢, nįµ¢ = basisįµ¢.ā„“, basisįµ¢.m, basisįµ¢.n
            ā„“ā±¼, mā±¼, nā±¼ = basisā±¼.ā„“, basisā±¼.m, basisā±¼.n
            
            a = (
                exp(-Ī±įµ¢ * Ī±ā±¼ * dist / (Ī±įµ¢ + Ī±ā±¼)) *
                normalization(Ī±įµ¢, ā„“įµ¢, mįµ¢, nįµ¢) *
                normalization(Ī±ā±¼, ā„“ā±¼, mā±¼, nā±¼) *
                dįµ¢ *
                dā±¼ )


            S[i, j] += (
                exp(-Ī±įµ¢ * Ī±ā±¼ * dist / (Ī±įµ¢ + Ī±ā±¼)) *
                normalization(Ī±įµ¢, ā„“įµ¢, mįµ¢, nįµ¢) *
                normalization(Ī±ā±¼, ā„“ā±¼, mā±¼, nā±¼) *
                dįµ¢ *
                dā±¼
            )
        end
    end

    return S
end

function overlap_2(basis)
    n = length(basis)
    S = zeros(n, n)

    for i in 1:n, j in 1:n
        basisįµ¢ = basis[i]
        basisā±¼ = basis[j]
        
        # note the transpose for each of the j!
        Rįµ¢ = basisįµ¢.R
        Rā±¼ = basisā±¼.R'

        dist = distance(Rįµ¢, Rā±¼)

        Ī±įµ¢ = basisįµ¢.Ī±
        Ī±ā±¼ = basisā±¼.Ī±'


        dįµ¢ = basisįµ¢.d
        dā±¼ = basisā±¼.d'

        ā„“įµ¢, mįµ¢, nįµ¢ = basisįµ¢.ā„“, basisįµ¢.m, basisįµ¢.n
        ā„“ā±¼, mā±¼, nā±¼ = basisā±¼.ā„“', basisā±¼.m', basisā±¼.n'

        # I used the same normalization but applied broadcasting
        a = exp.(-Ī±įµ¢ .* Ī±ā±¼ .* dist ./ (Ī±įµ¢ .+ Ī±ā±¼)) .*
        normalization.(Ī±įµ¢, ā„“įµ¢, mįµ¢, nįµ¢) .*
        normalization.(Ī±ā±¼, ā„“ā±¼, mā±¼, nā±¼) .*
        dįµ¢ .* dā±¼


        S[i, j] = sum(exp.(-Ī±įµ¢ .* Ī±ā±¼ .* dist ./ (Ī±įµ¢ .+ Ī±ā±¼)) .* normalization.(Ī±įµ¢, ā„“įµ¢, mįµ¢, nįµ¢) .* normalization.(Ī±ā±¼, ā„“ā±¼, mā±¼, nā±¼) .* dįµ¢ .* dā±¼)
    end

    return S
end

@show overlap(basis) ā‰ˆ overlap_2(basis)

Apart from that, for loops are perfectly fine in Julia; broadcasting usually doesn't make code faster, just more convenient. You can, however, use LoopVectorization.jl or Tullio.jl to get a significant speed-up from threading etc., as sketched below.
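For illustration, here is a toy Tullio.jl expression (just the syntax, not the overlap integral itself): indices that appear only on the right-hand side, k and l here, are summed over, and Tullio can thread and vectorize the generated loops.

using Tullio

A = rand(6, 3)
B = rand(6, 3)

# S[i, j] = Ī£ā‚– Ī£ā‚— A[i, k] * B[j, l]
@tullio S[i, j] := A[i, k] * B[j, l]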


Whatā€™s R here? Why is it not typed? That can be a major source of slowdown, depending on where it is used.

Edit: it should definitely be an SVector{3, Float64}. And then your distance function can be simply norm(x - y), or sum(abs2, x - y) if you want to keep returning the squared distance that the overlap exponent uses.
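A minimal sketch of that (dist2 is just an illustrative name):

using StaticArrays

# squared distance between two static 3-vectors, equivalent to the original distance();
# subtracting two SVectors does not heap-allocate
dist2(x::SVector{3,Float64}, y::SVector{3,Float64}) = sum(abs2, x - y)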

Edit: just by defining:

struct GaussianBasisSet2 <: AbstractBasisSet
    R::SVector{3,Float64}
    Ī±::Matrix{Float64}
    d::Matrix{Float64}
    ā„“::Int
    m::Int
    n::Int
end

You get from:

julia> @btime overlap($b)
  149.097 Ī¼s (7165 allocations: 112.30 KiB)

to:

julia> @btime overlap($b2)
  67.895 Ī¼s (1 allocation: 368 bytes)

Then you don't seem to be using the variable a for anything (probably a leftover from some debugging). Removing it, one gets:

julia> @btime overlap($b2)
  38.548 Ī¼s (1 allocation: 368 bytes)

Then you can parallelize it by iterating over the Cartesian indices of S, and get (here with 4 cores):

julia> @btime overlap($b2)
  12.297 Ī¼s (42 allocations: 4.59 KiB)

This is the code:

using BenchmarkTools
using Base.Threads
using StaticArrays
using LinearAlgebra: norm

abstract type AbstractBasisSet end

struct GaussianBasisSet2 <: AbstractBasisSet
    R::SVector{3,Float64}
    Ī±::Matrix{Float64}
    d::Matrix{Float64}
    ā„“::Int
    m::Int
    n::Int
end

function doublefactorial(number)
    fact = foldl(Base.:*, range(number, 1, step=-2))
    return fact
end

function normalization(Ī±, ā„“, m, n)
    N = (4 * Ī±)^(ā„“ + m + n)
    N /= doublefactorial(2 * ā„“ - 1) * doublefactorial(2 * m - 1) * doublefactorial(2 * n - 1)
    N *= ((2 * Ī±) / Ļ€)^(3 / 2)
    N = sqrt(N)
    return N
end

function overlap(basis)
    n = length(basis)
    S = zeros(n, n)

    @threads for c in CartesianIndices(S)
        i, j = c[1], c[2]
        basisįµ¢ = basis[i]
        basisā±¼ = basis[j]
        
        Rįµ¢ = basisįµ¢.R
        Rā±¼ = basisā±¼.R

        dist = sum(abs2, Rįµ¢ - Rā±¼)  # squared distance, matching the original distance()

        m = length(basisįµ¢.Ī±)
        p = length(basisā±¼.Ī±)

        s_aux = zero(eltype(S))
        for k in 1:m, l in 1:p
            Ī±įµ¢ = basisįµ¢.Ī±[k]
            Ī±ā±¼ = basisā±¼.Ī±[l]

            dįµ¢ = basisįµ¢.d[k]
            dā±¼ = basisā±¼.d[l]

            ā„“įµ¢, mįµ¢, nįµ¢ = basisįµ¢.ā„“, basisįµ¢.m, basisįµ¢.n
            ā„“ā±¼, mā±¼, nā±¼ = basisā±¼.ā„“, basisā±¼.m, basisā±¼.n
        
            s_aux += (
                exp(-Ī±įµ¢ * Ī±ā±¼ * dist / (Ī±įµ¢ + Ī±ā±¼)) *
                normalization(Ī±įµ¢, ā„“įµ¢, mįµ¢, nįµ¢) *
                normalization(Ī±ā±¼, ā„“ā±¼, mā±¼, nā±¼) *
                dįµ¢ *
                dā±¼
            )
        end
        S[i, j] += s_aux
    end

    return S
end

b2 = GaussianBasisSet2[
    GaussianBasisSet2([0.0 0.0 0.85], [3.425250914 0.6239137298 0.168855404], [0.1543289673 0.5353281423 0.4446345422], 0, 0, 0), 
    GaussianBasisSet2([0.0 0.0 -0.85], [207.015607 37.70815124 10.20529731], [0.1543289673 0.5353281423 0.4446345422], 0, 0, 0), 
    GaussianBasisSet2([0.0 0.0 -0.85], [8.24631512 1.916266291 0.6232292721], [-0.09996722919 0.3995128261 0.7001154689], 0, 0, 0), 
    GaussianBasisSet2([0.0 0.0 -0.85], [8.24631512 1.916266291 0.6232292721], [0.155916275 0.6076837186 0.3919573931], 1, 0, 0), 
    GaussianBasisSet2([0.0 0.0 -0.85], [8.24631512 1.916266291 0.6232292721], [0.155916275 0.6076837186 0.3919573931], 0, 1, 0), 
    GaussianBasisSet2([0.0 0.0 -0.85], [8.24631512 1.916266291 0.6232292721], [0.155916275 0.6076837186 0.3919573931], 0, 0, 1)
]

overlap(b2)

@btime overlap($b2)

You can also use Polyester.jl's @batch instead of @threads there to get fewer allocations and, I think, a small additional speedup.
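A toy sketch of the drop-in swap (double! is just an illustration, not part of the overlap code):

using Polyester: @batch

# same usage pattern as Threads.@threads, but with cheaper, allocation-light tasks
function double!(y, x)
    @batch for i in eachindex(x)
        y[i] = 2 * x[i]
    end
    return y
end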


I think you can get some additional speedup by following @roflmaostc's advice and using static vectors for the parameters of the basis as well. For that, you can use:

struct GaussianBasisSet3{N} <: AbstractBasisSet
    R::SVector{3,Float64}
    Ī±::SVector{N,Float64}
    d::SVector{N,Float64}
    ā„“::Int
    m::Int
    n::Int
end
function GaussianBasisSet3(R::T,Ī±::T,d::T,ā„“,m,n) where {T<:Vector{Float64}}
    N = length(Ī±)  # number of primitives; R is always a 3-vector
    return GaussianBasisSet3(
        SVector{3, Float64}(R),
        SVector{N, Float64}(Ī±),
        SVector{N, Float64}(d),
        ā„“,m,n
    )
end

and initialize the basis with:

b3 = [
    GaussianBasisSet3(
        [0.0, 0.0, 0.85],
        [3.425250914, 0.6239137298, 0.168855404],
        [0.1543289673, 0.5353281423, 0.4446345422], 0, 0, 0
    ),
    GaussianBasisSet3(
        [0.0, 0.0, -0.85],
        [207.015607, 37.70815124, 10.20529731],
        [0.1543289673, 0.5353281423, 0.4446345422], 0, 0, 0
    ),
    GaussianBasisSet3(
        [0.0, 0.0, -0.85],
        [8.24631512, 1.916266291, 0.6232292721],
        [-0.09996722919, 0.3995128261, 0.7001154689], 0, 0, 0
    ),
    GaussianBasisSet3(
        [0.0, 0.0, -0.85],
        [8.24631512, 1.916266291, 0.6232292721],
        [0.155916275, 0.6076837186, 0.3919573931], 1, 0, 0
    ),
    GaussianBasisSet3(
        [0.0, 0.0, -0.85],
        [8.24631512, 1.916266291, 0.6232292721],
        [0.155916275, 0.6076837186, 0.3919573931], 0, 1, 0
    ),
    GaussianBasisSet3(
        [0.0, 0.0, -0.85],
        [8.24631512, 1.916266291, 0.6232292721],
        [0.155916275, 0.6076837186, 0.3919573931], 0, 0, 1
    )
]

with which I get:

julia> @btime overlap($b3)
  9.779 Ī¼s (1 allocation: 368 bytes)

But if each Gaussian basis is a different size, you are probably better off using plain Vector{Float64} for the parameters, because otherwise b3 will be an array of different concrete types.

And with some rearrangement of the operations, to avoid recomputing things inside the inner loop, you can get:

julia> @btime overlap($b2)
  7.437 Ī¼s (2 allocations: 416 bytes)

Code:

using Polyester: @batch  # @batch comes from Polyester.jl; other definitions as above

function overlap(basis)
    n = length(basis)
    S = zeros(n, n)
    @batch for c in CartesianIndices(S)
        i, j = c[1], c[2]
        basisįµ¢ = basis[i]
        basisā±¼ = basis[j]
        Rįµ¢ = basisįµ¢.R
        Rā±¼ = basisā±¼.R
        dist = sum(abs2, Rįµ¢ - Rā±¼)  # squared distance, matching the original distance()
        m = length(basisįµ¢.Ī±)
        p = length(basisā±¼.Ī±)
        ā„“įµ¢, mįµ¢, nįµ¢ = basisįµ¢.ā„“, basisįµ¢.m, basisįµ¢.n
        ā„“ā±¼, mā±¼, nā±¼ = basisā±¼.ā„“, basisā±¼.m, basisā±¼.n
        s_aux = zero(eltype(S))
        for k in 1:m
            Ī±įµ¢ = basisįµ¢.Ī±[k]
            dįµ¢ = basisįµ¢.d[k]
            norm_i = normalization(Ī±įµ¢, ā„“įµ¢, mįµ¢, nįµ¢)
            for l in 1:p
                Ī±ā±¼ = basisā±¼.Ī±[l]
                dā±¼ = basisā±¼.d[l]
                s_aux += (
                    exp(-Ī±įµ¢ * Ī±ā±¼ * dist / (Ī±įµ¢ + Ī±ā±¼)) *
                    norm_i * normalization(Ī±ā±¼, ā„“ā±¼, mā±¼, nā±¼) *
                    dįµ¢ * dā±¼
                )
            end
        end
        S[i, j] += s_aux
    end
    return S
end

Thank you so much! I appreciate your suggestions. The question I have is about your point that I should change my distance function to norm(x - y). I have compared the cost, and my distance function is much faster than norm(x - y):

julia> @benchmark norm(a-b)
BenchmarkTools.Trial: 10000 samples with 923 evaluations.
 Range (min ā€¦ max):  111.051 ns ā€¦  37.788 Ī¼s  ā”Š GC (min ā€¦ max): 0.00% ā€¦ 99.58%
 Time  (median):     125.948 ns               ā”Š GC (median):    0.00%
 Time  (mean Ā± Ļƒ):   132.991 ns Ā± 432.356 ns  ā”Š GC (mean Ā± Ļƒ):  4.41% Ā±  1.41%

         ā–‚ā–‚ā–‚ā–ƒā–„ā–…ā–‡ā–ˆā–‡ā–‡ā–†ā–„ā–‚ā–‚
  ā–ā–‚ā–„ā–„ā–†ā–†ā–‡ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–‡ā–†ā–„ā–„ā–ƒā–ƒā–‚ā–‚ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā–ā– ā–ƒ
  111 ns           Histogram: frequency by time          173 ns <

 Memory estimate: 96 bytes, allocs estimate: 2.

julia> @benchmark distance(a,b)
BenchmarkTools.Trial: 10000 samples with 992 evaluations.
 Range (min ā€¦ max):  40.028 ns ā€¦ 137.475 ns  ā”Š GC (min ā€¦ max): 0.00% ā€¦ 0.00%
 Time  (median):     40.197 ns               ā”Š GC (median):    0.00%
 Time  (mean Ā± Ļƒ):   40.906 ns Ā±   2.987 ns  ā”Š GC (mean Ā± Ļƒ):  0.00% Ā± 0.00%

  ā–…ā–ˆā–…ā–‚             ā–„ā–†ā–‡ā–ƒā–                                       ā–‚
  ā–ˆā–ˆā–ˆā–ˆā–ˆā–ƒā–ā–ā–ƒā–ā–„ā–ƒā–ā–ā–ā–ā–…ā–ˆā–ˆā–ˆā–ˆā–ˆā–…ā–†ā–†ā–…ā–ƒā–†ā–…ā–†ā–…ā–„ā–†ā–‡ā–†ā–‡ā–ˆā–ˆā–ˆā–ˆā–ˆā–†ā–‡ā–†ā–ā–ā–ā–ƒā–„ā–ā–ā–„ā–ƒā–ƒā–ā–ā–ƒā–‡ā–‡ā–„ ā–ˆ
  40 ns         Histogram: log(frequency) by time      44.3 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

Am I doing something wrong in my benchmarking?

Youā€™re probably not using static arrays there, and thus allocating the intermediate vector of the difference.

But norm won't be faster anyway; it is just a matter of style.

Anyway, the proper way to benchmark that is probably something like this:

julia> d(x,y) = sqrt((x[1]-y[1])^2 + (x[2]-y[2])^2 + (x[3]-y[3])^2)
d (generic function with 1 method)

julia> @btime d(x,y) setup=(x=rand(SVector{3}); y = rand(SVector{3})) evals=1
  33.000 ns (0 allocations: 0 bytes)
0.47363599375524723

julia> d2(x,y) = norm(x - y)
d2 (generic function with 1 method)

julia> @btime d2(x,y) setup=(x=rand(SVector{3}); y = rand(SVector{3})) evals=1
  37.000 ns (0 allocations: 0 bytes)
0.3505746671334738