Multigrid solver prototype (GMG) and simple Lid Cavity solver

LaurentPlagne · July 8, 2020, 3:10pm

… I still observe serious slowdown with CartesianIndexes in the 3D case:

function res_loop!(rlc::Array{Float64,3},rlf::Array{Float64,3})
    (nx,ny,nz) = size(rlc)
    @inbounds @simd for k=2:nz-1
        fk=2k-2
        for j=2:ny-1
            fj=2j-2
            for i=2:nx-1
                fi=2i-2
                rlc[i,j,k]=
                rlf[fi  ,fj  ,fk  ]+rlf[fi+1,fj  ,fk  ]+
                rlf[fi  ,fj+1,fk  ]+rlf[fi+1,fj+1,fk  ]+
                rlf[fi  ,fj  ,fk+1]+rlf[fi+1,fj  ,fk+1]+
                rlf[fi  ,fj+1,fk+1]+rlf[fi+1,fj+1,fk+1]               
            end
        end
    end
end

The following MWE returns:

Summary

using BenchmarkTools
using LinearAlgebra
using Random
using LoopVectorization
using Printf


@inline CI(a...) = CartesianIndex(a...)
@inline δ(a,d::Int) = CI(ntuple(i -> i==a ? 1 : 0, d))
@inline δ(a,I::CartesianIndex{N}) where {N} = δ(a,N)
@inline CR(a...) = CartesianIndices(a...)
@inline inside(M::NTuple{N,Int}) where {N} = CR(ntuple(i-> 2:M[i]-1,N))
@inline inside(a::Array; reverse::Bool=false) =
        reverse ? Iterators.reverse(inside(size(a))) : inside(size(a))
@inline inside_u(N::NTuple{n,T}) where {n,T} = CR(ntuple(i->2:N[i],n-1))
@inline near(I::CartesianIndex,a=0) = (2I-2oneunit(I)):(2I-oneunit(I)-δ(a,I))


function res_loop!(rlc::Array{Float64,2},rlf::Array{Float64,2})
    (nrows,ncols) = size(rlc)
    @inbounds @simd for j=2:ncols-1
        fj=2j-2
        for i=2:nrows-1
            fi=2i-2
             rlc[i,j]=(rlf[fi,fj]+rlf[fi+1,fj]+rlf[fi,fj+1]+rlf[fi+1,fj+1])
        end
    end
end

function res_loop!(rlc::Array{Float64,3},rlf::Array{Float64,3})
    (nx,ny,nz) = size(rlc)
    @inbounds @simd for k=2:nz-1
        fk=2k-2
        for j=2:ny-1
            fj=2j-2
            for i=2:nx-1
                fi=2i-2
                rlc[i,j,k]=
                rlf[fi  ,fj  ,fk  ]+rlf[fi+1,fj  ,fk  ]+
                rlf[fi  ,fj+1,fk  ]+rlf[fi+1,fj+1,fk  ]+
                rlf[fi  ,fj  ,fk+1]+rlf[fi+1,fj  ,fk+1]+
                rlf[fi  ,fj+1,fk+1]+rlf[fi+1,fj+1,fk+1]               
            end
        end
    end
end


function res_ci!(rlc,rlf)
    # @inside rlc[I] = sum(@inbounds(rlf[J]) for J ∈ near(I))
    @fastmath @inbounds @simd for I ∈ inside(rlc)
            rlc[I]=sum(@inbounds(rlf[J]) for J ∈ near(I))
    end
end


#function that call the benchmarks
function benchmark_res(rlf_size)
    # initialize two 2D arrays (fine and coarse)
    rlc_size =  rlf_size .>>> 1
    Random.seed!(1234);
    rlf=rand(Float64,(rlf_size))
    rlc=rand(Float64,(rlc_size))
    # the following blocks checks that res and res_ci produce same results
    rlc_ci=deepcopy(rlc)
    res_loop!(rlc,rlf)
    res_ci!(rlc_ci,rlf)
    # @show @code_native debuginfo=:none res!(rlc,rlf)
    # @show @code_native debuginfo=:none res_ci!(rlc,rlf)
    @assert norm(rlc-rlc_ci)≈0.0
    # #The actual timings
    tloop=@belapsed res_loop!($rlc,$rlf)
    tci=@belapsed res_ci!($rlc,$rlf)
    @printf("tloop=%.3e, tci=%.3e, slowdown=%.3f ,size=%s \n",tloop,tci,tci/tloop,size(rlf))
    # @show nf,ss(tloop),ss(tci),ss(tloop/tci)
    nothing
end


for n in [10,34,66,258]
    benchmark_res((n,n))
    benchmark_res((n,n,n))
end

returns

tloop=2.230e-08, tci=5.239e-08, slowdown=2.350 ,size=(10, 10) 
tloop=9.610e-08, tci=2.301e-07, slowdown=2.395 ,size=(10, 10, 10) 
tloop=1.687e-07, tci=1.039e-06, slowdown=6.161 ,size=(34, 34) 
tloop=5.703e-06, tci=2.601e-05, slowdown=4.561 ,size=(34, 34, 34) 
tloop=6.217e-07, tci=4.423e-06, slowdown=7.115 ,size=(66, 66) 
tloop=4.554e-05, tci=2.232e-04, slowdown=4.901 ,size=(66, 66, 66) 
tloop=1.174e-05, tci=7.279e-05, slowdown=6.202 ,size=(258, 258) 
tloop=7.787e-03, tci=1.898e-02, slowdown=2.438 ,size=(258, 258, 258)

weymouth · July 8, 2020, 3:17pm

No, my results don’t agree with yours on that. I copy-pasted your code and it’s slower than both Waterlily.restrict! and my res3D!(). I’m not using your MWE, but just running those single lines with @btime.

weymouth · July 8, 2020, 3:20pm

Wouldn’t that just make it less likely that the objects are close together in memory? I think using a multi-dimensional array which has been optimized by everyone else using them in Julia would be more robust.

I could switch over to using 1D array indexing. That is easy, but seems unlikely to be much faster. Maybe it’s worth trying though…

LaurentPlagne · July 8, 2020, 3:34pm

Well, I still don’t understand…

The res_ci! function I evaluate should be a perfect copy of Waterlily.restrict! and the timings are actually matching:

for your fine mesh size (66,66,66) case I obtain tci=223 us while you get 290 us (I guess you machine is a little slower).
But the 3D loop based res_loop! takes only 45 us which is 5x faster !

(apologize If I miss something obvious)

chakravala · July 8, 2020, 3:36pm

I don’t actually know if the allocation would be better one way or the other, but it’s the approach I want to work on and optimize for my purposes. I just like doing things differently haha

weymouth · July 8, 2020, 3:42pm

I dunno! I’m getting 310.999 μs (0 allocations: 0 bytes).

LaurentPlagne · July 8, 2020, 3:47pm

So this is consistent and much slower than res_loop!. Correct ?

weymouth · July 8, 2020, 3:58pm

That is my time for res_loop

LaurentPlagne · July 8, 2020, 4:54pm

OK, I get your point now: thank you for your patience !
Feel free to skip the following if you get bored by my questions.

Although the discrepancy worth an investigation.
My laptop is fine (i5-9300H CPU @ 2.40GHz × 8) but there should not be such a difference. I can reproduce your perfomances with CartesianIndexes but your timings for the loop based res_loop! are totally different…

julia> @btime res_ci!(a,b) setup=((a,b)=(zeros(34,34,34),rand(66,66,66)))
  327.769 μs (0 allocations: 0 bytes)
julia> @btime res_loop!(a,b) setup=((a,b)=(zeros(34,34,34),rand(66,66,66)))
  80.636 μs (0 allocations: 0 bytes)

And here is an extract of the @code_native output showing that AVX simd instructions are being used:

(a,b)=(zeros(34,34,34),rand(66,66,66)) 
@code_native debuginfo=:none res_loop!
...
        vmovupd (%r11,%rbp,2), %ymm0
        vmovupd 32(%r11,%rbp,2), %ymm1
        vunpcklpd       %ymm1, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
        vpermpd $216, %ymm2, %ymm2      # ymm2 = ymm2[0,2,1,3]
        vunpckhpd       %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
        vpermpd $216, %ymm0, %ymm0      # ymm0 = ymm0[0,2,1,3]
        vaddpd  %ymm0, %ymm2, %ymm0
        vmovupd (%rax,%rbp,2), %ymm1
        vmovupd 32(%rax,%rbp,2), %ymm2
        vunpcklpd       %ymm2, %ymm1, %ymm3 # ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
        vpermpd $216, %ymm3, %ymm3      # ymm3 = ymm3[0,2,1,3]
        vaddpd  %ymm3, %ymm0, %ymm0
        vunpckhpd       %ymm2, %ymm1, %ymm1 # ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
        vpermpd $216, %ymm1, %ymm1      # ymm1 = ymm1[0,2,1,3]
        vaddpd  %ymm1, %ymm0, %ymm0
        vmovupd (%r8,%rbp,2), %ymm1
        vmovupd 32(%r8,%rbp,2), %ymm2
        vunpcklpd       %ymm2, %ymm1, %ymm3 # ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
        vpermpd $216, %ymm3, %ymm3      # ymm3 = ymm3[0,2,1,3]
        vaddpd  %ymm3, %ymm0, %ymm0
        vunpckhpd       %ymm2, %ymm1, %ymm1 # ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
        vpermpd $216, %ymm1, %ymm1      # ymm1 = ymm1[0,2,1,3]
        vaddpd  %ymm1, %ymm0, %ymm0
        vmovupd (%rdx,%rbp,2), %ymm1
        vmovupd 32(%rdx,%rbp,2), %ymm2
        vunpcklpd       %ymm2, %ymm1, %ymm3 # ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
        vpermpd $216, %ymm3, %ymm3      # ymm3 = ymm3[0,2,1,3]
        vaddpd  %ymm3, %ymm0, %ymm0
        vunpckhpd       %ymm2, %ymm1, %ymm1 # ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
        vpermpd $216, %ymm1, %ymm1      # ymm1 = ymm1[0,2,1,3]
        vaddpd  %ymm1, %ymm0, %ymm0
        vmovupd %ymm0, (%r15,%rbp)
        addq    $32, %rbp
        addq    $-4, %rdi
        jne     L1104
        cmpq    %rbx, -128(%rsp)
        je      L640
        nopl    (%rax,%rax)

Note that I have no vector instructions for res_ci!

I don’t know what is happening…

do you have avx instructions on your computer ?
What optimization level do you use with Julia ?

weymouth · July 8, 2020, 5:41pm

I didn’t even know there WAS optimization with Julia. This is my third week.
I don’t have any more time to look at this today, but I’ll check it out on a different computer tomorrow.

LaurentPlagne · July 8, 2020, 5:49pm

Thanks and no hurry. At this point I really would like to use Cartesian indexes as you do because it is much more elegant and much less error prone (in addition to allow for any dimensions). But I simply do not the level of performance of the loop based implementation. Maybe @tim.holy or @Elrod could help on this topic.

tim.holy · July 8, 2020, 5:59pm

See https://github.com/JuliaLang/julia/issues/9080. I’ve never seen factors anywhere near as large as what you’re showing, though, can you test the specific benchmarks in that issue?

While (for me) not a huge effect, it’s sufficiently annoying that I recently submitted a PR (https://github.com/JuliaLang/julia/pull/35074) that would end it once and for all. Though to be fair, I wouldn’t have done that except as a dry run for eventually integrating LoopVectorization into Julia’s compiler. Currently the decision in a triage call is to not merge that PR but tackle it by other compiler optimizations. I don’t disagree, but I don’t know enough about LLVM to fix it myself so we are still a bit stuck where we were.

LaurentPlagne · July 8, 2020, 7:11pm

Thank you for chiming in !
I did run the 2D benchmark that you suggest and add implementations with explicit @simd.

function sumcart_manual(A::AbstractMatrix)
    s = 0.0
    @inbounds for j = 1:size(A,2), i = 1:size(A,1)
        s += A[i,j]
    end
    s
end

function sumcart_iter(A)
    s = 0.0
    @inbounds for I in CartesianIndices(size(A))
        s += A[I]
    end
    s
end

function sumcart_manual_simd(A::AbstractMatrix)
    s = 0.0
    @inbounds for j = 1:size(A,2)
        @simd for i = 1:size(A,1)
        s += A[i,j]
        end
    end
    s
end

function sumcart_iter_simd(A)
    s = 0.0
    @inbounds @simd for I in CartesianIndices(size(A))
        s += A[I]
    end
    s
end

A = rand(10^4,10^4);
sumcart_manual(A);
sumcart_iter(A);
sumcart_iter_simd(A);
sumcart_iter_simd(A);

@btime sumcart_manual($A)
@btime sumcart_iter($A)
@btime sumcart_manual_simd($A)
@btime sumcart_iter_simd($A)

I obtain not abstraction penalties for CartesianIndices (and strong speed-ups from @simd:

  105.508 ms (0 allocations: 0 bytes)
  117.573 ms (0 allocations: 0 bytes)
  35.990 ms (0 allocations: 0 bytes)
  36.004 ms (0 allocations: 0 bytes)
5.000183342010947e7

Nevertheless, the case I consider is much more complex than this one. The inside and near functions that computes indexes around a given CartesianIndex combined with the sum in a generator:

function res_ci!(rlc,rlf)
    @fastmath @inbounds @simd for I ∈ inside(rlc)
            rlc[I]=sum(@inbounds(rlf[J]) for J ∈ near(I))
    end
end

replacing an (ugly) explicit 3D loops:

function res_loop!(rlc::Array{Float64,3},rlf::Array{Float64,3})
   (nx,ny,nz) = size(rlc)
   @inbounds @simd for k=2:nz-1
       fk=2k-2
       for j=2:ny-1
           fj=2j-2
           for i=2:nx-1
               fi=2i-2
               rlc[i,j,k]=
               rlf[fi  ,fj  ,fk  ]+rlf[fi+1,fj  ,fk  ]+
               rlf[fi  ,fj+1,fk  ]+rlf[fi+1,fj+1,fk  ]+
               rlf[fi  ,fj  ,fk+1]+rlf[fi+1,fj  ,fk+1]+
               rlf[fi  ,fj+1,fk+1]+rlf[fi+1,fj+1,fk+1]               
           end
       end
   end
end

is likely more difficult to transform into a properly vectorized implementation

julia> @btime res_ci!(a,b) setup=((a,b)=(zeros(34,34,34),rand(66,66,66)))
  327.769 μs (0 allocations: 0 bytes)
julia> @btime res_loop!(a,b) setup=((a,b)=(zeros(34,34,34),rand(66,66,66)))
  80.636 μs (0 allocations: 0 bytes)

Elrod · July 8, 2020, 8:24pm

LoopVectorization does worse than @simd here because it currently only handles discontiguous loads/stores via gather/scatter instructions, which are much slower than vunpckpd and vpermpd. The latter only work with small strides, e.g. 2 like we have here, but the performance benefit is well worth it when they do work.

LoopVectorization and @simd’s performance could of course both be improved by vectorizing with a stride of 1 instead of 2. To do that, we’d need to change the data layout of rlf via something like:

function make_dim1_contig(rlf)
    reshaped = reshape(rlf, (2,size(rlf,1) >> 1, Base.tail(size(rlf))...))
    permutedims(reshaped, (2,1,ntuple(i -> i+2, ndims(rlf)-1)...))
end

so that what were odd and even elements of the first axis of rlf are now no longer interleaved, but contiguous. Indexing the new second axis chooses between odd or even.

Example

using BenchmarkTools
using LinearAlgebra
using Random
using Printf


@inline CI(a...) = CartesianIndex(a...)
@inline δ(a,d::Int) = CI(ntuple(i -> i==a ? 1 : 0, d))
@inline δ(a,I::CartesianIndex{N}) where {N} = δ(a,N)
@inline CR(a...) = CartesianIndices(a...)
@inline inside(M::NTuple{N,Int}) where {N} = CR(ntuple(i-> 2:M[i]-1,N))
@inline inside(a::Array; reverse::Bool=false) =
        reverse ? Iterators.reverse(inside(size(a))) : inside(size(a))
@inline inside_u(N::NTuple{n,T}) where {n,T} = CR(ntuple(i->2:N[i],n-1))
@inline near(I::CartesianIndex,a=0) = (2I-2oneunit(I)):(2I-oneunit(I)-δ(a,I))
function make_dim1_contig(rlf)
    reshaped = reshape(rlf, (2,size(rlf,1) >> 1, Base.tail(size(rlf))...))
    permutedims(reshaped, (2,1,ntuple(i -> i+2, ndims(rlf)-1)...))
end
function interleave_dims1and2(rlf)
    sz = size(rlf)
    @assert size(rlf,2) == 2
    sztt = Base.tail(Base.tail(sz))
    permuted = permutedims(rlf, (2,1,ntuple(i -> i+2, ndims(rlf)-2)...))
    reshape(permuted, (2size(rlf,1), sztt...))
end

function res_loop!(rlc::Array{Float64,2},rlf::Array{Float64,2})
    (nrows,ncols) = size(rlc)
    @inbounds @simd for j=2:ncols-1
        fj=2j-2
        for i=2:nrows-1
            fi=2i-2
             rlc[i,j]=(rlf[fi,fj]+rlf[fi+1,fj]+rlf[fi,fj+1]+rlf[fi+1,fj+1])
        end
    end
end

function res_loop!(rlc::Array{Float64,3},rlf::Array{Float64,3})
    (nx,ny,nz) = size(rlc)
    @inbounds @simd for k=2:nz-1
        fk=2k-2
        for j=2:ny-1
            fj=2j-2
            for i=2:nx-1
                fi=2i-2
                rlc[i,j,k]=
                rlf[fi  ,fj  ,fk  ]+rlf[fi+1,fj  ,fk  ]+
                rlf[fi  ,fj+1,fk  ]+rlf[fi+1,fj+1,fk  ]+
                rlf[fi  ,fj  ,fk+1]+rlf[fi+1,fj  ,fk+1]+
                rlf[fi  ,fj+1,fk+1]+rlf[fi+1,fj+1,fk+1]               
            end
        end
    end
end


function res_loop_fast!(rlc::Array{Float64,2},rlf::Array{Float64,3})
    (nrows,ncols) = size(rlc)
    @inbounds @simd for j=2:ncols-1
        fj=2j-2
        for i=2:nrows-1
             rlc[i,j]=(rlf[i-1,2,fj]+rlf[i,1,fj]+rlf[i-1,2,fj+1]+rlf[i,1,fj+1])
        end
    end
end

function res_loop_fast!(rlc::Array{Float64,3},rlf::Array{Float64,4})
    (nx,ny,nz) = size(rlc)
    @inbounds @simd for k=2:nz-1
        fk=2k-2
        for j=2:ny-1
            fj=2j-2
            for i=2:nx-1
                rlc[i,j,k]=
                rlf[i-1, 2 ,fj  ,fk  ]+rlf[i,1,fj  ,fk  ]+
                rlf[i-1, 2 ,fj+1,fk  ]+rlf[i,1,fj+1,fk  ]+
                rlf[i-1, 2 ,fj  ,fk+1]+rlf[i,1,fj  ,fk+1]+
                rlf[i-1, 2 ,fj+1,fk+1]+rlf[i,1,fj+1,fk+1]               
            end
        end
    end
end


#function that call the benchmarks
function benchmark_res(rlf_size)
    # initialize two 2D arrays (fine and coarse)
    rlc_size =  rlf_size .>>> 1
    Random.seed!(1234);
    rlf=rand(Float64,(rlf_size))
    rlc=rand(Float64,(rlc_size))
    # the following blocks checks that res and res_ci produce same results
    rlff = make_dim1_contig(rlf)
    rlc_fast = copy(rlc)
    res_loop!(rlc,rlf)
    res_loop_fast!(rlc_fast,rlff)
    @assert rlc ≈ rlc_fast
    # #The actual timings
    tloop=@belapsed res_loop!($rlc,$rlf)
    tci=@belapsed res_loop_fast!($rlc,$rlff)
    @printf("tloop=%.3e, tcontig=%.3e, slowdown=%.3f ,size=%s \n",tloop,tci,tci/tloop,size(rlf))
    # @show nf,ss(tloop),ss(tci),ss(tloop/tci)
    nothing
end


for n in [10,34,66,258]
    benchmark_res((n,n))
    benchmark_res((n,n,n))
end

However, this does not actually help:

tloop=2.127e-08, tcontig=2.952e-08, slowdown=1.388 ,size=(10, 10)
tloop=9.149e-08, tcontig=1.305e-07, slowdown=1.426 ,size=(10, 10, 10)
tloop=1.600e-07, tcontig=1.942e-07, slowdown=1.213 ,size=(34, 34)
tloop=5.939e-06, tcontig=9.311e-06, slowdown=1.568 ,size=(34, 34, 34)
tloop=5.305e-07, tcontig=8.079e-07, slowdown=1.523 ,size=(66, 66)
tloop=9.577e-05, tcontig=9.905e-05, slowdown=1.034 ,size=(66, 66, 66)
tloop=6.590e-06, tcontig=8.238e-06, slowdown=1.250 ,size=(258, 258)
tloop=9.952e-03, tcontig=1.004e-02, slowdown=1.009 ,size=(258, 258, 258)

The GFLOPS are very bad in all these benchmarks.
E.g.

julia> 7e-9 * ((258>>>1)-2)^3 / 9.952e-3
1.4407838625401928

julia> 7e-9 * ((34>>>1)-2)^3 / 5.939e-6
3.977942414547903

1.4 GFLOPS… I guess this is severly constrained by cache bandwidth. Note that those (faster) times were actually from the discontiguous benchmark.

The contiguous version also suffers from not breaking up the dependcy chain:

L1312:
        vmovupd zmm6, zmmword ptr [rbp + 8*rcx]
        vaddpd  zmm6, zmm6, zmmword ptr [r8 + 8*rcx]
        vaddpd  zmm6, zmm6, zmmword ptr [rdx + 8*rcx]
        vaddpd  zmm6, zmm6, zmmword ptr [r14 + 8*rcx]
        vaddpd  zmm6, zmm6, zmmword ptr [rsi + 8*rcx]
        vaddpd  zmm6, zmm6, zmmword ptr [rax + 8*rcx]
        vaddpd  zmm6, zmm6, zmmword ptr [rdi + 8*rcx]
        vaddpd  zmm6, zmm6, zmmword ptr [r11 + 8*rcx]
        vmovupd zmmword ptr [r12 + 8*rcx], zmm6
        add     rcx, 8
        cmp     rbx, rcx
        jne     L1312

But if the CPU were actually busy here, it would have

julia> 4.1 * 8 * 0.25
8.2

8.2 GFLOPS. That is 4.1 GHz * 8 ops/instruction * 0.25 instructions/cycle. The theoretical per-core peak of this CPU is:

julia> 4.1 * 16 * 2
131.2

16 ops/instruction (fma), and up to 2 fmas/cycle from splitting the dependency chain. It gets close to this when doing matrix multiplication.
Given that we aren’t even hitting the 8 GFLOPS, we’re obviously suffering from other bottlenecks as well.

This was the corresponding loop from the discontiguous version;

L992:
        vmovupd zmm0, zmmword ptr [rdi + 2*rsi]
        vmovupd zmm1, zmmword ptr [rdi + 2*rsi + 64]
        vmovapd zmm12, zmm0
        vpermt2pd       zmm12, zmm6, zmm1
        vpermt2pd       zmm0, zmm7, zmm1
        vaddpd  zmm0, zmm12, zmm0
        vmovupd zmm1, zmmword ptr [rax + 2*rsi]
        vmovupd zmm12, zmmword ptr [rax + 2*rsi + 64]
        vmovapd zmm13, zmm1
        vpermt2pd       zmm13, zmm6, zmm12
        vaddpd  zmm0, zmm0, zmm13
        vpermt2pd       zmm1, zmm7, zmm12
        vaddpd  zmm0, zmm0, zmm1
        vmovupd zmm1, zmmword ptr [rcx + 2*rsi]
        vmovupd zmm12, zmmword ptr [rcx + 2*rsi + 64]
        vmovapd zmm13, zmm1
        vpermt2pd       zmm13, zmm6, zmm12
        vaddpd  zmm0, zmm0, zmm13
        vpermt2pd       zmm1, zmm7, zmm12
        vaddpd  zmm0, zmm0, zmm1
        vmovupd zmm1, zmmword ptr [rbx + 2*rsi]
        vmovupd zmm12, zmmword ptr [rbx + 2*rsi + 64]
        vmovapd zmm13, zmm1
        vpermt2pd       zmm13, zmm6, zmm12
        vaddpd  zmm0, zmm0, zmm13
        vpermt2pd       zmm1, zmm7, zmm12
        vaddpd  zmm0, zmm0, zmm1
        vmovupd zmmword ptr [r12 + rsi], zmm0
        add     rsi, 64
        add     r10, -8
        jne     L992

That’s a lot messier, but given something else was the bottleneck, we need to address that before this matters.

LaurentPlagne · July 9, 2020, 7:24am

You address the relevant question of the absolute performance and optimization of the loop based implementation. Although, this question is the most relevant one, my initial concern was about the performance gap between the compact CartesianIndex based implementation and the loop based implementation.

As @tim.holy suggested, this gap does not appear for all CIs kernels and maybe related this specific function.

I should spend more time to investigate this but I must switch back to more immediate professional tasks…

Thank you all for the inputs

I still think that a generic and performant GMG Julia package will be of great interest and I hope to be able to contribute !

weymouth · July 9, 2020, 10:42am

It turns out Julia is mostly giving me long overdue lessons in computer science and compilers!

First thing I want to mention is that while restrict! is a very important routine for GMG, it isn’t actually a performance bottleneck. Profiling 100 calls to solve! on a 2D or 3D problem gives function call counts similar to

24859 solve!
18442 GS!
12650 Vcycle!
 9749 increment!
 6684 mult
...
 1426 prolongate
...
 1128 restrict

So you can see that almost all the time is taken up by the Gauss-Seidel smoother GS!, and in particular by the solution and residual increment! and row multiplication routine mult. Vcycle just calls other routines.

So maybe it would be more fruitful to focus the conversation on increment! or mult!?

LaurentPlagne · July 9, 2020, 1:06pm

You are right.
I chose the restrict because it was simpler. I will look into that.
Thanks

weymouth · July 9, 2020, 1:48pm

I’m going to look into increment!.

I’m getting effectively no penalty using CartesianIndex for x[I] += ϵ[I]. ~~However, updating the residual requires doing a matrix multiplication, and this is where CartesianIndex seems to cost us.~~

Edit: sum(f for i in 1:N) seemed to be causing trouble again (see Macro to sum over short loop - #9 by ffevotte). CartesianIndex is just as fast if I manually unroll the loop. Of course, this is no longer dimensionally independent…
Edit 2: @inbounds!

Summary

@inline δ(a,N::Int) = CartesianIndex(ntuple(i -> i==a ? 1 : 0, N))
@fastmath function incrementCI!(ϵ::Array{T,N},r::Array{T,N},L::Array,D::Array{T,N}) where {T,N}
    R = CartesianIndex(size(r))
    @inbounds @simd for I ∈ 2oneunit(R):R-oneunit(R)
        σ = (
        ϵ[I]*D[I]
        +sum(@inbounds(ϵ[I-δ(i,N)]*L[I,i]) for i=1:N)
        +sum(@inbounds(ϵ[I+δ(i,N)]*L[I+δ(i,N),i]) for i=1:N))
        r[I] -= σ
    end
end
@fastmath function increment3D!(ϵ::Array,r::Array,L::Array,D::Array)
    (ni,nj,nk) = size(r)
    @inbounds @simd for k = 2:nk-1
        for j = 2:nj-1
            for i = 2:ni-1
                σ = (
                D[i,j,k]*ϵ[i,j,k]
                +L[i,j,k,1]*ϵ[i-1,j,k]
                +L[i,j,k,2]*ϵ[i,j-1,k]
                +L[i,j,k,3]*ϵ[i,j,k-1]
                +L[i+1,j,k,1]*ϵ[i+1,j,k]
                +L[i,j+1,k,2]*ϵ[i,j+1,k]
                +L[i,j,k+1,3]*ϵ[i,j,k+1])
                r[i,j,k] -= σ
            end
        end
    end
end

function test3D(n)
    L = ones(2^n+2,2^n+2,2^n+2,3)
    D = -6*ones(2^n+2,2^n+2,2^n+2)
    r = zeros(2^n+2,2^n+2,2^n+2)
    ϵ = rand(2^n+2,2^n+2,2^n+2)
    return ϵ,r,L,D
end
ϵ,r,L,D = test3D(7)

julia> @btime incrementCI!($ϵ,$r,$L,$D)
  4.388 ms (0 allocations: 0 bytes)
julia> @btime increment3D!($ϵ,$r,$L,$D)
  4.382 ms (0 allocations: 0 bytes)

Edit: So this code is just as fast as the manually indexed version, and it’s the most important.

Topic		Replies	Views
Trying to speed-up Laplace Equation Solver Performance	36	1812	August 1, 2020
Understanding Julia performance in simple finite difference code New to Julia	38	4512	June 7, 2018
@inbounds: is the compiler now so smart that this is no longer necessary? Performance	33	2969	July 16, 2018
Partial trace of multidimensional arrays General Usage	6	2749	January 21, 2017
Unusual slow Julia code compared to C++? Performance	21	4527	July 30, 2019

Multigrid solver prototype (GMG) and simple Lid Cavity solver

Related topics