Arithmetic performance of expression

Hello!,

Recently I’ve noticed that Julia performs better when I don’t associate the terms in an expression I’m using in a for loop. Why is (1) faster than (2)? According to @btime, (2) is 0.3 seconds slower than (1) after a double loop over a big mesh. This is probably basic, but I’m trying to get better at producing high-performance code, and these benchmarks go against my intuition.

(1) (x/11.0)*(a+b+c)+(d+e)*y/11.0+f*z/11.0
(2) (x*(a+b+c)+(d+e)*y+f*z)/11.0

Do you have a reproducible example?

2 Likes

Yeah, (2) really should be faster here. It would be interesting to see the benchmark.

3 Likes

@Oscar_Smith, @Elrod I couldn’t set up a minimal working example yesterday, but here’s an example where I consistently get higher btimes for (2):

# Variant (1): the division by 11.0 is distributed across the terms,
# i.e. each group of terms carries its own `.../11.0` (or `12.0/11.0`,
# `C2/11.0`) factor.  Kept byte-identical to the original post — the
# expression association is exactly what is being benchmarked.
# NOTE(review): `@tturbo` requires `using LoopVectorization` — presumably
# loaded earlier in the session; confirm before running.
function foo1()
   dt=0.125
   C=0.3
   C2=C^2
   Nt=40000
   Nx=12000
   # ~3.6 GiB allocation on every call — this dominates the timing and
   # adds noise to the benchmark (see the discussion below).
   m=rand(Nx,Nt+1)
# Time-stepping stencil: column j+1 is computed from columns j-3:j and
# the 5-point spatial neighbourhood i-2:i+2 of column j.
for j = 4:Nt
   @tturbo for i=3:Nx-2
      m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]) 
   end
end
end


# Variant (2): same stencil as foo1, but the whole right-hand side is
# summed first and divided by 11.0 once at the end.  Kept byte-identical
# to the original post — the associativity difference vs. foo1 is the
# point of the benchmark.
# NOTE(review): `@tturbo` requires `using LoopVectorization` — presumably
# loaded earlier in the session; confirm before running.
function foo2()
   dt=0.125
   C=0.3
   C2=C^2
   Nt=40000
   Nx=12000
   # Same ~3.6 GiB allocation per call as foo1 — dominates the timing.
   m=rand(Nx,Nt+1)
for j = 4:Nt
   @tturbo for i=3:Nx-2
      m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3]-12.0*dt^2+C2*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]))/11.0 
   end
  end
end

# Benchmark both variants (requires `using BenchmarkTools`).  Note that
# each call re-allocates the large matrix, so allocation time is
# included in the measurement.
@btime foo1()
@btime foo2()

I removed @tturbo and I got

  3.721 s (2 allocations: 3.58 GiB)
  3.708 s (2 allocations: 3.58 GiB)

Also, at least for me half of the time is spent in allocating the big arrays

That’s interesting, it might be related to how @tturbo works. I also get better times for foo1() when removing the macro.

I’d also suggest benchmarking something like

using BenchmarkTools

"""
    foo1!(m, dt, C, C2, Nt, Nx)

In-place stencil update of `m`: for each time column `j`, writes column
`j+1` from columns `j-3:j` and the spatial neighbourhood `i-2:i+2`.
Variant (1): the division by 11.0 is distributed over the term groups.
Note `C` itself is never read here — only `C2` (the caller passes
`C2 = C^2`).  The expression is kept byte-identical to the thread's
original so the (1)-vs-(2) associativity comparison stays valid.
"""
function foo1!(m, dt, C, C2, Nt, Nx)
    for j = 4:Nt
        for i=3:Nx-2
            m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]) 
        end
    end
end


"""
    foo2!(m, dt, C, C2, Nt, Nx)

In-place stencil update of `m`, identical in intent to `foo1!` but
written as variant (2): the whole right-hand side is summed first and
divided by 11.0 once.  `C` itself is never read — only `C2`.  The
expression is kept byte-identical to the thread's original so the
(1)-vs-(2) associativity comparison stays valid.
"""
function foo2!(m, dt, C, C2, Nt, Nx)
    for j = 4:Nt
        for i=3:Nx-2
            m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3]-12.0*dt^2+C2*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]))/11.0 
        end
    end
end

# Benchmark the in-place versions so the ~3.6 GiB allocation happens in
# `setup` and is excluded from the timing (requires BenchmarkTools).
# NOTE(review): the original snippet benchmarked the zero-argument
# `foo1()` / `foo2()` here, which allocate internally and never use the
# variables bound in `setup` — defeating the stated purpose.  Call the
# in-place `foo1!` / `foo2!` with the setup-bound arguments instead,
# exactly as done later in the thread.
@btime foo1!(m, dt, C, C2, Nt, Nx) setup = (
    dt=0.125;
    C=0.3;
    C2=C^2;
    Nt=40000;
    Nx=12000;
    m=rand(Nx,Nt+1);
)

@btime foo2!(m, dt, C, C2, Nt, Nx) setup = (
    dt=0.125;
    C=0.3;
    C2=C^2;
    Nt=40000;
    Nx=12000;
    m=rand(Nx,Nt+1);
)

to remove allocating the big arrays from the run, they create lots of noise.

Comparing @tturbo and @inbounds @fastmath versions:

julia> GC.gc(); @time foo1()
  1.922473 seconds (2 allocations: 3.576 GiB, 0.12% gc time)

julia> GC.gc(); @time foo1_fm()
  2.182295 seconds (2 allocations: 3.576 GiB, 0.04% gc time)

julia> GC.gc(); @time foo2()
  1.920486 seconds (2 allocations: 3.576 GiB, 0.05% gc time)

julia> GC.gc(); @time foo2_fm()
  2.186049 seconds (2 allocations: 3.576 GiB, 0.04% gc time)

I don’t see any difference between foo1 and foo2.

Why? Out of curiosity. Fewer divisions?

Modification of @giordano’s code:
julia> function foo1!(m, dt, C, C2, Nt, Nx)
           for j = 4:Nt
               for i=3:Nx-2
                   m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j])
               end
           end
       end
foo1! (generic function with 1 method)

julia> function foo1_fm!(m, dt, C, C2, Nt, Nx)
           for j = 4:Nt
               @inbounds @fastmath for i=3:Nx-2
                   m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j])
               end
           end
       end
foo1_fm! (generic function with 1 method)

julia> function foo2!(m, dt, C, C2, Nt, Nx)
           for j = 4:Nt
               for i=3:Nx-2
                   m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3]-12.0*dt^2+C2*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]))/11.0
               end
           end
       end
foo2! (generic function with 1 method)

julia> function foo2_fm!(m, dt, C, C2, Nt, Nx)
           for j = 4:Nt
               @inbounds @fastmath for i=3:Nx-2
                   m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3]-12.0*dt^2+C2*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]))/11.0
               end
           end
       end
foo2_fm! (generic function with 1 method)

julia> using LoopVectorization

julia> function foo1!(m, dt, C, C2, Nt, Nx)
           for j = 4:Nt
               @turbo for i=3:Nx-2
                   m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j])
               end
           end
       end
foo1! (generic function with 1 method)

julia> function foo1!(m, dt, C, C2, Nt, Nx)
           for j = 4:Nt
               for i=3:Nx-2
                   m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j])
               end
           end
       end
foo1! (generic function with 1 method)

julia> function foo1_fm!(m, dt, C, C2, Nt, Nx)
           for j = 4:Nt
               @inbounds @fastmath for i=3:Nx-2
                   m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j])
               end
           end
       end
foo1_fm! (generic function with 1 method)

julia> function foo2!(m, dt, C, C2, Nt, Nx)
           for j = 4:Nt
               for i=3:Nx-2
                   m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3]-12.0*dt^2+C2*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]))/11.0
               end
           end
       end
foo2! (generic function with 1 method)

julia> function foo2_fm!(m, dt, C, C2, Nt, Nx)
           for j = 4:Nt
               @inbounds @fastmath for i=3:Nx-2
                   m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3]-12.0*dt^2+C2*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]))/11.0
               end
           end
       end
foo2_fm! (generic function with 1 method)

julia> using LoopVectorization

julia> function foo1_turbo!(m, dt, C, C2, Nt, Nx)
           for j = 4:Nt
               @turbo for i=3:Nx-2
                   m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j])
               end
           end
       end
foo1_turbo! (generic function with 1 method)

julia> function foo2_turbo!(m, dt, C, C2, Nt, Nx)
           for j = 4:Nt
               @turbo for i=3:Nx-2
                   m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3]-12.0*dt^2+C2*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]))/11.0
               end
           end
       end
foo2_turbo! (generic function with 1 method)

julia> function foo1_inbounds!(m, dt, C, C2, Nt, Nx)
           for j = 4:Nt
               @inbounds for i=3:Nx-2
                   m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j])
               end
           end
       end
foo1_inbounds! (generic function with 1 method)

julia> function foo2_inbounds!(m, dt, C, C2, Nt, Nx)
           for j = 4:Nt
               @inbounds for i=3:Nx-2
                   m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3]-12.0*dt^2+C2*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]))/11.0
               end
           end
       end
foo2_inbounds! (generic function with 1 method)

results

julia> @btime foo1!(m, dt, C, C2, Nt, Nx) setup = (
           dt=0.125;
           C=0.3;
           C2=C^2;
           Nt=40000;
           Nx=12000;
           m=rand(Nx,Nt+1);
       )
  1.586 s (0 allocations: 0 bytes)

julia> @btime foo2!(m, dt, C, C2, Nt, Nx) setup = (
           dt=0.125;
           C=0.3;
           C2=C^2;
           Nt=40000;
           Nx=12000;
           m=rand(Nx,Nt+1);
       )
  1.767 s (0 allocations: 0 bytes)

julia> @btime foo1_inbounds!(m, dt, C, C2, Nt, Nx) setup = (
           dt=0.125;
           C=0.3;
           C2=C^2;
           Nt=40000;
           Nx=12000;
           m=rand(Nx,Nt+1);
       )
  445.422 ms (0 allocations: 0 bytes)

julia> @btime foo2_inbounds!(m, dt, C, C2, Nt, Nx) setup = (
           dt=0.125;
           C=0.3;
           C2=C^2;
           Nt=40000;
           Nx=12000;
           m=rand(Nx,Nt+1);
       )
  445.125 ms (0 allocations: 0 bytes)

julia> @btime foo1_fm!(m, dt, C, C2, Nt, Nx) setup = (
           dt=0.125;
           C=0.3;
           C2=C^2;
           Nt=40000;
           Nx=12000;
           m=rand(Nx,Nt+1);
       )
  388.491 ms (0 allocations: 0 bytes)

julia> @btime foo2_fm!(m, dt, C, C2, Nt, Nx) setup = (
           dt=0.125;
           C=0.3;
           C2=C^2;
           Nt=40000;
           Nx=12000;
           m=rand(Nx,Nt+1);
       )
  387.870 ms (0 allocations: 0 bytes)

julia> @btime foo1_turbo!(m, dt, C, C2, Nt, Nx) setup = (
           dt=0.125;
           C=0.3;
           C2=C^2;
           Nt=40000;
           Nx=12000;
           m=rand(Nx,Nt+1);
       )
  360.542 ms (0 allocations: 0 bytes)

julia> @btime foo2_turbo!(m, dt, C, C2, Nt, Nx) setup = (
           dt=0.125;
           C=0.3;
           C2=C^2;
           Nt=40000;
           Nx=12000;
           m=rand(Nx,Nt+1);
       )
  355.190 ms (0 allocations: 0 bytes)

I don’t see much of a difference, except without any annotations

@giordano @Elrod I’ve run the same code you posted on my computer, and somehow I get the following times for the turbo versions. It’s not such a huge difference, but I’m puzzled about why foo2_turbo performs a little worse on my computer. Could it be related to the fact that I’m using an M1 processor?

(Aside question, while tweaking my code I noticed that it performs better when I don’t use the * multiplication symbol just as you did, is that normal?)

296.276 ms (0 allocations: 0 bytes)
319.983 ms (0 allocations: 0 bytes)

Using much smaller sizes

julia> Nt = 400; Nx = 120;

julia> @btime foo1!(m, dt, C, C2, Nt, Nx) setup = (dt=0.125; C=0.3; C2=C^2; Nt=$Nt; Nx=$Nx; m=rand(Nx,Nt+1);)
  165.522 μs (0 allocations: 0 bytes)

julia> @btime foo2!(m, dt, C, C2, Nt, Nx) setup = (dt=0.125; C=0.3; C2=C^2; Nt=$Nt; Nx=$Nx; m=rand(Nx,Nt+1);)
  184.462 μs (0 allocations: 0 bytes)

julia> @btime foo1_inbounds!(m, dt, C, C2, Nt, Nx) setup = (dt=0.125; C=0.3; C2=C^2; Nt=$Nt; Nx=$Nx; m=rand(Nx,Nt+1);)
  42.759 μs (0 allocations: 0 bytes)

julia> @btime foo2_inbounds!(m, dt, C, C2, Nt, Nx) setup = (dt=0.125; C=0.3; C2=C^2; Nt=$Nt; Nx=$Nx; m=rand(Nx,Nt+1);)
  45.094 μs (0 allocations: 0 bytes)

julia> @btime foo1_fm!(m, dt, C, C2, Nt, Nx) setup = (dt=0.125; C=0.3; C2=C^2; Nt=$Nt; Nx=$Nx; m=rand(Nx,Nt+1);)
  27.266 μs (0 allocations: 0 bytes)

julia> @btime foo2_fm!(m, dt, C, C2, Nt, Nx) setup = (dt=0.125; C=0.3; C2=C^2; Nt=$Nt; Nx=$Nx; m=rand(Nx,Nt+1);)
  26.341 μs (0 allocations: 0 bytes)

julia> @btime foo1_turbo!(m, dt, C, C2, Nt, Nx) setup = (dt=0.125; C=0.3; C2=C^2; Nt=$Nt; Nx=$Nx; m=rand(Nx,Nt+1);)
  18.326 μs (0 allocations: 0 bytes)

julia> @btime foo2_turbo!(m, dt, C, C2, Nt, Nx) setup = (dt=0.125; C=0.3; C2=C^2; Nt=$Nt; Nx=$Nx; m=rand(Nx,Nt+1);)
  19.072 μs (0 allocations: 0 bytes)

FWIW, assembly of the @turbo versions, foo1:

.LBB0_3:                                # %L84
                                        #   Parent Loop BB0_2 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        vmovupd zmm10, zmmword ptr [rbp + 8*r15 - 152]
        vmovupd zmm11, zmmword ptr [rbp + 8*r15 - 144]
        vmovupd zmm12, zmmword ptr [rbp + 8*r15 - 128]
        vmovupd zmm13, zmmword ptr [rbp + 8*r15 - 88]
        vmovupd zmm14, zmmword ptr [rbp + 8*r15 - 24]
        vaddpd  zmm10, zmm10, zmmword ptr [rbp + 8*r15 - 136]
        vaddpd  zmm13, zmm13, zmmword ptr [rbp + 8*r15 - 72]
        vaddpd  zmm14, zmm14, zmmword ptr [rbp + 8*r15 - 8]
        vmovupd zmm15, zmmword ptr [rbp + 8*r15 - 80]
        vmovupd zmm16, zmmword ptr [rbp + 8*r15 - 16]
        vmovupd zmm17, zmmword ptr [rbp + 8*r15 - 64]
        vmovupd zmm18, zmmword ptr [rbp + 8*r15]
        vaddpd  zmm12, zmm12, zmmword ptr [rbp + 8*r15 - 160]
        vaddpd  zmm17, zmm17, zmmword ptr [rbp + 8*r15 - 96]
        vaddpd  zmm18, zmm18, zmmword ptr [rbp + 8*r15 - 32]
        vfmadd231pd     zmm12, zmm11, zmm2      # zmm12 = (zmm11 * zmm2) + zmm12
        vfmadd231pd     zmm17, zmm15, zmm2      # zmm17 = (zmm15 * zmm2) + zmm17
        vfmadd231pd     zmm18, zmm16, zmm2      # zmm18 = (zmm16 * zmm2) + zmm18
        vfnmadd231pd    zmm12, zmm3, zmm10      # zmm12 = -(zmm3 * zmm10) + zmm12
        vfnmadd231pd    zmm17, zmm3, zmm13      # zmm17 = -(zmm3 * zmm13) + zmm17
        vfnmadd231pd    zmm18, zmm3, zmm14      # zmm18 = -(zmm3 * zmm14) + zmm18
        vmovupd zmm10, zmmword ptr [rcx + 8*r15 - 128]
        vmovupd zmm13, zmmword ptr [rcx + 8*r15 - 64]
        vmovupd zmm14, zmmword ptr [rcx + 8*r15]
        vfmadd213pd     zmm10, zmm4, zmmword ptr [r13 + 8*r15 + 16] # zmm10 = (zmm4 * zmm10) + mem
        vfmadd213pd     zmm13, zmm4, zmmword ptr [r13 + 8*r15 + 80] # zmm13 = (zmm4 * zmm13) + mem
        vfmadd213pd     zmm14, zmm4, zmmword ptr [r13 + 8*r15 + 144] # zmm14 = (zmm4 * zmm14) + mem
        vfmadd231pd     zmm10, zmm5, zmmword ptr [rbx + 8*r15 - 128] # zmm10 = (zmm5 * mem) + zmm10
        vfmadd231pd     zmm13, zmm5, zmmword ptr [rbx + 8*r15 - 64] # zmm13 = (zmm5 * mem) + zmm13
        vfmadd231pd     zmm14, zmm5, zmmword ptr [rbx + 8*r15] # zmm14 = (zmm5 * mem) + zmm14
        vfmadd231pd     zmm10, zmm6, zmm11      # zmm10 = (zmm6 * zmm11) + zmm10
        vfmadd231pd     zmm13, zmm6, zmm15      # zmm13 = (zmm6 * zmm15) + zmm13
        vfmadd231pd     zmm14, zmm6, zmm16      # zmm14 = (zmm6 * zmm16) + zmm14
        vmulpd  zmm10, zmm10, zmm9
        vmulpd  zmm11, zmm13, zmm7
        vmulpd  zmm13, zmm14, zmm7
        vfmsub231pd     zmm10, zmm0, zmm12      # zmm10 = (zmm0 * zmm12) - zmm10
        vfmsub231pd     zmm11, zmm0, zmm17      # zmm11 = (zmm0 * zmm17) - zmm11
        vfmsub231pd     zmm13, zmm0, zmm18      # zmm13 = (zmm0 * zmm18) - zmm13
        vfmsub231pd     zmm10, zmm1, zmm8       # zmm10 = (zmm1 * zmm8) - zmm10
        vfmsub231pd     zmm11, zmm1, zmm8       # zmm11 = (zmm1 * zmm8) - zmm11
        vfmsub231pd     zmm13, zmm1, zmm8       # zmm13 = (zmm1 * zmm8) - zmm13
        vmovupd zmmword ptr [rax + 8*r15 - 128], zmm10
        vmovupd zmmword ptr [rax + 8*r15 - 64], zmm11
        vmovupd zmmword ptr [rax + 8*r15], zmm13
        add     r15, 24
        cmp     r15, rsi
        jle     .LBB0_3

foo2:

.LBB0_3:                                # %L82
                                        #   Parent Loop BB0_2 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        vmovupd zmm12, zmmword ptr [rcx + 8*r15 - 80]
        vmovupd zmm13, zmmword ptr [rcx + 8*r15 - 16]
        vmovupd zmm14, zmmword ptr [rcx + 8*r15 - 152]
        vmovupd zmm15, zmmword ptr [rcx + 8*r15 - 144]
        vmovupd zmm16, zmmword ptr [rcx + 8*r15 - 128]
        vmovupd zmm17, zmmword ptr [rcx + 8*r15 - 88]
        vmovupd zmm18, zmmword ptr [rcx + 8*r15 - 24]
        vaddpd  zmm14, zmm14, zmmword ptr [rcx + 8*r15 - 136]
        vaddpd  zmm17, zmm17, zmmword ptr [rcx + 8*r15 - 72]
        vaddpd  zmm18, zmm18, zmmword ptr [rcx + 8*r15 - 8]
        vmovupd zmm19, zmmword ptr [rcx + 8*r15 - 64]
        vmovupd zmm20, zmmword ptr [rcx + 8*r15]
        vaddpd  zmm16, zmm16, zmmword ptr [rcx + 8*r15 - 160]
        vaddpd  zmm19, zmm19, zmmword ptr [rcx + 8*r15 - 96]
        vaddpd  zmm20, zmm20, zmmword ptr [rcx + 8*r15 - 32]
        vfmadd231pd     zmm16, zmm15, zmm2      # zmm16 = (zmm15 * zmm2) + zmm16
        vfmadd231pd     zmm19, zmm12, zmm2      # zmm19 = (zmm12 * zmm2) + zmm19
        vfmadd231pd     zmm20, zmm13, zmm2      # zmm20 = (zmm13 * zmm2) + zmm20
        vfnmadd231pd    zmm16, zmm3, zmm14      # zmm16 = -(zmm3 * zmm14) + zmm16
        vfnmadd231pd    zmm19, zmm3, zmm17      # zmm19 = -(zmm3 * zmm17) + zmm19
        vfnmadd231pd    zmm20, zmm3, zmm18      # zmm20 = -(zmm3 * zmm18) + zmm20
        vfmsub213pd     zmm16, zmm0, zmmword ptr [r13 + 8*r15 + 16] # zmm16 = (zmm0 * zmm16) - mem
        vfmsub213pd     zmm19, zmm0, zmmword ptr [r13 + 8*r15 + 80] # zmm19 = (zmm0 * zmm19) - mem
        vfmsub213pd     zmm20, zmm0, zmmword ptr [r13 + 8*r15 + 144] # zmm20 = (zmm0 * zmm20) - mem
        vfnmadd231pd    zmm16, zmm1, zmm4       # zmm16 = -(zmm1 * zmm4) + zmm16
        vfnmadd231pd    zmm19, zmm1, zmm4       # zmm19 = -(zmm1 * zmm4) + zmm19
        vfnmadd231pd    zmm20, zmm1, zmm4       # zmm20 = -(zmm1 * zmm4) + zmm20
        vfnmadd231pd    zmm16, zmm5, zmmword ptr [rbp + 8*r15 - 128] # zmm16 = -(zmm5 * mem) + zmm16
        vfnmadd231pd    zmm19, zmm5, zmmword ptr [rbp + 8*r15 - 64] # zmm19 = -(zmm5 * mem) + zmm19
        vfnmadd231pd    zmm20, zmm5, zmmword ptr [rbp + 8*r15] # zmm20 = -(zmm5 * mem) + zmm20
        vfnmadd231pd    zmm16, zmm6, zmmword ptr [rbx + 8*r15 - 128] # zmm16 = -(zmm6 * mem) + zmm16
        vfnmadd231pd    zmm19, zmm6, zmmword ptr [rbx + 8*r15 - 64] # zmm19 = -(zmm6 * mem) + zmm19
        vfnmadd231pd    zmm20, zmm6, zmmword ptr [rbx + 8*r15] # zmm20 = -(zmm6 * mem) + zmm20
        vfmsub231pd     zmm16, zmm7, zmm15      # zmm16 = (zmm7 * zmm15) - zmm16
        vmulpd  zmm14, zmm16, zmm11
        vfmsub231pd     zmm19, zmm7, zmm12      # zmm19 = (zmm7 * zmm12) - zmm19
        vfmsub231pd     zmm20, zmm7, zmm13      # zmm20 = (zmm7 * zmm13) - zmm20
        vmulpd  zmm12, zmm19, zmm9
        vmulpd  zmm13, zmm20, zmm9
        vmovupd zmmword ptr [rax + 8*r15 - 128], zmm14
        vmovupd zmmword ptr [rax + 8*r15 - 64], zmm12
        vmovupd zmmword ptr [rax + 8*r15], zmm13
        add     r15, 24
        cmp     r15, rsi
        jle     .LBB0_3

Assembly doesn’t look strikingly different. No divisions, it’s been replaced with multiplication by the inverse.
Both versions require 10 arithmetic instructions, a mix of adds, fmas, and multiplies.

I have an M1 at home, so I can take a look later.

1 Like