# Arithmetic performance of expression

Hello!,

Recently I’ve noticed that Julia performs better when I don’t associate the terms in a expression im using in a for loop. Why is (1), faster than (2)? According to @btime (2) is 0.3 seconds slower than (1) after I double loop in a big mesh. This is probably basic but I’m trying to get better at producing high performance code and the benchmarks go against my intuition.

(1) `(x/11.0)*(a+b+c)+(d+e)*y/11.0+f*z/11.0`
(2) `(x*(a+b+c)+(d+e)*y+f*z)/11.0`

Have a reproducible example?

2 Likes

yeah. 2 really should be faster here. it would be interesting to see the benchmark.

3 Likes

@Oscar_Smith, @Elrod Couldn’t set up a minimal working example yesterday but here’s an example where I consistenly get higher btimes for 2

``````function foo1()
dt=0.125
C=0.3
C2=C^2
Nt=40000
Nx=12000
m=rand(Nx,Nt+1)
for j = 4:Nt
@tturbo for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j])
end
end
end

function foo2()
dt=0.125
C=0.3
C2=C^2
Nt=40000
Nx=12000
m=rand(Nx,Nt+1)
for j = 4:Nt
@tturbo for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3]-12.0*dt^2+C2*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]))/11.0
end
end
end

@btime foo1()
@btime foo2()
`````````

I removed `@tturbo` and I got

``````  3.721 s (2 allocations: 3.58 GiB)
3.708 s (2 allocations: 3.58 GiB)
``````

Also, at least for me half of the time is spent in allocating the big arrays

That’s interesting, it might be related to how @tturbo works. I also get better times for foo1() when removing the macro.

I’d also suggest benchmarking something like

``````using BenchmarkTools

function foo1!(m, dt, C, C2, Nt, Nx)
for j = 4:Nt
for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j])
end
end
end

function foo2!(m, dt, C, C2, Nt, Nx)
for j = 4:Nt
for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3]-12.0*dt^2+C2*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]))/11.0
end
end
end

@btime foo1() setup = (
dt=0.125;
C=0.3;
C2=C^2;
Nt=40000;
Nx=12000;
m=rand(Nx,Nt+1);
)

@btime foo2() setup = (
dt=0.125;
C=0.3;
C2=C^2;
Nt=40000;
Nx=12000;
m=rand(Nx,Nt+1);
)
``````

to remove allocating the big arrays from the run, they create lots of noise.

Comparing `@tturbo` and `@inbounds @fastmath` versions:

``````julia> GC.gc(); @time foo1()
1.922473 seconds (2 allocations: 3.576 GiB, 0.12% gc time)

julia> GC.gc(); @time foo1_fm()
2.182295 seconds (2 allocations: 3.576 GiB, 0.04% gc time)

julia> GC.gc(); @time foo2()
1.920486 seconds (2 allocations: 3.576 GiB, 0.05% gc time)

julia> GC.gc(); @time foo2_fm()
2.186049 seconds (2 allocations: 3.576 GiB, 0.04% gc time)
``````

I don’t see any difference between `foo1` and `foo2`.

Why? Out of curiosity. Fewer divisions?

Modification of @giordano 's code
``````julia> function foo1!(m, dt, C, C2, Nt, Nx)
for j = 4:Nt
for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j])
end
end
end
foo1! (generic function with 1 method)

julia> function foo1_fm!(m, dt, C, C2, Nt, Nx)
for j = 4:Nt
@inbounds @fastmath for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j])
end
end
end
foo1_fm! (generic function with 1 method)

julia> function foo2!(m, dt, C, C2, Nt, Nx)
for j = 4:Nt
for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3]-12.0*dt^2+C2*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]))/11.0
end
end
end
foo2! (generic function with 1 method)

julia> function foo2_fm!(m, dt, C, C2, Nt, Nx)
for j = 4:Nt
@inbounds @fastmath for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3]-12.0*dt^2+C2*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]))/11.0
end
end
end
foo2_fm! (generic function with 1 method)

julia> using LoopVectorization

julia> function foo1!(m, dt, C, C2, Nt, Nx)
for j = 4:Nt
@turbo for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j])
end
end
end
foo1! (generic function with 1 method)

julia> function foo1!(m, dt, C, C2, Nt, Nx)
for j = 4:Nt
for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j])
end
end
end
foo1! (generic function with 1 method)

julia> function foo1_fm!(m, dt, C, C2, Nt, Nx)
for j = 4:Nt
@inbounds @fastmath for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j])
end
end
end
foo1_fm! (generic function with 1 method)

julia> function foo2!(m, dt, C, C2, Nt, Nx)
for j = 4:Nt
for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3]-12.0*dt^2+C2*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]))/11.0
end
end
end
foo2! (generic function with 1 method)

julia> function foo2_fm!(m, dt, C, C2, Nt, Nx)
for j = 4:Nt
@inbounds @fastmath for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3]-12.0*dt^2+C2*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]))/11.0
end
end
end
foo2_fm! (generic function with 1 method)

julia> using LoopVectorization

julia> function foo1_turbo!(m, dt, C, C2, Nt, Nx)
for j = 4:Nt
@turbo for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j])
end
end
end
foo1_turbo! (generic function with 1 method)

julia> function foo2_turbo!(m, dt, C, C2, Nt, Nx)
for j = 4:Nt
@turbo for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3]-12.0*dt^2+C2*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]))/11.0
end
end
end
foo2_turbo! (generic function with 1 method)

julia> function foo1_inbounds!(m, dt, C, C2, Nt, Nx)
for j = 4:Nt
@inbounds for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3])/11.0-(12.0/11.0)*dt^2+(C2/11.0)*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j])
end
end
end
foo1_inbounds! (generic function with 1 method)

julia> function foo2_inbounds!(m, dt, C, C2, Nt, Nx)
for j = 4:Nt
@inbounds for i=3:Nx-2
m[i,j+1] = (20.0*m[i,j]-6.0*m[i,j-1]-4.0*m[i,j-2]+m[i,j-3]-12.0*dt^2+C2*(16.0*(m[i-1,j]+m[i+1,j])-30.0*m[i,j]-m[i+2,j]-m[i-2,j]))/11.0
end
end
end
foo2_inbounds! (generic function with 1 method)
``````

results

``````julia> @btime foo1!(m, dt, C, C2, Nt, Nx) setup = (
dt=0.125;
C=0.3;
C2=C^2;
Nt=40000;
Nx=12000;
m=rand(Nx,Nt+1);
)
1.586 s (0 allocations: 0 bytes)

julia> @btime foo2!(m, dt, C, C2, Nt, Nx) setup = (
dt=0.125;
C=0.3;
C2=C^2;
Nt=40000;
Nx=12000;
m=rand(Nx,Nt+1);
)
1.767 s (0 allocations: 0 bytes)

julia> @btime foo1_inbounds!(m, dt, C, C2, Nt, Nx) setup = (
dt=0.125;
C=0.3;
C2=C^2;
Nt=40000;
Nx=12000;
m=rand(Nx,Nt+1);
)
445.422 ms (0 allocations: 0 bytes)

julia> @btime foo2_inbounds!(m, dt, C, C2, Nt, Nx) setup = (
dt=0.125;
C=0.3;
C2=C^2;
Nt=40000;
Nx=12000;
m=rand(Nx,Nt+1);
)
445.125 ms (0 allocations: 0 bytes)

julia> @btime foo1_fm!(m, dt, C, C2, Nt, Nx) setup = (
dt=0.125;
C=0.3;
C2=C^2;
Nt=40000;
Nx=12000;
m=rand(Nx,Nt+1);
)
388.491 ms (0 allocations: 0 bytes)

julia> @btime foo2_fm!(m, dt, C, C2, Nt, Nx) setup = (
dt=0.125;
C=0.3;
C2=C^2;
Nt=40000;
Nx=12000;
m=rand(Nx,Nt+1);
)
387.870 ms (0 allocations: 0 bytes)

julia> @btime foo1_turbo!(m, dt, C, C2, Nt, Nx) setup = (
dt=0.125;
C=0.3;
C2=C^2;
Nt=40000;
Nx=12000;
m=rand(Nx,Nt+1);
)
360.542 ms (0 allocations: 0 bytes)

julia> @btime foo2_turbo!(m, dt, C, C2, Nt, Nx) setup = (
dt=0.125;
C=0.3;
C2=C^2;
Nt=40000;
Nx=12000;
m=rand(Nx,Nt+1);
)
355.190 ms (0 allocations: 0 bytes)
``````

I don’t see much of a difference, except without any annotations

@giordano @Elrod I’ve run the same code you posted in my computer and somehow I get the following times for the turbo versions. It’s not such a huge difference but I’m puzzled about how the foo2_turbo performs a little worst in my computer. Could it be related to the fact that I’m using an M1 processor?

(Aside question, while tweaking my code I noticed that it performs better when I don’t use the * multiplication symbol just as you did, is that normal?)

``````296.276 ms (0 allocations: 0 bytes)
319.983 ms (0 allocations: 0 bytes)``````

Using much smaller sizes

``````julia> Nt = 400; Nx = 120;

julia> @btime foo1!(m, dt, C, C2, Nt, Nx) setup = (dt=0.125; C=0.3; C2=C^2; Nt=\$Nt; Nx=\$Nx; m=rand(Nx,Nt+1);)
165.522 μs (0 allocations: 0 bytes)

julia> @btime foo2!(m, dt, C, C2, Nt, Nx) setup = (dt=0.125; C=0.3; C2=C^2; Nt=\$Nt; Nx=\$Nx; m=rand(Nx,Nt+1);)
184.462 μs (0 allocations: 0 bytes)

julia> @btime foo1_inbounds!(m, dt, C, C2, Nt, Nx) setup = (dt=0.125; C=0.3; C2=C^2; Nt=\$Nt; Nx=\$Nx; m=rand(Nx,Nt+1);)
42.759 μs (0 allocations: 0 bytes)

julia> @btime foo2_inbounds!(m, dt, C, C2, Nt, Nx) setup = (dt=0.125; C=0.3; C2=C^2; Nt=\$Nt; Nx=\$Nx; m=rand(Nx,Nt+1);)
45.094 μs (0 allocations: 0 bytes)

julia> @btime foo1_fm!(m, dt, C, C2, Nt, Nx) setup = (dt=0.125; C=0.3; C2=C^2; Nt=\$Nt; Nx=\$Nx; m=rand(Nx,Nt+1);)
27.266 μs (0 allocations: 0 bytes)

julia> @btime foo2_fm!(m, dt, C, C2, Nt, Nx) setup = (dt=0.125; C=0.3; C2=C^2; Nt=\$Nt; Nx=\$Nx; m=rand(Nx,Nt+1);)
26.341 μs (0 allocations: 0 bytes)

julia> @btime foo1_turbo!(m, dt, C, C2, Nt, Nx) setup = (dt=0.125; C=0.3; C2=C^2; Nt=\$Nt; Nx=\$Nx; m=rand(Nx,Nt+1);)
18.326 μs (0 allocations: 0 bytes)

julia> @btime foo2_turbo!(m, dt, C, C2, Nt, Nx) setup = (dt=0.125; C=0.3; C2=C^2; Nt=\$Nt; Nx=\$Nx; m=rand(Nx,Nt+1);)
19.072 μs (0 allocations: 0 bytes)
``````

FWIW, assembly of the `@turbo` versions, foo1:

``````.LBB0_3:                                # %L84
#   Parent Loop BB0_2 Depth=1
# =>  This Inner Loop Header: Depth=2
vmovupd zmm10, zmmword ptr [rbp + 8*r15 - 152]
vmovupd zmm11, zmmword ptr [rbp + 8*r15 - 144]
vmovupd zmm12, zmmword ptr [rbp + 8*r15 - 128]
vmovupd zmm13, zmmword ptr [rbp + 8*r15 - 88]
vmovupd zmm14, zmmword ptr [rbp + 8*r15 - 24]
vaddpd  zmm10, zmm10, zmmword ptr [rbp + 8*r15 - 136]
vaddpd  zmm13, zmm13, zmmword ptr [rbp + 8*r15 - 72]
vaddpd  zmm14, zmm14, zmmword ptr [rbp + 8*r15 - 8]
vmovupd zmm15, zmmword ptr [rbp + 8*r15 - 80]
vmovupd zmm16, zmmword ptr [rbp + 8*r15 - 16]
vmovupd zmm17, zmmword ptr [rbp + 8*r15 - 64]
vmovupd zmm18, zmmword ptr [rbp + 8*r15]
vaddpd  zmm12, zmm12, zmmword ptr [rbp + 8*r15 - 160]
vaddpd  zmm17, zmm17, zmmword ptr [rbp + 8*r15 - 96]
vaddpd  zmm18, zmm18, zmmword ptr [rbp + 8*r15 - 32]
vfmadd231pd     zmm12, zmm11, zmm2      # zmm12 = (zmm11 * zmm2) + zmm12
vfmadd231pd     zmm17, zmm15, zmm2      # zmm17 = (zmm15 * zmm2) + zmm17
vfmadd231pd     zmm18, zmm16, zmm2      # zmm18 = (zmm16 * zmm2) + zmm18
vfnmadd231pd    zmm12, zmm3, zmm10      # zmm12 = -(zmm3 * zmm10) + zmm12
vfnmadd231pd    zmm17, zmm3, zmm13      # zmm17 = -(zmm3 * zmm13) + zmm17
vfnmadd231pd    zmm18, zmm3, zmm14      # zmm18 = -(zmm3 * zmm14) + zmm18
vmovupd zmm10, zmmword ptr [rcx + 8*r15 - 128]
vmovupd zmm13, zmmword ptr [rcx + 8*r15 - 64]
vmovupd zmm14, zmmword ptr [rcx + 8*r15]
vfmadd213pd     zmm10, zmm4, zmmword ptr [r13 + 8*r15 + 16] # zmm10 = (zmm4 * zmm10) + mem
vfmadd213pd     zmm13, zmm4, zmmword ptr [r13 + 8*r15 + 80] # zmm13 = (zmm4 * zmm13) + mem
vfmadd213pd     zmm14, zmm4, zmmword ptr [r13 + 8*r15 + 144] # zmm14 = (zmm4 * zmm14) + mem
vfmadd231pd     zmm10, zmm5, zmmword ptr [rbx + 8*r15 - 128] # zmm10 = (zmm5 * mem) + zmm10
vfmadd231pd     zmm13, zmm5, zmmword ptr [rbx + 8*r15 - 64] # zmm13 = (zmm5 * mem) + zmm13
vfmadd231pd     zmm14, zmm5, zmmword ptr [rbx + 8*r15] # zmm14 = (zmm5 * mem) + zmm14
vfmadd231pd     zmm10, zmm6, zmm11      # zmm10 = (zmm6 * zmm11) + zmm10
vfmadd231pd     zmm13, zmm6, zmm15      # zmm13 = (zmm6 * zmm15) + zmm13
vfmadd231pd     zmm14, zmm6, zmm16      # zmm14 = (zmm6 * zmm16) + zmm14
vmulpd  zmm10, zmm10, zmm9
vmulpd  zmm11, zmm13, zmm7
vmulpd  zmm13, zmm14, zmm7
vfmsub231pd     zmm10, zmm0, zmm12      # zmm10 = (zmm0 * zmm12) - zmm10
vfmsub231pd     zmm11, zmm0, zmm17      # zmm11 = (zmm0 * zmm17) - zmm11
vfmsub231pd     zmm13, zmm0, zmm18      # zmm13 = (zmm0 * zmm18) - zmm13
vfmsub231pd     zmm10, zmm1, zmm8       # zmm10 = (zmm1 * zmm8) - zmm10
vfmsub231pd     zmm11, zmm1, zmm8       # zmm11 = (zmm1 * zmm8) - zmm11
vfmsub231pd     zmm13, zmm1, zmm8       # zmm13 = (zmm1 * zmm8) - zmm13
vmovupd zmmword ptr [rax + 8*r15 - 128], zmm10
vmovupd zmmword ptr [rax + 8*r15 - 64], zmm11
vmovupd zmmword ptr [rax + 8*r15], zmm13
cmp     r15, rsi
jle     .LBB0_3
``````

foo2:

``````.LBB0_3:                                # %L82
#   Parent Loop BB0_2 Depth=1
# =>  This Inner Loop Header: Depth=2
vmovupd zmm12, zmmword ptr [rcx + 8*r15 - 80]
vmovupd zmm13, zmmword ptr [rcx + 8*r15 - 16]
vmovupd zmm14, zmmword ptr [rcx + 8*r15 - 152]
vmovupd zmm15, zmmword ptr [rcx + 8*r15 - 144]
vmovupd zmm16, zmmword ptr [rcx + 8*r15 - 128]
vmovupd zmm17, zmmword ptr [rcx + 8*r15 - 88]
vmovupd zmm18, zmmword ptr [rcx + 8*r15 - 24]
vaddpd  zmm14, zmm14, zmmword ptr [rcx + 8*r15 - 136]
vaddpd  zmm17, zmm17, zmmword ptr [rcx + 8*r15 - 72]
vaddpd  zmm18, zmm18, zmmword ptr [rcx + 8*r15 - 8]
vmovupd zmm19, zmmword ptr [rcx + 8*r15 - 64]
vmovupd zmm20, zmmword ptr [rcx + 8*r15]
vaddpd  zmm16, zmm16, zmmword ptr [rcx + 8*r15 - 160]
vaddpd  zmm19, zmm19, zmmword ptr [rcx + 8*r15 - 96]
vaddpd  zmm20, zmm20, zmmword ptr [rcx + 8*r15 - 32]
vfmadd231pd     zmm16, zmm15, zmm2      # zmm16 = (zmm15 * zmm2) + zmm16
vfmadd231pd     zmm19, zmm12, zmm2      # zmm19 = (zmm12 * zmm2) + zmm19
vfmadd231pd     zmm20, zmm13, zmm2      # zmm20 = (zmm13 * zmm2) + zmm20
vfnmadd231pd    zmm16, zmm3, zmm14      # zmm16 = -(zmm3 * zmm14) + zmm16
vfnmadd231pd    zmm19, zmm3, zmm17      # zmm19 = -(zmm3 * zmm17) + zmm19
vfnmadd231pd    zmm20, zmm3, zmm18      # zmm20 = -(zmm3 * zmm18) + zmm20
vfmsub213pd     zmm16, zmm0, zmmword ptr [r13 + 8*r15 + 16] # zmm16 = (zmm0 * zmm16) - mem
vfmsub213pd     zmm19, zmm0, zmmword ptr [r13 + 8*r15 + 80] # zmm19 = (zmm0 * zmm19) - mem
vfmsub213pd     zmm20, zmm0, zmmword ptr [r13 + 8*r15 + 144] # zmm20 = (zmm0 * zmm20) - mem
vfnmadd231pd    zmm16, zmm1, zmm4       # zmm16 = -(zmm1 * zmm4) + zmm16
vfnmadd231pd    zmm19, zmm1, zmm4       # zmm19 = -(zmm1 * zmm4) + zmm19
vfnmadd231pd    zmm20, zmm1, zmm4       # zmm20 = -(zmm1 * zmm4) + zmm20
vfnmadd231pd    zmm16, zmm5, zmmword ptr [rbp + 8*r15 - 128] # zmm16 = -(zmm5 * mem) + zmm16
vfnmadd231pd    zmm19, zmm5, zmmword ptr [rbp + 8*r15 - 64] # zmm19 = -(zmm5 * mem) + zmm19
vfnmadd231pd    zmm20, zmm5, zmmword ptr [rbp + 8*r15] # zmm20 = -(zmm5 * mem) + zmm20
vfnmadd231pd    zmm16, zmm6, zmmword ptr [rbx + 8*r15 - 128] # zmm16 = -(zmm6 * mem) + zmm16
vfnmadd231pd    zmm19, zmm6, zmmword ptr [rbx + 8*r15 - 64] # zmm19 = -(zmm6 * mem) + zmm19
vfnmadd231pd    zmm20, zmm6, zmmword ptr [rbx + 8*r15] # zmm20 = -(zmm6 * mem) + zmm20
vfmsub231pd     zmm16, zmm7, zmm15      # zmm16 = (zmm7 * zmm15) - zmm16
vmulpd  zmm14, zmm16, zmm11
vfmsub231pd     zmm19, zmm7, zmm12      # zmm19 = (zmm7 * zmm12) - zmm19
vfmsub231pd     zmm20, zmm7, zmm13      # zmm20 = (zmm7 * zmm13) - zmm20
vmulpd  zmm12, zmm19, zmm9
vmulpd  zmm13, zmm20, zmm9
vmovupd zmmword ptr [rax + 8*r15 - 128], zmm14
vmovupd zmmword ptr [rax + 8*r15 - 64], zmm12
vmovupd zmmword ptr [rax + 8*r15], zmm13