My benchmark was run on Windows; was yours too? On Fedora, using the same device, the results reverse, but the second function is much slower than it is on Windows.
julia> @code_native debuginfo=:none g(M1, SM1)
# Native code listing for the call g(M1, SM1); function symbol julia_g_1297.
# Syntax: GNU as, AT&T operand order (source first, destination last), x86-64 ELF.
#
# Registers on entry, as the code below uses them:
#   %rdi  first argument;  fields read: (%rdi), 24(%rdi), 32(%rdi)
#   %rsi  second argument; fields read: (%rsi) (a pointer) and the 32-bit value
#         at 8(%rsi); through that pointer: offset 0, 24 and 32
# Result: %rax, a running count incremented by the 0/1 outcome of each vcmpeqsd.
# NOTE(review): presumably %rdi is the plain matrix M1, %rsi the wrapper SM1,
# offsets 24/32 the two dimensions and offset 0 the data pointer - confirm
# against the Julia object layouts; the listing itself does not say so.
#
# Stack slots, relative to the realigned %rsp:
#   slot 16     n = value loaded from 24(%rdi)
#   slot 24     current outer index j (starts at 1)
#   slot 32/40  two-word index buffer handed to j_throw_boundserror_1300
#   slot 48     n + 1
#   slot 56     saved %rsi (second argument)
#   slot 64     saved %r12
#   slot 72     n * 8 (byte step added to %r11 once per outer iteration)
#   slot 80     saved %rbx (value loaded from 32(%rdi))
#   slot 88     saved %r10
#   slot 96..   32-byte zeroed record linked through (%r12); 112 and 120 are
#               written only on the .LBB0_22 path
#   slot 144..  three-word argument array for ijl_invoke
.text
.file "g"
.globl julia_g_1297 # -- Begin function julia_g_1297
.p2align 4, 0x90
.type julia_g_1297,@function
julia_g_1297: # @julia_g_1297
.cfi_startproc
# %bb.0: # %top
# Prologue: set up %rbp as frame pointer, save the five callee-saved registers
# used below, realign %rsp to 32 bytes and reserve 192 bytes of locals.
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
andq $-32, %rsp
subq $192, %rsp
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
# Zero the 32 bytes at slot 96 (the aligned store is valid because %rsp was
# realigned to 32 above).
vxorpd %xmm0, %xmm0, %xmm0
vmovapd %ymm0, 96(%rsp)
#APP
movq %fs:0, %rax
#NO_APP
# %r12 = word stored 8 bytes below the thread pointer read from %fs:0.
# The next five instructions write 8 into slot 96, copy the old (%r12) into
# slot 104 and make (%r12) point at slot 96, i.e. they link this record in.
# NOTE(review): this looks like a Julia GC-frame push - confirm.
movq -8(%rax), %r12
movq $8, 96(%rsp)
movq (%r12), %rax
movq %rax, 104(%rsp)
leaq 96(%rsp), %rax
movq %rax, (%r12)
# Load the two size fields of each operand: %rax/%rbx from the first argument,
# %r13/%r15 from the object the second argument points to.
movq 24(%rdi), %rax
movq 32(%rdi), %rbx
movq (%rsi), %rdx
movq 24(%rdx), %r13
movq 32(%rdx), %r15
movq %rax, 16(%rsp) # 8-byte Spill
# Either pair differing sends control to the error path at .LBB0_22.
cmpq %r13, %rax
jne .LBB0_22
# %bb.1: # %top
cmpq %r15, %rbx
jne .LBB0_22
# %bb.2: # %L53
# Result starts at 0; if either size is zero, return it immediately.
xorl %eax, %eax
cmpq $0, 16(%rsp) # 8-byte Folded Reload
je .LBB0_21
# %bb.3: # %L53
testq %rbx, %rbx
je .LBB0_21
# %bb.4: # %L73.preheader
# Loop setup.  %r12 is spilled because the inner loop reuses it as scratch.
#   %r11  = (%rdi), base pointer for loads from the first operand
#   %edi  = 32-bit field at 8(%rsi), later compared with 0x55000000/0x4C000000
#   %r14  = first word of the object behind (%rsi), base for the second operand
#   %r13  = -n,  %rdx = 0 (decremented per outer iteration, so it is -(j-1))
#   %r10  = 0 (incremented per outer iteration), %rcx = 1 (first j), %rax = 0
# NOTE(review): the two constants are presumably the characters 'U' and 'L'
# in Julia's Char encoding, matching the j_throw_uplo helper - confirm.
movq %r12, 64(%rsp) # 8-byte Spill
movq (%rdi), %r11
movq %rsi, 56(%rsp) # 8-byte Spill
movl 8(%rsi), %edi
movq (%rdx), %r14
movq 16(%rsp), %r13 # 8-byte Reload
leaq 1(%r13), %rax
movq %rax, 48(%rsp) # 8-byte Spill
leaq (,%r13,8), %rax
movq %rax, 72(%rsp) # 8-byte Spill
negq %r13
xorl %edx, %edx
xorl %r10d, %r10d
movl $1, %ecx
xorl %eax, %eax
movq %rbx, 80(%rsp) # 8-byte Spill
jmp .LBB0_5
.p2align 4, 0x90
.LBB0_19: # %L154
# in Loop: Header=BB0_5 Depth=1
# Outer-loop latch: %rcx = j + 1, restore and bump %r10, advance %r11 by n*8
# bytes, decrement %rdx, and leave the loop once j equals the saved %rbx.
movq 24(%rsp), %rsi # 8-byte Reload
leaq 1(%rsi), %rcx
movq 88(%rsp), %r10 # 8-byte Reload
incq %r10
addq 72(%rsp), %r11 # 8-byte Folded Reload
decq %rdx
movq 80(%rsp), %rbx # 8-byte Reload
cmpq %rbx, %rsi
je .LBB0_20
.LBB0_5: # %ib.lr.ph.split.us
# =>This Loop Header: Depth=1
# Child Loop BB0_9 Depth 2
# Outer-loop header: record j, then range-check j-1 against %rbx (unsigned);
# failure goes to .LBB0_6.  %r9 = (j-1) * n, %r15 = 1 (inner index i),
# %r8 = 0 (holds i-1 on later iterations).
movq %rcx, 24(%rsp) # 8-byte Spill
leaq -1(%rcx), %r9
cmpq %rbx, %r9
jae .LBB0_6
# %bb.8: # %idxend.us.preheader
# in Loop: Header=BB0_5 Depth=1
imulq 16(%rsp), %r9 # 8-byte Folded Reload
movl $1, %r15d
movq %r10, 88(%rsp) # 8-byte Spill
xorl %r8d, %r8d
.p2align 4, 0x90
.LBB0_9: # %idxend.us
# Parent Loop BB0_5 Depth=1
# => This Inner Loop Header: Depth=2
# Inner-loop header: %r12 = i - n; a value of 1 (i == n + 1) is out of range.
leaq (%r15,%r13), %r12
cmpq $1, %r12
je .LBB0_10
# %bb.11: # %L107.us
# in Loop: Header=BB0_9 Depth=2
# %rsi = i - (j-1), so it equals 1 exactly when i == j.
# %r8  = (i-1) + (j-1)*n, the index used when no swap is needed.
# %xmm0 = double at %r11 + (i-1)*8, the element of the first operand.
leaq (%rdx,%r15), %rsi
addq %r9, %r8
vmovsd -8(%r11,%r15,8), %xmm0 # xmm0 = mem[0],zero
cmpq $1, %rsi
jne .LBB0_12
# %bb.14: # %L109.us
# in Loop: Header=BB0_9 Depth=2
# Case i == j: the 32-bit field must be one of the two constants, otherwise
# control goes to .LBB0_18 (call to j_throw_uplo_1299).
cmpl $1426063360, %edi # imm = 0x55000000
je .LBB0_16
# %bb.15: # %L109.us
# in Loop: Header=BB0_9 Depth=2
cmpl $1275068416, %edi # imm = 0x4C000000
je .LBB0_16
jmp .LBB0_18
.p2align 4, 0x90
.LBB0_12: # %L125.us
# in Loop: Header=BB0_9 Depth=2
# Case i != j: %bl = (field == 0x55000000), %cl = (i < j, unsigned).
# When the two flags are equal keep %r8; otherwise replace it with %r10.
# %r10 starts each outer iteration at j-1 and gains n per inner iteration,
# so at this point it is (j-1) + (i-1)*n, the index with i and j swapped.
cmpl $1426063360, %edi # imm = 0x55000000
sete %bl
cmpq 24(%rsp), %r15 # 8-byte Folded Reload
setb %cl
cmpb %bl, %cl
je .LBB0_16
# %bb.13: # %L135.us
# in Loop: Header=BB0_9 Depth=2
movq %r10, %r8
.LBB0_16: # %L141.us
# in Loop: Header=BB0_9 Depth=2
# Compare %xmm0 with the double at %r14 + %r8*8 and add the 0/1 mask result
# to the count in %rax.  The inner loop ends when %r12 == 0, i.e. i == n.
vcmpeqsd (%r14,%r8,8), %xmm0, %k0
kmovw %k0, %ecx
addq %rcx, %rax
testq %r12, %r12
je .LBB0_19
# %bb.17: # %L73.us
# in Loop: Header=BB0_9 Depth=2
# Next i: %r8 takes the old i (the new i-1), %r10 advances by n.
movq %r15, %r8
incq %r15
addq 16(%rsp), %r10 # 8-byte Folded Reload
jmp .LBB0_9
.LBB0_20: # %L186.loopexit
# Loop exit: store n and %rbx into slots 32/40 and restore the spilled %r12
# that the epilogue needs.
movq 16(%rsp), %rcx # 8-byte Reload
movq %rcx, 32(%rsp)
movq %rbx, 40(%rsp)
movq 64(%rsp), %r12 # 8-byte Reload
.LBB0_21: # %L186
# Epilogue: write the value saved in slot 104 back to (%r12), undoing the link
# made at entry, then restore callee-saved registers and return %rax.
movq 104(%rsp), %rcx
movq %rcx, (%r12)
leaq -40(%rbp), %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
.cfi_def_cfa %rsp, 8
vzeroupper
retq
.LBB0_10:
.cfi_def_cfa %rbp, 16
# Out-of-range inner index: first word of the index buffer is n + 1.
movq 48(%rsp), %rax # 8-byte Reload
.LBB0_7: # %L104
# Fill the two-word buffer at slot 32 with (%rax, j) and call
# j_throw_boundserror_1300 with %rdi = saved second argument, %rsi = buffer.
# No code handles a return from this call; presumably the helper never returns.
movq %rax, 32(%rsp)
movq 24(%rsp), %rax # 8-byte Reload
movq %rax, 40(%rsp)
movabsq $j_throw_boundserror_1300, %rcx
leaq 32(%rsp), %rsi
movq 56(%rsp), %rdi # 8-byte Reload
vzeroupper
callq *%rcx
.LBB0_18: # %L122
# Reached when i == j and the 32-bit field matched neither constant.
movabsq $j_throw_uplo_1299, %rax
vzeroupper
callq *%rax
.LBB0_6:
# Out-of-range outer index: first word of the index buffer is 1.
movl $1, %eax
jmp .LBB0_7
.LBB0_22: # %L31
# Size-mismatch path.  Two 16-byte payloads are obtained from
# ijl_gc_pool_alloc, each tagged at offset -8 with the same constant and
# filled with one operand's size pair; they are stored in slots 120 and 112.
# A three-word array {constant, first object, second object} at slot 144 is
# then passed to ijl_invoke.  The ud2 after the call traps if it returns.
# NOTE(review): presumably this builds and throws a DimensionMismatch-style
# error from two size tuples - confirm; the absolute addresses are specific
# to the session that printed this listing.
movq 16(%r12), %rdi
movabsq $ijl_gc_pool_alloc, %r14
movl $1440, %esi # imm = 0x5A0
movl $32, %edx
vzeroupper
callq *%r14
movq %r14, %rcx
movq %rax, %r14
movabsq $140054640149440, %rax # imm = 0x7F61031343C0
movq %rax, -8(%r14)
movq 16(%rsp), %rax # 8-byte Reload
movq %rax, (%r14)
movq %rbx, 8(%r14)
movq %r14, 120(%rsp)
movq 16(%r12), %rdi
movl $1440, %esi # imm = 0x5A0
movl $32, %edx
callq *%rcx
movabsq $140054640149440, %rcx # imm = 0x7F61031343C0
movq %rcx, -8(%rax)
movq %r13, (%rax)
movq %r15, 8(%rax)
movq %rax, 112(%rsp)
movabsq $140054646876480, %rcx # imm = 0x7F610379E940
movq %rcx, 144(%rsp)
movq %r14, 152(%rsp)
movq %rax, 160(%rsp)
movabsq $ijl_invoke, %rax
movabsq $140054721237520, %rdi # imm = 0x7F6107E89210
leaq 144(%rsp), %rsi
movabsq $140054833311232, %rcx # imm = 0x7F610E96AE00
movl $3, %edx
callq *%rax
ud2
.Lfunc_end0:
.size julia_g_1297, .Lfunc_end0-julia_g_1297
.cfi_endproc
# -- End function
.section ".note.GNU-stack","",@progbits
julia> @code_native debuginfo=:none g(SM1, M1)
# Native code listing for the call g(SM1, M1); function symbol julia_g_1301.
# Syntax: GNU as, AT&T operand order (source first, destination last), x86-64 ELF.
# Uses AVX-512 mask registers (%k0-%k4) and extended vector registers
# (%ymm16-%ymm28).
#
# Registers on entry, as the code below uses them:
#   %rdi  first argument;  fields read: (%rdi) (a pointer) and the 32-bit value
#         at 8(%rdi); through that pointer: offset 0 and the 16 bytes at 24
#   %rsi  second argument; fields read: (%rsi), 24(%rsi), 32(%rsi)
# Result: %rax, a running count of element pairs that compared equal.
# NOTE(review): presumably %rdi is the wrapper SM1 and %rsi the plain matrix
# M1, with offsets 24/32 the dimensions and offset 0 the data pointer -
# confirm against the Julia object layouts.
#
# Locals are addressed through %rbx (a copy of the realigned %rsp) because the
# out-of-bounds block near the end moves %rsp again.  Slots relative to %rbx:
#   slot 16     saved %r13 (value loaded from 32(%rsi))
#   slot 24     %r8 + 96, where %r8 = (%rsi)
#   slot 32     4-byte copy of the field at 8(%rdi); reused as a 16-byte %xmm0
#               spill on the .LBB0_41 path
#   slot 56     saved %rsi (second argument)
#   slot 64     saved %r12
#   slot 72     unsigned min(n, n-1) + 1, with n = 24(%rsi)
#   slot 80     unsigned min(n, n-1)
#   slot 88     n - 1
#   slot 96..   32-byte zeroed record linked through (%r12); 112 and 120 are
#               written only on the .LBB0_41 path
#   slot 144..  three-word argument array for ijl_invoke
.text
.file "g"
# Constant pool.  .LCPI0_0 is sixteen bytes forming two little-endian 64-bit
# lanes with values 0 and 1; it seeds %xmm28, which the outer loop keeps as
# the pair (j-1, j).
.section .rodata.cst16,"aM",@progbits,16
.p2align 4 # -- Begin function julia_g_1301
.LCPI0_0:
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 1 # 0x1
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
# Initial index vectors for the vector loops: .LCPI0_1 = one-based {1,2,3,4}
# (loaded into %ymm26), .LCPI0_2 = zero-based {0,1,2,3} (loaded into %ymm27).
.section .rodata.cst32,"aM",@progbits,32
.p2align 5
.LCPI0_1:
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.LCPI0_2:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
# Scalars broadcast into %ymm3..%ymm6: lane offsets 4, 8, 12 for the second,
# third and fourth index vector, and the per-iteration step 16.
.section .rodata.cst8,"aM",@progbits,8
.p2align 3
.LCPI0_3:
.quad 4 # 0x4
.LCPI0_4:
.quad 8 # 0x8
.LCPI0_5:
.quad 12 # 0xc
.LCPI0_6:
.quad 16 # 0x10
.text
.globl julia_g_1301
.p2align 4, 0x90
.type julia_g_1301,@function
julia_g_1301: # @julia_g_1301
.cfi_startproc
# %bb.0: # %top
# Prologue: frame pointer in %rbp, save five callee-saved registers, realign
# %rsp to 32 bytes, reserve 192 bytes and keep the frame base in %rbx.
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
andq $-32, %rsp
subq $192, %rsp
movq %rsp, %rbx
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
# Zero the 32 bytes at slot 96.
vxorps %xmm0, %xmm0, %xmm0
vmovaps %ymm0, 96(%rbx)
#APP
movq %fs:0, %rax
#NO_APP
# %r12 = word stored 8 bytes below the thread pointer read from %fs:0.
# Then: slot 96 = 8, slot 104 = old (%r12), and (%r12) = address of slot 96.
# NOTE(review): this looks like a Julia GC-frame push - confirm.
movq -8(%rax), %r12
movq $8, 96(%rbx)
movq (%r12), %rax
movq %rax, 104(%rbx)
leaq 96(%rbx), %rax
movq %rax, (%r12)
# Load both size fields of the object behind (%rdi) as one 16-byte vector in
# %xmm0, and the second argument's size fields into %r14 (n) and %r13.
movq (%rdi), %rdx
vmovdqu 24(%rdx), %xmm0
movq 24(%rsi), %r14
movq 32(%rsi), %r13
# Low lane must equal %r14 and high lane must equal %r13, else .LBB0_41.
vmovq %xmm0, %rax
cmpq %r14, %rax
jne .LBB0_41
# %bb.1: # %top
vpextrq $1, %xmm0, %rax
cmpq %r13, %rax
jne .LBB0_41
# %bb.2: # %L53
# %k0 gets one bit per zero-valued size lane.  With the result preset to 0,
# continue to the loops only when no lane is zero; otherwise fall through to
# the epilogue.
movq %r13, 16(%rbx) # 8-byte Spill
movq %r12, 64(%rbx) # 8-byte Spill
vptestnmq %xmm0, %xmm0, %k0
xorl %eax, %eax
kortestb %k0, %k0
je .LBB0_3
.LBB0_32: # %L186
# Epilogue: write the value saved in slot 104 back through the saved %r12,
# undoing the link made at entry, then restore registers and return %rax.
movq 104(%rbx), %rcx
movq 64(%rbx), %rdx # 8-byte Reload
movq %rcx, (%rdx)
leaq -40(%rbp), %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
.cfi_def_cfa %rsp, 8
vzeroupper
retq
.LBB0_3: # %L73.preheader
.cfi_def_cfa %rbp, 16
# Loop setup.
#   %r10d = 32-bit field at 8(%rdi), compared below with 0x4C000000/0x55000000
#   %rdi  = first word of the object behind the first argument (gather base)
#   %r8   = (%rsi), base for contiguous loads from the second operand
#   %r9   = n + 1
#   slots 88/80/72 = n-1, unsigned min(n, n-1), and that minimum plus one
#   slot 24 = %r8 + 96, so the -96/-64/-32/0 displacements in the vector
#             loops address four consecutive 32-byte groups starting at %r8
#   %xmm28 = (0, 1), %ymm26/%ymm27 = initial index vectors,
#   %ymm3..%ymm6 = broadcast 4, 8, 12, 16, %ymm7 = broadcast n,
#   %xmm1 = all ones (used as -1 when stepping %xmm28), %rsi = saved %r13
# NOTE(review): the two constants are presumably the characters 'L' and 'U'
# in Julia's Char encoding, matching the j_throw_uplo helper - confirm.
movl 8(%rdi), %r10d
movq (%rdx), %rdi
movq %rsi, 56(%rbx) # 8-byte Spill
movq (%rsi), %r8
leaq 1(%r14), %r9
leaq -1(%r14), %rcx
cmpq %rcx, %r14
movq %rcx, 88(%rbx) # 8-byte Spill
cmovbq %r14, %rcx
movq %rcx, 80(%rbx) # 8-byte Spill
leaq 1(%rcx), %rcx
movq %rcx, 72(%rbx) # 8-byte Spill
leaq 96(%r8), %rcx
movq %rcx, 24(%rbx) # 8-byte Spill
movabsq $.LCPI0_0, %rcx
vmovdqa64 (%rcx), %xmm28
movabsq $.LCPI0_1, %rcx
vmovdqa64 (%rcx), %ymm26
movabsq $.LCPI0_2, %rcx
vmovdqa64 (%rcx), %ymm27
movabsq $.LCPI0_3, %rcx
vpbroadcastq (%rcx), %ymm3
movabsq $.LCPI0_4, %rcx
vpbroadcastq (%rcx), %ymm4
movabsq $.LCPI0_5, %rcx
vpbroadcastq (%rcx), %ymm5
movabsq $.LCPI0_6, %rcx
vpbroadcastq (%rcx), %ymm6
vpbroadcastq %r14, %ymm7
vpcmpeqd %xmm1, %xmm1, %xmm1
movq 16(%rbx), %rsi # 8-byte Reload
movl %r10d, 32(%rbx) # 4-byte Spill
jmp .LBB0_4
.p2align 4, 0x90
.LBB0_31: # %L154
# in Loop: Header=BB0_4 Depth=1
# Outer-loop latch: subtracting the all-ones %xmm1 adds 1 to both lanes of
# %xmm28; the loop ends when the finished j (%r15) equals %rsi.
vpsubq %xmm1, %xmm28, %xmm28
cmpq %rsi, %r15
je .LBB0_32
.LBB0_4: # %L106.lr.ph
# =>This Loop Header: Depth=1
# Child Loop BB0_10 Depth 2
# Child Loop BB0_12 Depth 2
# Child Loop BB0_21 Depth 2
# Child Loop BB0_23 Depth 2
# Outer-loop header.
#   %rdx = unsigned min(j-1, n, n-1)   (trip bound used on the .LBB0_6 path)
#   %r15 = j, %r13 = j-1, %r12 = n*(j-1)
# If the 32-bit field equals either constant go to .LBB0_16, else .LBB0_6.
vmovq %xmm28, %rdx
cmpq %rdx, %r14
cmovbq %r14, %rdx
movq 88(%rbx), %rcx # 8-byte Reload
cmpq %rcx, %rdx
cmovaeq %rcx, %rdx
vpextrq $1, %xmm28, %r15
leaq -1(%r15), %r13
movq %r14, %r12
imulq %r13, %r12
cmpl $1275068416, %r10d # imm = 0x4C000000
je .LBB0_16
# %bb.5: # %L106.lr.ph
# in Loop: Header=BB0_4 Depth=1
cmpl $1426063360, %r10d # imm = 0x55000000
jne .LBB0_6
.LBB0_16: # %L106.lr.ph.split.us
# in Loop: Header=BB0_4 Depth=1
# Field matched one of the constants.  Range-check j-1 against %rsi, then
# take the vector path when slot 80 is at least 16; otherwise start the
# scalar loop with %r10 = 1 (index i) and %r11 = 0 (i-1).
cmpq %rsi, %r13
jae .LBB0_17
# %bb.18: # %L106.us.us.preheader
# in Loop: Header=BB0_4 Depth=1
cmpq $16, 80(%rbx) # 8-byte Folded Reload
jae .LBB0_20
# %bb.19: # in Loop: Header=BB0_4 Depth=1
movl $1, %r10d
xorl %r11d, %r11d
jmp .LBB0_23
.p2align 4, 0x90
.LBB0_6: # %L106.lr.ph.L106.lr.ph.split_crit_edge
# in Loop: Header=BB0_4 Depth=1
# Field matched neither constant.  Same range check (failure: .LBB0_36), then
# the vector path when %rdx is at least 16; otherwise the scalar loop with
# %rsi = 1 (index i) and %rcx = 0 (i-1).
cmpq %rsi, %r13
jae .LBB0_36
# %bb.7: # %L106.us85.preheader
# in Loop: Header=BB0_4 Depth=1
cmpq $16, %rdx
jae .LBB0_9
# %bb.8: # in Loop: Header=BB0_4 Depth=1
movl $1, %esi
xorl %ecx, %ecx
jmp .LBB0_12
.p2align 4, 0x90
.LBB0_20: # %vector.ph334
# in Loop: Header=BB0_4 Depth=1
# Vector preheader (matched-field path).
#   %rdx = slot 72 (trip count); %rcx = %rdx mod 16, or 16 when that is zero,
#          so 1..16 elements are always left for the scalar loop
#   %r11 = element count handled by the vector loop
#   %r10 = %r11 + 1, first index for the scalar loop afterwards
#   %k0  = 0xFF when the field equals 0x55000000, else 0.  The flags from the
#          cmpl are consumed by the cmovel four instructions later; the
#          instructions in between do not write flags.
#   %ymm9 = running count in lane 0, %ymm13/%ymm15/%ymm16 = zeroed partial sums
#   %ymm10 = broadcast j, %ymm11 = broadcast n*(j-1), %ymm12 = broadcast j-1
#   %rax = slot 24 + 8*n*(j-1); %ymm14/%ymm17 = zero-/one-based index vectors
movq 72(%rbx), %rdx # 8-byte Reload
movl %edx, %ecx
andl $15, %ecx
testq %rcx, %rcx
movl $16, %esi
cmoveq %rsi, %rcx
movq %rdx, %r11
subq %rcx, %r11
negq %rcx
cmpl $1426063360, %r10d # imm = 0x55000000
leaq 1(%rdx,%rcx), %r10
vmovq %rax, %xmm9
vpermq $85, %ymm28, %ymm10 # ymm10 = ymm28[1,1,1,1]
movl $0, %eax
movl $255, %ecx
cmovel %ecx, %eax
kmovd %eax, %k0
vpbroadcastq %r12, %ymm11
vpbroadcastq %r13, %ymm12
movq 24(%rbx), %rax # 8-byte Reload
leaq (%rax,%r12,8), %rax
vpxor %xmm13, %xmm13, %xmm13
xorl %ecx, %ecx
vmovdqa64 %ymm27, %ymm14
vpxor %xmm15, %xmm15, %xmm15
vpxord %xmm16, %xmm16, %xmm16
vmovdqa64 %ymm26, %ymm17
.p2align 4, 0x90
.LBB0_21: # %vector.body332
# Parent Loop BB0_4 Depth=1
# => This Inner Loop Header: Depth=2
# Vector body, 16 elements per iteration as four groups of four lanes.
# For each group the gather index is chosen per lane:
#   default      %ymm11 + zero-based i          = n*(j-1) + (i-1)
#   swapped      %ymm12 + n * zero-based i      = (j-1) + n*(i-1)
# The swapped form is used in lanes where ((i < j) xor %k0) holds and i != j;
# lanes with i == j are explicitly reset to the default form.
# Each kxnorw produces an all-ones mask so every gather lane is loaded.
# The gathered doubles (base %rdi) are compared with four contiguous 32-byte
# loads at %rax + 8*%rcx; subtracting the all-ones compare result adds 1 per
# equal lane to %ymm9/%ymm13/%ymm15/%ymm16.
vpaddq %ymm3, %ymm14, %ymm0
vpaddq %ymm4, %ymm14, %ymm1
vpaddq %ymm5, %ymm14, %ymm2
vpaddq %ymm3, %ymm17, %ymm8
vpaddq %ymm4, %ymm17, %ymm18
vpaddq %ymm5, %ymm17, %ymm19
vpcmpltuq %ymm10, %ymm17, %k1
kxorw %k0, %k1, %k1
vpaddq %ymm11, %ymm14, %ymm20
vpmullq %ymm14, %ymm7, %ymm21
vpcmpneqq %ymm10, %ymm17, %k1 {%k1}
vmovdqa64 %ymm20, %ymm22
vpaddq %ymm21, %ymm12, %ymm22 {%k1}
vpcmpltuq %ymm10, %ymm8, %k1
vpaddq %ymm11, %ymm0, %ymm21
vpmullq %ymm0, %ymm7, %ymm0
kxorw %k0, %k1, %k1
vpcmpneqq %ymm10, %ymm8, %k1 {%k1}
vmovdqa64 %ymm21, %ymm23
vpaddq %ymm0, %ymm12, %ymm23 {%k1}
vpcmpltuq %ymm10, %ymm18, %k1
vpmullq %ymm1, %ymm7, %ymm0
kxorw %k0, %k1, %k1
vpaddq %ymm1, %ymm11, %ymm1
vpcmpneqq %ymm10, %ymm18, %k1 {%k1}
vmovdqa64 %ymm1, %ymm24
vpaddq %ymm0, %ymm12, %ymm24 {%k1}
vpcmpltuq %ymm10, %ymm19, %k1
kxorw %k0, %k1, %k1
vpaddq %ymm2, %ymm11, %ymm0
vpmullq %ymm2, %ymm7, %ymm2
vpcmpneqq %ymm10, %ymm19, %k1 {%k1}
vmovdqa64 %ymm0, %ymm25
vpaddq %ymm2, %ymm12, %ymm25 {%k1}
vpcmpeqq %ymm10, %ymm17, %k1
vmovdqa64 %ymm20, %ymm22 {%k1}
vpcmpeqq %ymm10, %ymm8, %k1
vmovdqa64 %ymm21, %ymm23 {%k1}
vpxor %xmm2, %xmm2, %xmm2
kxnorw %k0, %k0, %k1
vpxor %xmm8, %xmm8, %xmm8
vgatherqpd (%rdi,%ymm22,8), %ymm2 {%k1}
kxnorw %k0, %k0, %k1
vgatherqpd (%rdi,%ymm23,8), %ymm8 {%k1}
vpcmpeqq %ymm10, %ymm18, %k1
vmovdqa64 %ymm1, %ymm24 {%k1}
vpcmpeqq %ymm10, %ymm19, %k1
vmovdqa64 %ymm0, %ymm25 {%k1}
vpxor %xmm0, %xmm0, %xmm0
kxnorw %k0, %k0, %k1
vpxor %xmm1, %xmm1, %xmm1
vgatherqpd (%rdi,%ymm24,8), %ymm0 {%k1}
kxnorw %k0, %k0, %k1
vgatherqpd (%rdi,%ymm25,8), %ymm1 {%k1}
vcmpeqpd -96(%rax,%rcx,8), %ymm2, %ymm2
vcmpeqpd -64(%rax,%rcx,8), %ymm8, %ymm8
vpsubq %ymm2, %ymm9, %ymm9
vcmpeqpd -32(%rax,%rcx,8), %ymm0, %ymm0
vcmpeqpd (%rax,%rcx,8), %ymm1, %ymm1
vpsubq %ymm8, %ymm13, %ymm13
vpsubq %ymm0, %ymm15, %ymm15
vpsubq %ymm1, %ymm16, %ymm16
addq $16, %rcx
vpaddq %ymm6, %ymm14, %ymm14
vpaddq %ymm6, %ymm17, %ymm17
cmpq %rcx, %r11
jne .LBB0_21
# %bb.22: # %middle.block330
# in Loop: Header=BB0_4 Depth=1
# Reduce the four partial sums to a single scalar in %rax, then restore the
# two values the vector code overwrote: %xmm1 (all ones) and %rsi (slot 16).
vpaddq %ymm9, %ymm13, %ymm0
vpaddq %ymm0, %ymm15, %ymm0
vpaddq %ymm0, %ymm16, %ymm0
vextracti128 $1, %ymm0, %xmm1
vpaddq %xmm1, %xmm0, %xmm0
vpshufd $238, %xmm0, %xmm1 # xmm1 = xmm0[2,3,2,3]
vpaddq %xmm1, %xmm0, %xmm0
vmovq %xmm0, %rax
vpcmpeqd %xmm1, %xmm1, %xmm1
movq 16(%rbx), %rsi # 8-byte Reload
jmp .LBB0_23
.p2align 4, 0x90
.LBB0_33: # %L73.us.us
# in Loop: Header=BB0_23 Depth=2
# Next i: %r11 takes the old i (the new i-1).
movq %r10, %r11
incq %r10
.LBB0_23: # %L106.us.us
# Parent Loop BB0_4 Depth=1
# => This Inner Loop Header: Depth=2
# Scalar loop (matched-field path); %r10 = i, %r11 = i-1.
# When i == j the load address is %rdi + 8*((i-1) + n*(j-1)).
cmpq %r10, %r15
jne .LBB0_24
# %bb.27: # %L108.us.us
# in Loop: Header=BB0_23 Depth=2
leaq (%r11,%r12), %rcx
leaq (%rdi,%rcx,8), %rcx
jmp .LBB0_28
.p2align 4, 0x90
.LBB0_24: # %L124.us.us
# in Loop: Header=BB0_23 Depth=2
# Case i != j: %cl = (field == 0x55000000), %dl = (i < j, unsigned).
# Flags equal -> .LBB0_25 (same address form as above); flags different ->
# swapped address %rdi + 8*(n*(i-1) + (j-1)).
cmpl $1426063360, 32(%rbx) # 4-byte Folded Reload
# imm = 0x55000000
sete %cl
cmpq %r15, %r10
setb %dl
xorb %cl, %dl
je .LBB0_25
# %bb.26: # %L134.us.us
# in Loop: Header=BB0_23 Depth=2
movq %r14, %rcx
imulq %r11, %rcx
addq %r13, %rcx
leaq (%rdi,%rcx,8), %rcx
jmp .LBB0_28
.p2align 4, 0x90
.LBB0_25: # %L131.us.us
# in Loop: Header=BB0_23 Depth=2
leaq (%r11,%r12), %rcx
leaq (%rdi,%rcx,8), %rcx
.LBB0_28: # %L140.us.us
# in Loop: Header=BB0_23 Depth=2
# Range check for the second operand: i == n + 1 goes to the oob block.
cmpq %r10, %r9
je .LBB0_37
# %bb.29: # %idxend.us.us
# in Loop: Header=BB0_23 Depth=2
# Load the selected double, compare it with the double at
# %r8 + 8*((i-1) + n*(j-1)) and add the 0/1 mask result to %rax.
# The loop ends when i == n; %r10 is then reloaded with the 32-bit field
# because it was used as the row index inside this loop.
vmovsd (%rcx), %xmm0 # xmm0 = mem[0],zero
addq %r12, %r11
vcmpeqsd (%r8,%r11,8), %xmm0, %k0
kmovw %k0, %ecx
addq %rcx, %rax
cmpq %r10, %r14
jne .LBB0_33
# %bb.30: # in Loop: Header=BB0_4 Depth=1
movl 32(%rbx), %r10d # 4-byte Reload
jmp .LBB0_31
.LBB0_9: # %vector.ph
# in Loop: Header=BB0_4 Depth=1
# Vector preheader (field matched neither constant).
#   %rdx + 1 = trip count; %rsi = that count mod 16, or 16 when zero
#   %rcx = element count handled by the vector loop
#   %rsi = %rcx + 1 afterwards, first index for the scalar loop
# Remaining setup mirrors .LBB0_20 except that no %k0 mask is prepared.
incq %rdx
movl %edx, %esi
andl $15, %esi
testq %rsi, %rsi
movl $16, %ecx
cmoveq %rcx, %rsi
movq %rdx, %rcx
subq %rsi, %rcx
negq %rsi
addq %rdx, %rsi
incq %rsi
vmovq %rax, %xmm9
vpermq $85, %ymm28, %ymm10 # ymm10 = ymm28[1,1,1,1]
vpbroadcastq %r12, %ymm11
vpbroadcastq %r13, %ymm12
movq 24(%rbx), %rax # 8-byte Reload
leaq (%rax,%r12,8), %rax
vpxor %xmm13, %xmm13, %xmm13
xorl %edx, %edx
vmovdqa64 %ymm27, %ymm14
vpxor %xmm15, %xmm15, %xmm15
vpxord %xmm16, %xmm16, %xmm16
vmovdqa64 %ymm26, %ymm17
.p2align 4, 0x90
.LBB0_10: # %vector.body
# Parent Loop BB0_4 Depth=1
# => This Inner Loop Header: Depth=2
# Vector body, 16 elements per iteration.  Masks %k3/%k4/%k2/%k1 mark lanes
# with one-based i < j (unsigned) for the four groups; those lanes use the
# swapped index (j-1) + n*(i-1), all others n*(j-1) + (i-1).  The masks are
# then overwritten with all ones for the gathers.  Compare and accumulate as
# in the other vector loop, with sums in %ymm9/%ymm13/%ymm15/%ymm16.
vpaddq %ymm3, %ymm14, %ymm18
vpaddq %ymm4, %ymm14, %ymm19
vpaddq %ymm5, %ymm14, %ymm20
vpaddq %ymm3, %ymm17, %ymm21
vpaddq %ymm4, %ymm17, %ymm22
vpaddq %ymm5, %ymm17, %ymm23
vpcmpltuq %ymm10, %ymm17, %k3
vpcmpltuq %ymm10, %ymm21, %k4
vpcmpltuq %ymm10, %ymm22, %k2
vpcmpltuq %ymm10, %ymm23, %k1
vpaddq %ymm11, %ymm14, %ymm21
vpaddq %ymm11, %ymm18, %ymm22
vpmullq %ymm14, %ymm7, %ymm23
vpaddq %ymm11, %ymm19, %ymm24
vpmullq %ymm18, %ymm7, %ymm18
vpmullq %ymm19, %ymm7, %ymm19
vpaddq %ymm11, %ymm20, %ymm25
vpmullq %ymm20, %ymm7, %ymm20
vpaddq %ymm23, %ymm12, %ymm21 {%k3}
vpaddq %ymm18, %ymm12, %ymm22 {%k4}
vpaddq %ymm19, %ymm12, %ymm24 {%k2}
vxorpd %xmm8, %xmm8, %xmm8
kxnorw %k0, %k0, %k2
vpxor %xmm1, %xmm1, %xmm1
kxnorw %k0, %k0, %k3
vpaddq %ymm20, %ymm12, %ymm25 {%k1}
vxorpd %xmm2, %xmm2, %xmm2
vgatherqpd (%rdi,%ymm21,8), %ymm8 {%k2}
kxnorw %k0, %k0, %k1
vgatherqpd (%rdi,%ymm22,8), %ymm1 {%k3}
vpxor %xmm0, %xmm0, %xmm0
vgatherqpd (%rdi,%ymm24,8), %ymm2 {%k1}
kxnorw %k0, %k0, %k1
vgatherqpd (%rdi,%ymm25,8), %ymm0 {%k1}
vcmpeqpd -96(%rax,%rdx,8), %ymm8, %ymm8
vcmpeqpd -64(%rax,%rdx,8), %ymm1, %ymm1
vpsubq %ymm8, %ymm9, %ymm9
vcmpeqpd -32(%rax,%rdx,8), %ymm2, %ymm2
vcmpeqpd (%rax,%rdx,8), %ymm0, %ymm0
vpsubq %ymm1, %ymm13, %ymm13
vpsubq %ymm2, %ymm15, %ymm15
vpsubq %ymm0, %ymm16, %ymm16
addq $16, %rdx
vpaddq %ymm6, %ymm14, %ymm14
vpaddq %ymm6, %ymm17, %ymm17
cmpq %rdx, %rcx
jne .LBB0_10
# %bb.11: # %middle.block
# in Loop: Header=BB0_4 Depth=1
# Reduce the four partial sums to a scalar in %rax and restore %xmm1 to all
# ones.
vpaddq %ymm9, %ymm13, %ymm0
vpaddq %ymm0, %ymm15, %ymm0
vpaddq %ymm0, %ymm16, %ymm0
vextracti128 $1, %ymm0, %xmm1
vpaddq %xmm1, %xmm0, %xmm0
vpshufd $238, %xmm0, %xmm1 # xmm1 = xmm0[2,3,2,3]
vpaddq %xmm1, %xmm0, %xmm0
vmovq %xmm0, %rax
vpcmpeqd %xmm1, %xmm1, %xmm1
jmp .LBB0_12
.p2align 4, 0x90
.LBB0_34: # %L73.us103
# in Loop: Header=BB0_12 Depth=2
# Next i: %rcx takes the old i (the new i-1).
movq %rsi, %rcx
incq %rsi
.LBB0_12: # %L106.us85
# Parent Loop BB0_4 Depth=1
# => This Inner Loop Header: Depth=2
# Scalar loop (unmatched-field path); %rsi = i, %rcx = i-1.
# Reaching i == j goes to .LBB0_35 (call to j_throw_uplo_1303).
cmpq %rsi, %r15
je .LBB0_35
# %bb.13: # %L124.us91
# in Loop: Header=BB0_12 Depth=2
# Range check: i == n + 1 goes to the oob block.
cmpq %rsi, %r9
je .LBB0_37
# %bb.14: # %idxend.us100
# in Loop: Header=BB0_12 Depth=2
# %rcx = (i-1) + n*(j-1); %rdx = n*(i-1) + (j-1), replaced by %rcx when
# i >= j (unsigned).  Load through %rdi at %rdx, compare with the double at
# %r8 + 8*%rcx, add the 0/1 mask result to %rax.  The loop ends when i == n,
# after which %rsi is reloaded from slot 16 for the outer-loop latch.
movq %r14, %rdx
imulq %rcx, %rdx
addq %r12, %rcx
addq %r13, %rdx
cmpq %r15, %rsi
cmovaeq %rcx, %rdx
vmovsd (%rdi,%rdx,8), %xmm0 # xmm0 = mem[0],zero
vcmpeqsd (%r8,%rcx,8), %xmm0, %k0
kmovw %k0, %ecx
addq %rcx, %rax
cmpq %rsi, %r14
jne .LBB0_34
# %bb.15: # in Loop: Header=BB0_4 Depth=1
movq 16(%rbx), %rsi # 8-byte Reload
jmp .LBB0_31
.LBB0_17:
# Outer index out of range on the matched-field path: report first index 1.
movl $1, %r9d
.LBB0_37: # %oob
# Out-of-bounds report.  Carve 16 bytes below %rsp for a two-word buffer:
# %rax = %rsp - 16 is the target; the loop touches (%rsp) and lowers %rsp by
# 4096 while the target is still below it, then %rsp is set to the target.
# The buffer receives (%r9, %r15) and is passed with %rdi = saved second
# argument, %rsi = buffer, %edx = 2 to ijl_bounds_error_ints.
# No code handles a return from this call; presumably it never returns.
movq %rsp, %rax
movl $16, %ecx
subq %rcx, %rax
movq 56(%rbx), %rdi # 8-byte Reload
cmpq %rsp, %rax
jge .LBB0_40
.LBB0_39: # %oob
# =>This Inner Loop Header: Depth=1
xorq $0, (%rsp)
subq $4096, %rsp # imm = 0x1000
cmpq %rsp, %rax
jl .LBB0_39
.LBB0_40: # %oob
movq %rax, %rsp
movq %r9, (%rax)
movq %r15, 8(%rax)
movabsq $ijl_bounds_error_ints, %rcx
movl $2, %edx
movq %rax, %rsi
vzeroupper
callq *%rcx
.LBB0_36: # %L106.lr.ph.split.L106.lr.ph.split.split_crit_edge
# Outer index out of range on the unmatched-field path: when j != 1 report
# out-of-bounds with first index 1, otherwise fall into the uplo error.
movl $1, %r9d
cmpq $1, %r15
jne .LBB0_37
.LBB0_35: # %L121
movabsq $j_throw_uplo_1303, %rax
vzeroupper
callq *%rax
.LBB0_41: # %L31
# Size-mismatch path.  %xmm0 (the first operand's size pair) is spilled to
# slot 32 across the first ijl_gc_pool_alloc call and then stored into the
# new object; a second object receives %r14 and %r13.  Both are tagged at
# offset -8 with the same constant and stored in slots 120 and 112.  A
# three-word array {constant, first object, second object} at slot 144 is
# passed to ijl_invoke; the ud2 after the call traps if it returns.
# NOTE(review): presumably this builds and throws a DimensionMismatch-style
# error from two size tuples - confirm; the absolute addresses are specific
# to the session that printed this listing.
movq 16(%r12), %rdi
movabsq $ijl_gc_pool_alloc, %r15
movl $1440, %esi # imm = 0x5A0
movl $32, %edx
vmovdqa %xmm0, 32(%rbx) # 16-byte Spill
vzeroupper
callq *%r15
movq %r15, %rcx
movq %rax, %r15
movabsq $140054640149440, %rax # imm = 0x7F61031343C0
movq %rax, -8(%r15)
vmovaps 32(%rbx), %xmm0 # 16-byte Reload
vmovups %xmm0, (%r15)
movq %r15, 120(%rbx)
movq 16(%r12), %rdi
movl $1440, %esi # imm = 0x5A0
movl $32, %edx
callq *%rcx
movabsq $140054640149440, %rcx # imm = 0x7F61031343C0
movq %rcx, -8(%rax)
movq %r14, (%rax)
movq %r13, 8(%rax)
movq %rax, 112(%rbx)
movabsq $140054646876480, %rcx # imm = 0x7F610379E940
movq %rcx, 144(%rbx)
movq %r15, 152(%rbx)
movq %rax, 160(%rbx)
movabsq $ijl_invoke, %rax
movabsq $140054721237520, %rdi # imm = 0x7F6107E89210
leaq 144(%rbx), %rsi
movabsq $140054833311232, %rcx # imm = 0x7F610E96AE00
movl $3, %edx
callq *%rax
ud2
.Lfunc_end0:
.size julia_g_1301, .Lfunc_end0-julia_g_1301
.cfi_endproc
# -- End function
.section ".note.GNU-stack","",@progbits