Performance difference depending on order of comparison in loop

Unexpectedly, the following function performs very differently depending on the order in which its two matrix arguments are passed, even though both orders should do equivalent work:

# Count the positions at which `x` and `y` hold equal elements, starting the
# count at `init` (0 by default) and returning the final total.
function f2(x, y; init=0)
    total = init
    # `eachindex(x, y)` yields one set of indices that is used to index both arrays.
    for i in eachindex(x, y)
        # The result of `==` is added directly to the total; a `true` adds 1, a `false` adds 0.
        total += x[i] == y[i]
    end
    return total
end

The two arguments are

using LinearAlgebra  # provides `Symmetric`, which Base does not export

# M1 is a random 100×100 matrix; SM1 wraps M1 as a symmetric matrix
# (by default `Symmetric` reads from the upper triangle of its parent).
M1 = rand(100, 100); SM1 = Symmetric(M1);

My benchmarks give

julia> @btime f2($M1, $SM1);
  12.600 μs (0 allocations: 0 bytes)

julia> @btime f2($SM1, $M1);
  4.357 μs (0 allocations: 0 bytes)

julia> versioninfo()
Julia Version 1.9.4
Commit 8e5136fa29 (2023-11-14 08:46 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Windows (x86_64-w64-mingw32)
  CPU: 8 × 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-14.0.6 (ORCJIT, tigerlake)
  Threads: 8 on 8 virtual cores
Environment:
  JULIA_CONDAPKG_BACKEND = Null
  JULIA_NUM_THREADS = auto

Since eachindex(x, y) gives the same CartesianIndices((100, 100)) for either argument order, I think the arrays are traversed in the same order, so I don’t understand why there’s such a significant performance differential. Can anyone shed some light on this?

Weirdly enough, on my laptop the second call, f2(SM1, M1), is marginally slower:

julia> @btime f2($M1, $SM1);
  33.884 μs (0 allocations: 0 bytes)

julia> @btime f2($SM1, $M1);
  37.717 μs (0 allocations: 0 bytes)

I get

julia> @btime f2($M1, $SM1);
  15.900 μs (0 allocations: 0 bytes)

julia> @btime f2($SM1, $M1);
  23.700 μs (0 allocations: 0 bytes)

julia> versioninfo()
Julia Version 1.9.1
Commit 147bdf428c (2023-06-07 08:27 UTC)
Platform Info:
  OS: Windows (x86_64-w64-mingw32)
  CPU: 4 × Intel(R) Core(TM) i5-4670K CPU @ 3.40GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-14.0.6 (ORCJIT, haswell)
  Threads: 1 on 4 virtual cores

My benchmark was on Windows; was yours too? On Fedora, on the same machine, the results are reversed: the second call, f2(SM1, M1), is now the slower of the two, and it is much slower than it was on Windows.

julia> @btime f2($M1, $SM1);
  12.582 μs (0 allocations: 0 bytes)

julia> @btime f2($SM1, $M1);
  16.939 μs (0 allocations: 0 bytes)

EDIT: I tried @code_native on f2 on Fedora, but the output just showed a call to another function, so I simplified f2 to the function g below, which drops the init keyword argument.

The native code for g(SM1, M1) seems to use a lot more vector instructions than the native code for g(M1, SM1).

julia> function g(x, y)
           # Count the positions at which `x` and `y` hold equal elements; the count starts at 0.
           total = 0
           # `eachindex(x, y)` yields one set of indices that is used to index both arrays.
           for i in eachindex(x, y)
              # The result of `==` is added directly to the total; a `true` adds 1, a `false` adds 0.
              total += x[i] == y[i]
           end
           return total
       end

julia> versioninfo()
Julia Version 1.9.4
Commit 8e5136fa297 (2023-11-14 08:46 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 8 × 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-14.0.6 (ORCJIT, tigerlake)
  Threads: 8 on 8 virtual cores
Environment:
  JULIA_NUM_THREADS = auto

Output of @code_native debuginfo=:none g(M1, SM1):

julia> @code_native debuginfo=:none g(M1, SM1)
	.text
	.file	"g"
	.globl	julia_g_1297                    # -- Begin function julia_g_1297
	.p2align	4, 0x90
	.type	julia_g_1297,@function
julia_g_1297:                           # @julia_g_1297
	.cfi_startproc
# %bb.0:                                # %top
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%rbx
	andq	$-32, %rsp
	subq	$192, %rsp
	.cfi_offset %rbx, -56
	.cfi_offset %r12, -48
	.cfi_offset %r13, -40
	.cfi_offset %r14, -32
	.cfi_offset %r15, -24
	vxorpd	%xmm0, %xmm0, %xmm0
	vmovapd	%ymm0, 96(%rsp)
	#APP
	movq	%fs:0, %rax
	#NO_APP
	movq	-8(%rax), %r12
	movq	$8, 96(%rsp)
	movq	(%r12), %rax
	movq	%rax, 104(%rsp)
	leaq	96(%rsp), %rax
	movq	%rax, (%r12)
	movq	24(%rdi), %rax
	movq	32(%rdi), %rbx
	movq	(%rsi), %rdx
	movq	24(%rdx), %r13
	movq	32(%rdx), %r15
	movq	%rax, 16(%rsp)                  # 8-byte Spill
	cmpq	%r13, %rax
	jne	.LBB0_22
# %bb.1:                                # %top
	cmpq	%r15, %rbx
	jne	.LBB0_22
# %bb.2:                                # %L53
	xorl	%eax, %eax
	cmpq	$0, 16(%rsp)                    # 8-byte Folded Reload
	je	.LBB0_21
# %bb.3:                                # %L53
	testq	%rbx, %rbx
	je	.LBB0_21
# %bb.4:                                # %L73.preheader
	movq	%r12, 64(%rsp)                  # 8-byte Spill
	movq	(%rdi), %r11
	movq	%rsi, 56(%rsp)                  # 8-byte Spill
	movl	8(%rsi), %edi
	movq	(%rdx), %r14
	movq	16(%rsp), %r13                  # 8-byte Reload
	leaq	1(%r13), %rax
	movq	%rax, 48(%rsp)                  # 8-byte Spill
	leaq	(,%r13,8), %rax
	movq	%rax, 72(%rsp)                  # 8-byte Spill
	negq	%r13
	xorl	%edx, %edx
	xorl	%r10d, %r10d
	movl	$1, %ecx
	xorl	%eax, %eax
	movq	%rbx, 80(%rsp)                  # 8-byte Spill
	jmp	.LBB0_5
	.p2align	4, 0x90
.LBB0_19:                               # %L154
                                        #   in Loop: Header=BB0_5 Depth=1
	movq	24(%rsp), %rsi                  # 8-byte Reload
	leaq	1(%rsi), %rcx
	movq	88(%rsp), %r10                  # 8-byte Reload
	incq	%r10
	addq	72(%rsp), %r11                  # 8-byte Folded Reload
	decq	%rdx
	movq	80(%rsp), %rbx                  # 8-byte Reload
	cmpq	%rbx, %rsi
	je	.LBB0_20
.LBB0_5:                                # %ib.lr.ph.split.us
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_9 Depth 2
	movq	%rcx, 24(%rsp)                  # 8-byte Spill
	leaq	-1(%rcx), %r9
	cmpq	%rbx, %r9
	jae	.LBB0_6
# %bb.8:                                # %idxend.us.preheader
                                        #   in Loop: Header=BB0_5 Depth=1
	imulq	16(%rsp), %r9                   # 8-byte Folded Reload
	movl	$1, %r15d
	movq	%r10, 88(%rsp)                  # 8-byte Spill
	xorl	%r8d, %r8d
	.p2align	4, 0x90
.LBB0_9:                                # %idxend.us
                                        #   Parent Loop BB0_5 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
	leaq	(%r15,%r13), %r12
	cmpq	$1, %r12
	je	.LBB0_10
# %bb.11:                               # %L107.us
                                        #   in Loop: Header=BB0_9 Depth=2
	leaq	(%rdx,%r15), %rsi
	addq	%r9, %r8
	vmovsd	-8(%r11,%r15,8), %xmm0          # xmm0 = mem[0],zero
	cmpq	$1, %rsi
	jne	.LBB0_12
# %bb.14:                               # %L109.us
                                        #   in Loop: Header=BB0_9 Depth=2
	cmpl	$1426063360, %edi               # imm = 0x55000000
	je	.LBB0_16
# %bb.15:                               # %L109.us
                                        #   in Loop: Header=BB0_9 Depth=2
	cmpl	$1275068416, %edi               # imm = 0x4C000000
	je	.LBB0_16
	jmp	.LBB0_18
	.p2align	4, 0x90
.LBB0_12:                               # %L125.us
                                        #   in Loop: Header=BB0_9 Depth=2
	cmpl	$1426063360, %edi               # imm = 0x55000000
	sete	%bl
	cmpq	24(%rsp), %r15                  # 8-byte Folded Reload
	setb	%cl
	cmpb	%bl, %cl
	je	.LBB0_16
# %bb.13:                               # %L135.us
                                        #   in Loop: Header=BB0_9 Depth=2
	movq	%r10, %r8
.LBB0_16:                               # %L141.us
                                        #   in Loop: Header=BB0_9 Depth=2
	vcmpeqsd	(%r14,%r8,8), %xmm0, %k0
	kmovw	%k0, %ecx
	addq	%rcx, %rax
	testq	%r12, %r12
	je	.LBB0_19
# %bb.17:                               # %L73.us
                                        #   in Loop: Header=BB0_9 Depth=2
	movq	%r15, %r8
	incq	%r15
	addq	16(%rsp), %r10                  # 8-byte Folded Reload
	jmp	.LBB0_9
.LBB0_20:                               # %L186.loopexit
	movq	16(%rsp), %rcx                  # 8-byte Reload
	movq	%rcx, 32(%rsp)
	movq	%rbx, 40(%rsp)
	movq	64(%rsp), %r12                  # 8-byte Reload
.LBB0_21:                               # %L186
	movq	104(%rsp), %rcx
	movq	%rcx, (%r12)
	leaq	-40(%rbp), %rsp
	popq	%rbx
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	popq	%rbp
	.cfi_def_cfa %rsp, 8
	vzeroupper
	retq
.LBB0_10:
	.cfi_def_cfa %rbp, 16
	movq	48(%rsp), %rax                  # 8-byte Reload
.LBB0_7:                                # %L104
	movq	%rax, 32(%rsp)
	movq	24(%rsp), %rax                  # 8-byte Reload
	movq	%rax, 40(%rsp)
	movabsq	$j_throw_boundserror_1300, %rcx
	leaq	32(%rsp), %rsi
	movq	56(%rsp), %rdi                  # 8-byte Reload
	vzeroupper
	callq	*%rcx
.LBB0_18:                               # %L122
	movabsq	$j_throw_uplo_1299, %rax
	vzeroupper
	callq	*%rax
.LBB0_6:
	movl	$1, %eax
	jmp	.LBB0_7
.LBB0_22:                               # %L31
	movq	16(%r12), %rdi
	movabsq	$ijl_gc_pool_alloc, %r14
	movl	$1440, %esi                     # imm = 0x5A0
	movl	$32, %edx
	vzeroupper
	callq	*%r14
	movq	%r14, %rcx
	movq	%rax, %r14
	movabsq	$140054640149440, %rax          # imm = 0x7F61031343C0
	movq	%rax, -8(%r14)
	movq	16(%rsp), %rax                  # 8-byte Reload
	movq	%rax, (%r14)
	movq	%rbx, 8(%r14)
	movq	%r14, 120(%rsp)
	movq	16(%r12), %rdi
	movl	$1440, %esi                     # imm = 0x5A0
	movl	$32, %edx
	callq	*%rcx
	movabsq	$140054640149440, %rcx          # imm = 0x7F61031343C0
	movq	%rcx, -8(%rax)
	movq	%r13, (%rax)
	movq	%r15, 8(%rax)
	movq	%rax, 112(%rsp)
	movabsq	$140054646876480, %rcx          # imm = 0x7F610379E940
	movq	%rcx, 144(%rsp)
	movq	%r14, 152(%rsp)
	movq	%rax, 160(%rsp)
	movabsq	$ijl_invoke, %rax
	movabsq	$140054721237520, %rdi          # imm = 0x7F6107E89210
	leaq	144(%rsp), %rsi
	movabsq	$140054833311232, %rcx          # imm = 0x7F610E96AE00
	movl	$3, %edx
	callq	*%rax
	ud2
.Lfunc_end0:
	.size	julia_g_1297, .Lfunc_end0-julia_g_1297
	.cfi_endproc
                                        # -- End function
	.section	".note.GNU-stack","",@progbits

Output of @code_native debuginfo=:none g(SM1, M1):

julia> @code_native debuginfo=:none g(SM1, M1)
	.text
	.file	"g"
	.section	.rodata.cst16,"aM",@progbits,16
	.p2align	4                               # -- Begin function julia_g_1301
.LCPI0_0:
	.byte	0                               # 0x0
	.byte	0                               # 0x0
	.byte	0                               # 0x0
	.byte	0                               # 0x0
	.byte	0                               # 0x0
	.byte	0                               # 0x0
	.byte	0                               # 0x0
	.byte	0                               # 0x0
	.byte	1                               # 0x1
	.byte	0                               # 0x0
	.byte	0                               # 0x0
	.byte	0                               # 0x0
	.byte	0                               # 0x0
	.byte	0                               # 0x0
	.byte	0                               # 0x0
	.byte	0                               # 0x0
	.section	.rodata.cst32,"aM",@progbits,32
	.p2align	5
.LCPI0_1:
	.quad	1                               # 0x1
	.quad	2                               # 0x2
	.quad	3                               # 0x3
	.quad	4                               # 0x4
.LCPI0_2:
	.quad	0                               # 0x0
	.quad	1                               # 0x1
	.quad	2                               # 0x2
	.quad	3                               # 0x3
	.section	.rodata.cst8,"aM",@progbits,8
	.p2align	3
.LCPI0_3:
	.quad	4                               # 0x4
.LCPI0_4:
	.quad	8                               # 0x8
.LCPI0_5:
	.quad	12                              # 0xc
.LCPI0_6:
	.quad	16                              # 0x10
	.text
	.globl	julia_g_1301
	.p2align	4, 0x90
	.type	julia_g_1301,@function
julia_g_1301:                           # @julia_g_1301
	.cfi_startproc
# %bb.0:                                # %top
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%rbx
	andq	$-32, %rsp
	subq	$192, %rsp
	movq	%rsp, %rbx
	.cfi_offset %rbx, -56
	.cfi_offset %r12, -48
	.cfi_offset %r13, -40
	.cfi_offset %r14, -32
	.cfi_offset %r15, -24
	vxorps	%xmm0, %xmm0, %xmm0
	vmovaps	%ymm0, 96(%rbx)
	#APP
	movq	%fs:0, %rax
	#NO_APP
	movq	-8(%rax), %r12
	movq	$8, 96(%rbx)
	movq	(%r12), %rax
	movq	%rax, 104(%rbx)
	leaq	96(%rbx), %rax
	movq	%rax, (%r12)
	movq	(%rdi), %rdx
	vmovdqu	24(%rdx), %xmm0
	movq	24(%rsi), %r14
	movq	32(%rsi), %r13
	vmovq	%xmm0, %rax
	cmpq	%r14, %rax
	jne	.LBB0_41
# %bb.1:                                # %top
	vpextrq	$1, %xmm0, %rax
	cmpq	%r13, %rax
	jne	.LBB0_41
# %bb.2:                                # %L53
	movq	%r13, 16(%rbx)                  # 8-byte Spill
	movq	%r12, 64(%rbx)                  # 8-byte Spill
	vptestnmq	%xmm0, %xmm0, %k0
	xorl	%eax, %eax
	kortestb	%k0, %k0
	je	.LBB0_3
.LBB0_32:                               # %L186
	movq	104(%rbx), %rcx
	movq	64(%rbx), %rdx                  # 8-byte Reload
	movq	%rcx, (%rdx)
	leaq	-40(%rbp), %rsp
	popq	%rbx
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	popq	%rbp
	.cfi_def_cfa %rsp, 8
	vzeroupper
	retq
.LBB0_3:                                # %L73.preheader
	.cfi_def_cfa %rbp, 16
	movl	8(%rdi), %r10d
	movq	(%rdx), %rdi
	movq	%rsi, 56(%rbx)                  # 8-byte Spill
	movq	(%rsi), %r8
	leaq	1(%r14), %r9
	leaq	-1(%r14), %rcx
	cmpq	%rcx, %r14
	movq	%rcx, 88(%rbx)                  # 8-byte Spill
	cmovbq	%r14, %rcx
	movq	%rcx, 80(%rbx)                  # 8-byte Spill
	leaq	1(%rcx), %rcx
	movq	%rcx, 72(%rbx)                  # 8-byte Spill
	leaq	96(%r8), %rcx
	movq	%rcx, 24(%rbx)                  # 8-byte Spill
	movabsq	$.LCPI0_0, %rcx
	vmovdqa64	(%rcx), %xmm28
	movabsq	$.LCPI0_1, %rcx
	vmovdqa64	(%rcx), %ymm26
	movabsq	$.LCPI0_2, %rcx
	vmovdqa64	(%rcx), %ymm27
	movabsq	$.LCPI0_3, %rcx
	vpbroadcastq	(%rcx), %ymm3
	movabsq	$.LCPI0_4, %rcx
	vpbroadcastq	(%rcx), %ymm4
	movabsq	$.LCPI0_5, %rcx
	vpbroadcastq	(%rcx), %ymm5
	movabsq	$.LCPI0_6, %rcx
	vpbroadcastq	(%rcx), %ymm6
	vpbroadcastq	%r14, %ymm7
	vpcmpeqd	%xmm1, %xmm1, %xmm1
	movq	16(%rbx), %rsi                  # 8-byte Reload
	movl	%r10d, 32(%rbx)                 # 4-byte Spill
	jmp	.LBB0_4
	.p2align	4, 0x90
.LBB0_31:                               # %L154
                                        #   in Loop: Header=BB0_4 Depth=1
	vpsubq	%xmm1, %xmm28, %xmm28
	cmpq	%rsi, %r15
	je	.LBB0_32
.LBB0_4:                                # %L106.lr.ph
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_10 Depth 2
                                        #     Child Loop BB0_12 Depth 2
                                        #     Child Loop BB0_21 Depth 2
                                        #     Child Loop BB0_23 Depth 2
	vmovq	%xmm28, %rdx
	cmpq	%rdx, %r14
	cmovbq	%r14, %rdx
	movq	88(%rbx), %rcx                  # 8-byte Reload
	cmpq	%rcx, %rdx
	cmovaeq	%rcx, %rdx
	vpextrq	$1, %xmm28, %r15
	leaq	-1(%r15), %r13
	movq	%r14, %r12
	imulq	%r13, %r12
	cmpl	$1275068416, %r10d              # imm = 0x4C000000
	je	.LBB0_16
# %bb.5:                                # %L106.lr.ph
                                        #   in Loop: Header=BB0_4 Depth=1
	cmpl	$1426063360, %r10d              # imm = 0x55000000
	jne	.LBB0_6
.LBB0_16:                               # %L106.lr.ph.split.us
                                        #   in Loop: Header=BB0_4 Depth=1
	cmpq	%rsi, %r13
	jae	.LBB0_17
# %bb.18:                               # %L106.us.us.preheader
                                        #   in Loop: Header=BB0_4 Depth=1
	cmpq	$16, 80(%rbx)                   # 8-byte Folded Reload
	jae	.LBB0_20
# %bb.19:                               #   in Loop: Header=BB0_4 Depth=1
	movl	$1, %r10d
	xorl	%r11d, %r11d
	jmp	.LBB0_23
	.p2align	4, 0x90
.LBB0_6:                                # %L106.lr.ph.L106.lr.ph.split_crit_edge
                                        #   in Loop: Header=BB0_4 Depth=1
	cmpq	%rsi, %r13
	jae	.LBB0_36
# %bb.7:                                # %L106.us85.preheader
                                        #   in Loop: Header=BB0_4 Depth=1
	cmpq	$16, %rdx
	jae	.LBB0_9
# %bb.8:                                #   in Loop: Header=BB0_4 Depth=1
	movl	$1, %esi
	xorl	%ecx, %ecx
	jmp	.LBB0_12
	.p2align	4, 0x90
.LBB0_20:                               # %vector.ph334
                                        #   in Loop: Header=BB0_4 Depth=1
	movq	72(%rbx), %rdx                  # 8-byte Reload
	movl	%edx, %ecx
	andl	$15, %ecx
	testq	%rcx, %rcx
	movl	$16, %esi
	cmoveq	%rsi, %rcx
	movq	%rdx, %r11
	subq	%rcx, %r11
	negq	%rcx
	cmpl	$1426063360, %r10d              # imm = 0x55000000
	leaq	1(%rdx,%rcx), %r10
	vmovq	%rax, %xmm9
	vpermq	$85, %ymm28, %ymm10             # ymm10 = ymm28[1,1,1,1]
	movl	$0, %eax
	movl	$255, %ecx
	cmovel	%ecx, %eax
	kmovd	%eax, %k0
	vpbroadcastq	%r12, %ymm11
	vpbroadcastq	%r13, %ymm12
	movq	24(%rbx), %rax                  # 8-byte Reload
	leaq	(%rax,%r12,8), %rax
	vpxor	%xmm13, %xmm13, %xmm13
	xorl	%ecx, %ecx
	vmovdqa64	%ymm27, %ymm14
	vpxor	%xmm15, %xmm15, %xmm15
	vpxord	%xmm16, %xmm16, %xmm16
	vmovdqa64	%ymm26, %ymm17
	.p2align	4, 0x90
.LBB0_21:                               # %vector.body332
                                        #   Parent Loop BB0_4 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
	vpaddq	%ymm3, %ymm14, %ymm0
	vpaddq	%ymm4, %ymm14, %ymm1
	vpaddq	%ymm5, %ymm14, %ymm2
	vpaddq	%ymm3, %ymm17, %ymm8
	vpaddq	%ymm4, %ymm17, %ymm18
	vpaddq	%ymm5, %ymm17, %ymm19
	vpcmpltuq	%ymm10, %ymm17, %k1
	kxorw	%k0, %k1, %k1
	vpaddq	%ymm11, %ymm14, %ymm20
	vpmullq	%ymm14, %ymm7, %ymm21
	vpcmpneqq	%ymm10, %ymm17, %k1 {%k1}
	vmovdqa64	%ymm20, %ymm22
	vpaddq	%ymm21, %ymm12, %ymm22 {%k1}
	vpcmpltuq	%ymm10, %ymm8, %k1
	vpaddq	%ymm11, %ymm0, %ymm21
	vpmullq	%ymm0, %ymm7, %ymm0
	kxorw	%k0, %k1, %k1
	vpcmpneqq	%ymm10, %ymm8, %k1 {%k1}
	vmovdqa64	%ymm21, %ymm23
	vpaddq	%ymm0, %ymm12, %ymm23 {%k1}
	vpcmpltuq	%ymm10, %ymm18, %k1
	vpmullq	%ymm1, %ymm7, %ymm0
	kxorw	%k0, %k1, %k1
	vpaddq	%ymm1, %ymm11, %ymm1
	vpcmpneqq	%ymm10, %ymm18, %k1 {%k1}
	vmovdqa64	%ymm1, %ymm24
	vpaddq	%ymm0, %ymm12, %ymm24 {%k1}
	vpcmpltuq	%ymm10, %ymm19, %k1
	kxorw	%k0, %k1, %k1
	vpaddq	%ymm2, %ymm11, %ymm0
	vpmullq	%ymm2, %ymm7, %ymm2
	vpcmpneqq	%ymm10, %ymm19, %k1 {%k1}
	vmovdqa64	%ymm0, %ymm25
	vpaddq	%ymm2, %ymm12, %ymm25 {%k1}
	vpcmpeqq	%ymm10, %ymm17, %k1
	vmovdqa64	%ymm20, %ymm22 {%k1}
	vpcmpeqq	%ymm10, %ymm8, %k1
	vmovdqa64	%ymm21, %ymm23 {%k1}
	vpxor	%xmm2, %xmm2, %xmm2
	kxnorw	%k0, %k0, %k1
	vpxor	%xmm8, %xmm8, %xmm8
	vgatherqpd	(%rdi,%ymm22,8), %ymm2 {%k1}
	kxnorw	%k0, %k0, %k1
	vgatherqpd	(%rdi,%ymm23,8), %ymm8 {%k1}
	vpcmpeqq	%ymm10, %ymm18, %k1
	vmovdqa64	%ymm1, %ymm24 {%k1}
	vpcmpeqq	%ymm10, %ymm19, %k1
	vmovdqa64	%ymm0, %ymm25 {%k1}
	vpxor	%xmm0, %xmm0, %xmm0
	kxnorw	%k0, %k0, %k1
	vpxor	%xmm1, %xmm1, %xmm1
	vgatherqpd	(%rdi,%ymm24,8), %ymm0 {%k1}
	kxnorw	%k0, %k0, %k1
	vgatherqpd	(%rdi,%ymm25,8), %ymm1 {%k1}
	vcmpeqpd	-96(%rax,%rcx,8), %ymm2, %ymm2
	vcmpeqpd	-64(%rax,%rcx,8), %ymm8, %ymm8
	vpsubq	%ymm2, %ymm9, %ymm9
	vcmpeqpd	-32(%rax,%rcx,8), %ymm0, %ymm0
	vcmpeqpd	(%rax,%rcx,8), %ymm1, %ymm1
	vpsubq	%ymm8, %ymm13, %ymm13
	vpsubq	%ymm0, %ymm15, %ymm15
	vpsubq	%ymm1, %ymm16, %ymm16
	addq	$16, %rcx
	vpaddq	%ymm6, %ymm14, %ymm14
	vpaddq	%ymm6, %ymm17, %ymm17
	cmpq	%rcx, %r11
	jne	.LBB0_21
# %bb.22:                               # %middle.block330
                                        #   in Loop: Header=BB0_4 Depth=1
	vpaddq	%ymm9, %ymm13, %ymm0
	vpaddq	%ymm0, %ymm15, %ymm0
	vpaddq	%ymm0, %ymm16, %ymm0
	vextracti128	$1, %ymm0, %xmm1
	vpaddq	%xmm1, %xmm0, %xmm0
	vpshufd	$238, %xmm0, %xmm1              # xmm1 = xmm0[2,3,2,3]
	vpaddq	%xmm1, %xmm0, %xmm0
	vmovq	%xmm0, %rax
	vpcmpeqd	%xmm1, %xmm1, %xmm1
	movq	16(%rbx), %rsi                  # 8-byte Reload
	jmp	.LBB0_23
	.p2align	4, 0x90
.LBB0_33:                               # %L73.us.us
                                        #   in Loop: Header=BB0_23 Depth=2
	movq	%r10, %r11
	incq	%r10
.LBB0_23:                               # %L106.us.us
                                        #   Parent Loop BB0_4 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
	cmpq	%r10, %r15
	jne	.LBB0_24
# %bb.27:                               # %L108.us.us
                                        #   in Loop: Header=BB0_23 Depth=2
	leaq	(%r11,%r12), %rcx
	leaq	(%rdi,%rcx,8), %rcx
	jmp	.LBB0_28
	.p2align	4, 0x90
.LBB0_24:                               # %L124.us.us
                                        #   in Loop: Header=BB0_23 Depth=2
	cmpl	$1426063360, 32(%rbx)           # 4-byte Folded Reload
                                        # imm = 0x55000000
	sete	%cl
	cmpq	%r15, %r10
	setb	%dl
	xorb	%cl, %dl
	je	.LBB0_25
# %bb.26:                               # %L134.us.us
                                        #   in Loop: Header=BB0_23 Depth=2
	movq	%r14, %rcx
	imulq	%r11, %rcx
	addq	%r13, %rcx
	leaq	(%rdi,%rcx,8), %rcx
	jmp	.LBB0_28
	.p2align	4, 0x90
.LBB0_25:                               # %L131.us.us
                                        #   in Loop: Header=BB0_23 Depth=2
	leaq	(%r11,%r12), %rcx
	leaq	(%rdi,%rcx,8), %rcx
.LBB0_28:                               # %L140.us.us
                                        #   in Loop: Header=BB0_23 Depth=2
	cmpq	%r10, %r9
	je	.LBB0_37
# %bb.29:                               # %idxend.us.us
                                        #   in Loop: Header=BB0_23 Depth=2
	vmovsd	(%rcx), %xmm0                   # xmm0 = mem[0],zero
	addq	%r12, %r11
	vcmpeqsd	(%r8,%r11,8), %xmm0, %k0
	kmovw	%k0, %ecx
	addq	%rcx, %rax
	cmpq	%r10, %r14
	jne	.LBB0_33
# %bb.30:                               #   in Loop: Header=BB0_4 Depth=1
	movl	32(%rbx), %r10d                 # 4-byte Reload
	jmp	.LBB0_31
.LBB0_9:                                # %vector.ph
                                        #   in Loop: Header=BB0_4 Depth=1
	incq	%rdx
	movl	%edx, %esi
	andl	$15, %esi
	testq	%rsi, %rsi
	movl	$16, %ecx
	cmoveq	%rcx, %rsi
	movq	%rdx, %rcx
	subq	%rsi, %rcx
	negq	%rsi
	addq	%rdx, %rsi
	incq	%rsi
	vmovq	%rax, %xmm9
	vpermq	$85, %ymm28, %ymm10             # ymm10 = ymm28[1,1,1,1]
	vpbroadcastq	%r12, %ymm11
	vpbroadcastq	%r13, %ymm12
	movq	24(%rbx), %rax                  # 8-byte Reload
	leaq	(%rax,%r12,8), %rax
	vpxor	%xmm13, %xmm13, %xmm13
	xorl	%edx, %edx
	vmovdqa64	%ymm27, %ymm14
	vpxor	%xmm15, %xmm15, %xmm15
	vpxord	%xmm16, %xmm16, %xmm16
	vmovdqa64	%ymm26, %ymm17
	.p2align	4, 0x90
.LBB0_10:                               # %vector.body
                                        #   Parent Loop BB0_4 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
	vpaddq	%ymm3, %ymm14, %ymm18
	vpaddq	%ymm4, %ymm14, %ymm19
	vpaddq	%ymm5, %ymm14, %ymm20
	vpaddq	%ymm3, %ymm17, %ymm21
	vpaddq	%ymm4, %ymm17, %ymm22
	vpaddq	%ymm5, %ymm17, %ymm23
	vpcmpltuq	%ymm10, %ymm17, %k3
	vpcmpltuq	%ymm10, %ymm21, %k4
	vpcmpltuq	%ymm10, %ymm22, %k2
	vpcmpltuq	%ymm10, %ymm23, %k1
	vpaddq	%ymm11, %ymm14, %ymm21
	vpaddq	%ymm11, %ymm18, %ymm22
	vpmullq	%ymm14, %ymm7, %ymm23
	vpaddq	%ymm11, %ymm19, %ymm24
	vpmullq	%ymm18, %ymm7, %ymm18
	vpmullq	%ymm19, %ymm7, %ymm19
	vpaddq	%ymm11, %ymm20, %ymm25
	vpmullq	%ymm20, %ymm7, %ymm20
	vpaddq	%ymm23, %ymm12, %ymm21 {%k3}
	vpaddq	%ymm18, %ymm12, %ymm22 {%k4}
	vpaddq	%ymm19, %ymm12, %ymm24 {%k2}
	vxorpd	%xmm8, %xmm8, %xmm8
	kxnorw	%k0, %k0, %k2
	vpxor	%xmm1, %xmm1, %xmm1
	kxnorw	%k0, %k0, %k3
	vpaddq	%ymm20, %ymm12, %ymm25 {%k1}
	vxorpd	%xmm2, %xmm2, %xmm2
	vgatherqpd	(%rdi,%ymm21,8), %ymm8 {%k2}
	kxnorw	%k0, %k0, %k1
	vgatherqpd	(%rdi,%ymm22,8), %ymm1 {%k3}
	vpxor	%xmm0, %xmm0, %xmm0
	vgatherqpd	(%rdi,%ymm24,8), %ymm2 {%k1}
	kxnorw	%k0, %k0, %k1
	vgatherqpd	(%rdi,%ymm25,8), %ymm0 {%k1}
	vcmpeqpd	-96(%rax,%rdx,8), %ymm8, %ymm8
	vcmpeqpd	-64(%rax,%rdx,8), %ymm1, %ymm1
	vpsubq	%ymm8, %ymm9, %ymm9
	vcmpeqpd	-32(%rax,%rdx,8), %ymm2, %ymm2
	vcmpeqpd	(%rax,%rdx,8), %ymm0, %ymm0
	vpsubq	%ymm1, %ymm13, %ymm13
	vpsubq	%ymm2, %ymm15, %ymm15
	vpsubq	%ymm0, %ymm16, %ymm16
	addq	$16, %rdx
	vpaddq	%ymm6, %ymm14, %ymm14
	vpaddq	%ymm6, %ymm17, %ymm17
	cmpq	%rdx, %rcx
	jne	.LBB0_10
# %bb.11:                               # %middle.block
                                        #   in Loop: Header=BB0_4 Depth=1
	vpaddq	%ymm9, %ymm13, %ymm0
	vpaddq	%ymm0, %ymm15, %ymm0
	vpaddq	%ymm0, %ymm16, %ymm0
	vextracti128	$1, %ymm0, %xmm1
	vpaddq	%xmm1, %xmm0, %xmm0
	vpshufd	$238, %xmm0, %xmm1              # xmm1 = xmm0[2,3,2,3]
	vpaddq	%xmm1, %xmm0, %xmm0
	vmovq	%xmm0, %rax
	vpcmpeqd	%xmm1, %xmm1, %xmm1
	jmp	.LBB0_12
	.p2align	4, 0x90
.LBB0_34:                               # %L73.us103
                                        #   in Loop: Header=BB0_12 Depth=2
	movq	%rsi, %rcx
	incq	%rsi
.LBB0_12:                               # %L106.us85
                                        #   Parent Loop BB0_4 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
	cmpq	%rsi, %r15
	je	.LBB0_35
# %bb.13:                               # %L124.us91
                                        #   in Loop: Header=BB0_12 Depth=2
	cmpq	%rsi, %r9
	je	.LBB0_37
# %bb.14:                               # %idxend.us100
                                        #   in Loop: Header=BB0_12 Depth=2
	movq	%r14, %rdx
	imulq	%rcx, %rdx
	addq	%r12, %rcx
	addq	%r13, %rdx
	cmpq	%r15, %rsi
	cmovaeq	%rcx, %rdx
	vmovsd	(%rdi,%rdx,8), %xmm0            # xmm0 = mem[0],zero
	vcmpeqsd	(%r8,%rcx,8), %xmm0, %k0
	kmovw	%k0, %ecx
	addq	%rcx, %rax
	cmpq	%rsi, %r14
	jne	.LBB0_34
# %bb.15:                               #   in Loop: Header=BB0_4 Depth=1
	movq	16(%rbx), %rsi                  # 8-byte Reload
	jmp	.LBB0_31
.LBB0_17:
	movl	$1, %r9d
.LBB0_37:                               # %oob
	movq	%rsp, %rax
	movl	$16, %ecx
	subq	%rcx, %rax
	movq	56(%rbx), %rdi                  # 8-byte Reload
	cmpq	%rsp, %rax
	jge	.LBB0_40
.LBB0_39:                               # %oob
                                        # =>This Inner Loop Header: Depth=1
	xorq	$0, (%rsp)
	subq	$4096, %rsp                     # imm = 0x1000
	cmpq	%rsp, %rax
	jl	.LBB0_39
.LBB0_40:                               # %oob
	movq	%rax, %rsp
	movq	%r9, (%rax)
	movq	%r15, 8(%rax)
	movabsq	$ijl_bounds_error_ints, %rcx
	movl	$2, %edx
	movq	%rax, %rsi
	vzeroupper
	callq	*%rcx
.LBB0_36:                               # %L106.lr.ph.split.L106.lr.ph.split.split_crit_edge
	movl	$1, %r9d
	cmpq	$1, %r15
	jne	.LBB0_37
.LBB0_35:                               # %L121
	movabsq	$j_throw_uplo_1303, %rax
	vzeroupper
	callq	*%rax
.LBB0_41:                               # %L31
	movq	16(%r12), %rdi
	movabsq	$ijl_gc_pool_alloc, %r15
	movl	$1440, %esi                     # imm = 0x5A0
	movl	$32, %edx
	vmovdqa	%xmm0, 32(%rbx)                 # 16-byte Spill
	vzeroupper
	callq	*%r15
	movq	%r15, %rcx
	movq	%rax, %r15
	movabsq	$140054640149440, %rax          # imm = 0x7F61031343C0
	movq	%rax, -8(%r15)
	vmovaps	32(%rbx), %xmm0                 # 16-byte Reload
	vmovups	%xmm0, (%r15)
	movq	%r15, 120(%rbx)
	movq	16(%r12), %rdi
	movl	$1440, %esi                     # imm = 0x5A0
	movl	$32, %edx
	callq	*%rcx
	movabsq	$140054640149440, %rcx          # imm = 0x7F61031343C0
	movq	%rcx, -8(%rax)
	movq	%r14, (%rax)
	movq	%r13, 8(%rax)
	movq	%rax, 112(%rbx)
	movabsq	$140054646876480, %rcx          # imm = 0x7F610379E940
	movq	%rcx, 144(%rbx)
	movq	%r15, 152(%rbx)
	movq	%rax, 160(%rbx)
	movabsq	$ijl_invoke, %rax
	movabsq	$140054721237520, %rdi          # imm = 0x7F6107E89210
	leaq	144(%rbx), %rsi
	movabsq	$140054833311232, %rcx          # imm = 0x7F610E96AE00
	movl	$3, %edx
	callq	*%rax
	ud2
.Lfunc_end0:
	.size	julia_g_1301, .Lfunc_end0-julia_g_1301
	.cfi_endproc
                                        # -- End function
	.section	".note.GNU-stack","",@progbits