Fast logsumexp

LLVM. GCC will vectorize log/exp/sin/etc. with the appropriate optimization flags, but LLVM needs `-fveclib=SVML` (or some other vector library) in addition to those flags.


using SIMDPirates, SLEEFPirates, LoopVectorization
"""
    logsumexp_simdpirates!(w::Vector{T}, we) where T

Normalize the log-weights `w` in place and fill `we` with the matching normalized
weights, using explicit 4-wide SIMD loads/stores from SIMDPirates.

With `offset = maximum(w)` and `s = sum(exp.(w .- offset))`, on return `w` has had
`log(s) + offset` subtracted from every element and `we[i]` holds
`exp(w[i] - offset) / s` (computed from the original `w`). Returns `we`.

Only the leading multiple-of-4 part of `w` is processed with vector loads; any
remaining 1-3 elements are handled by a scalar loop so that no load or store
touches memory past the end of `w` or `we`.

Throws `DimensionMismatch` if `we` cannot be indexed over `1:length(w)`.
"""
function logsumexp_simdpirates!(w::Vector{T},we) where T
    N = length(w)
    # The loops below run under @inbounds, so validate the output buffer up front.
    checkbounds(Bool, we, 1:N) || throw(DimensionMismatch(
        "`we` must be indexable over 1:$N to hold one weight per element of `w`"))
    # Shifting by the maximum keeps every exponent argument <= 0, so exp cannot overflow.
    offset = maximum(w)
    nvec   = N - N % 4   # last index covered by full 4-wide vectors
    sl     = SIMDPirates.Vec{4,T}((0.,0.,0.,0.))
    @inbounds @simd for i = 1:4:nvec
        wl = SIMDPirates.vload(SIMDPirates.Vec{4,T}, w, i)
        @pirate wel = SLEEFPirates.exp(wl-offset)
        @pirate sl += wel
        SIMDPirates.vstore!(we,wel,i)
    end
    s = vsum(sl)
    # Scalar tail: the 0-3 elements that do not fill a whole vector.
    @inbounds for i = nvec+1:N
        wel = SLEEFPirates.exp(w[i] - offset)
        we[i] = wel
        s += wel
    end
    w  .-= log(s) + offset
    we .*= 1/s
    return we
end

# Normalize the log-weights `w` in place and fill `we` with the matching normalized
# weights, using a plain indexed loop wrapped in the `@vectorize` macro.
#
# With `offset = maximum(w)` and `s` the sum of `exp(w[i] - offset)`, on return `w` has
# had `log(s) + offset` subtracted from every element and `we[i]` holds
# `exp(w[i] - offset)` scaled by `1/s` (computed from the original `w`).
# The value of the final broadcast assignment (`we`) is what the function returns.
#
# NOTE(review): nothing here checks that `we` has at least `length(w)` elements —
# confirm how `@vectorize` handles out-of-range stores before relying on this.
function logsumexp_loopvectorization!(w::Vector{T},we) where T
    # Shifting by the maximum keeps every exponent argument <= 0, so exp cannot overflow.
    offset = maximum(w)
    N      = length(w)
    # Running sum of the shifted exponentials.
    s = zero(T)
    @vectorize for i = 1:N
        wl = w[i]
        wel = exp(wl-offset)
        we[i] = wel
        s += wel
    end
    # Subtract the log-sum-exp (log(s) + offset) from the log-weights.
    w  .-= log(s) + offset
    # Rescale the stored exponentials by their sum.
    we .*= 1/s
end

"""
    logsumexp_sleefpirates!(w::Vector{T}, we) where T

Normalize the log-weights `w` in place and fill `we` with the matching normalized
weights, using a scalar `@simd` loop that calls `SLEEFPirates.exp`.

With `offset = maximum(w)` and `s = sum(exp.(w .- offset))`, on return `w` has had
`log(s) + offset` subtracted from every element and `we[i]` holds
`exp(w[i] - offset) / s` (computed from the original `w`). Returns `we`.

Throws `DimensionMismatch` if `we` cannot be indexed over `1:length(w)`.
"""
function logsumexp_sleefpirates!(w::Vector{T},we) where T
    N = length(w)
    # The loop below writes we[i] under @inbounds, so validate the output buffer up front.
    checkbounds(Bool, we, 1:N) || throw(DimensionMismatch(
        "`we` must be indexable over 1:$N to hold one weight per element of `w`"))
    # Shifting by the maximum keeps every exponent argument <= 0, so exp cannot overflow.
    offset = maximum(w)
    s = zero(T)
    @inbounds @simd for i = 1:N
        wel = SLEEFPirates.exp(w[i] - offset)
        we[i] = wel
        s += wel
    end
    w  .-= log(s) + offset
    we .*= 1/s
    return we
end

A few results:

julia> @btime logsumexp!($w,$we);
  7.844 μs (0 allocations: 0 bytes)

julia> @btime logsumexp_yeppp!($w,$we);
  9.393 μs (0 allocations: 0 bytes)

julia> @btime logsumexp_simdpirates!($w,$we);
  2.576 μs (0 allocations: 0 bytes)

julia> @btime logsumexp_loopvectorization!($w,$we);
  1.825 μs (0 allocations: 0 bytes)

julia> @btime logsumexp_sleefpirates!($w,$we);
  1.705 μs (0 allocations: 0 bytes)

julia> @code_native logsumexp_sleefpirates!(w,we)
	.text
; ┌ @ REPL[43]:2 within `logsumexp_sleefpirates!'
	pushq	%r15
	pushq	%r14
	pushq	%rbx
	subq	$224, %rsp
	movq	%rsi, 56(%rsp)
	movq	(%rsi), %r15
	movq	8(%rsi), %r14
; │┌ @ reducedim.jl:652 within `maximum'
; ││┌ @ reducedim.jl:652 within `#maximum#562'
; │││┌ @ reducedim.jl:656 within `_maximum' @ reducedim.jl:657
; ││││┌ @ reducedim.jl:307 within `mapreduce'
; │││││┌ @ reducedim.jl:307 within `#mapreduce#555'
; ││││││┌ @ reducedim.jl:312 within `_mapreduce_dim'
	movabsq	$"size;", %rax
	movq	%r15, %rdi
	callq	*%rax
; │└└└└└└
; │ @ REPL[43]:3 within `logsumexp_sleefpirates!'
; │┌ @ array.jl:200 within `length'
	movq	8(%r15), %rax
; │└
; │ @ REPL[43]:5 within `logsumexp_sleefpirates!'
; │┌ @ simdloop.jl:69 within `macro expansion'
; ││┌ @ range.jl:5 within `Colon'
; │││┌ @ range.jl:275 within `Type'
; ││││┌ @ range.jl:280 within `unitrange_last'
	movq	%rax, %rcx
	sarq	$63, %rcx
	andnq	%rax, %rcx, %r8
; │└└└└
; │┌ @ checked.jl:194 within `macro expansion'
	leaq	-1(%r8), %rsi
; │└
; │┌ @ simdloop.jl:71 within `macro expansion'
; ││┌ @ simdloop.jl:51 within `simd_inner_length'
; │││┌ @ range.jl:541 within `length'
; ││││┌ @ checked.jl:165 within `checked_add'
; │││││┌ @ checked.jl:132 within `add_with_overflow'
	movq	%rsi, %r9
	incq	%r9
; │││││└
; │││││ @ checked.jl:166 within `checked_add'
	jo	L2278
; │└└└└
; │┌ @ int.jl:49 within `macro expansion'
	testq	%r9, %r9
	vmovapd	%xmm0, 32(%rsp)
; └└
; ┌ @ simdloop.jl:72 within `logsumexp_sleefpirates!'
	jle	L107
	movq	(%r15), %rcx
	movq	(%r14), %rdx
	vxorpd	%xmm19, %xmm19, %xmm19
; │ @ simdloop.jl:75 within `logsumexp_sleefpirates!'
	cmpq	$32, %r8
	jae	L118
	xorl	%esi, %esi
	jmp	L1345
L107:
	vxorpd	%xmm19, %xmm19, %xmm19
	jmp	L1757
; │ @ simdloop.jl:75 within `logsumexp_sleefpirates!'
L118:
	leaq	(%rcx,%r8,8), %rsi
	cmpq	%rsi, %rdx
	jae	L143
	leaq	(%rdx,%r8,8), %rsi
; │ @ simdloop.jl:75 within `logsumexp_sleefpirates!'
	cmpq	%rsi, %rcx
	jae	L143
	xorl	%esi, %esi
	jmp	L1345
; │ @ simdloop.jl:75 within `logsumexp_sleefpirates!'
L143:
	movabsq	$9223372036854775776, %rsi # imm = 0x7FFFFFFFFFFFFFE0
	andq	%r8, %rsi
	vbroadcastsd	%xmm0, %zmm0
	vmovupd	%zmm0, 64(%rsp)
	vxorpd	%xmm0, %xmm0, %xmm0
	xorl	%edi, %edi
	movabsq	$139756740332128, %rbx  # imm = 0x7F1BA6DCCA60
	vbroadcastsd	(%rbx), %zmm1
	vmovups	%zmm1, 128(%rsp)
	movabsq	$139756740332136, %rbx  # imm = 0x7F1BA6DCCA68
	vbroadcastsd	(%rbx), %zmm3
	movabsq	$139756740332144, %rbx  # imm = 0x7F1BA6DCCA70
	vbroadcastsd	(%rbx), %zmm4
	movabsq	$139756740332152, %rbx  # imm = 0x7F1BA6DCCA78
	vbroadcastsd	(%rbx), %zmm1
	movabsq	$139756740332160, %rbx  # imm = 0x7F1BA6DCCA80
	vbroadcastsd	(%rbx), %zmm6
	movabsq	$139756740332168, %rbx  # imm = 0x7F1BA6DCCA88
	vbroadcastsd	(%rbx), %zmm7
	movabsq	$139756740332176, %rbx  # imm = 0x7F1BA6DCCA90
	vbroadcastsd	(%rbx), %zmm8
	movabsq	$139756740332184, %rbx  # imm = 0x7F1BA6DCCA98
	vbroadcastsd	(%rbx), %zmm9
	movabsq	$139756740332192, %rbx  # imm = 0x7F1BA6DCCAA0
	vbroadcastsd	(%rbx), %zmm10
	movabsq	$139756740332200, %rbx  # imm = 0x7F1BA6DCCAA8
	vbroadcastsd	(%rbx), %zmm11
	movabsq	$139756740332208, %rbx  # imm = 0x7F1BA6DCCAB0
	vbroadcastsd	(%rbx), %zmm12
	movabsq	$139756740332216, %rbx  # imm = 0x7F1BA6DCCAB8
	vbroadcastsd	(%rbx), %zmm13
	movabsq	$139756740332224, %rbx  # imm = 0x7F1BA6DCCAC0
	vbroadcastsd	(%rbx), %zmm14
	movabsq	$139756740332232, %rbx  # imm = 0x7F1BA6DCCAC8
	vbroadcastsd	(%rbx), %zmm15
	movabsq	$139756740332240, %rbx  # imm = 0x7F1BA6DCCAD0
	vbroadcastsd	(%rbx), %zmm16
	vpbroadcastq	(%rbx), %zmm17
	vxorpd	%xmm18, %xmm18, %xmm18
	vxorpd	%xmm19, %xmm19, %xmm19
	vxorpd	%xmm20, %xmm20, %xmm20
; └
; ┌ @ REPL[43]:5 within `logsumexp_sleefpirates!'
; │┌ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:6
; ││┌ @ array.jl:728 within `getindex'
L448:
	vmovupd	(%rcx,%rdi,8), %zmm5
	vmovupd	64(%rcx,%rdi,8), %zmm21
	vmovupd	128(%rcx,%rdi,8), %zmm22
	vmovupd	192(%rcx,%rdi,8), %zmm23
	vmovupd	64(%rsp), %zmm2
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ float.jl:397
	vsubpd	%zmm2, %zmm5, %zmm5
	vsubpd	%zmm2, %zmm21, %zmm29
	vsubpd	%zmm2, %zmm22, %zmm30
	vsubpd	%zmm2, %zmm23, %zmm31
	vmovupd	128(%rsp), %zmm2
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:7
; ││┌ @ exp.jl:181 within `exp'
; │││┌ @ float.jl:399 within `*'
	vmulpd	%zmm2, %zmm5, %zmm21
	vmulpd	%zmm2, %zmm29, %zmm22
	vmulpd	%zmm2, %zmm30, %zmm23
	vmulpd	%zmm2, %zmm31, %zmm24
; ││└└
; ││┌ @ float.jl:370 within `exp'
	vrndscalepd	$4, %zmm21, %zmm25
	vrndscalepd	$4, %zmm22, %zmm26
	vrndscalepd	$4, %zmm23, %zmm27
	vrndscalepd	$4, %zmm24, %zmm28
; ││└
; ││┌ @ exp.jl:183 within `exp'
; │││┌ @ float.jl:304 within `unsafe_trunc'
	vcvttpd2qq	%zmm25, %zmm21
	vcvttpd2qq	%zmm26, %zmm22
	vcvttpd2qq	%zmm27, %zmm23
	vcvttpd2qq	%zmm28, %zmm24
; ││└└
; ││┌ @ float.jl:404 within `exp'
	vfnmadd231pd	%zmm3, %zmm25, %zmm5
	vfnmadd231pd	%zmm3, %zmm26, %zmm29
	vfnmadd231pd	%zmm3, %zmm27, %zmm30
	vfnmadd231pd	%zmm3, %zmm28, %zmm31
; ││└
; ││┌ @ exp.jl:186 within `exp'
; │││┌ @ float.jl:404 within `muladd'
	vfnmadd213pd	%zmm5, %zmm4, %zmm25
	vfnmadd213pd	%zmm29, %zmm4, %zmm26
	vfnmadd213pd	%zmm30, %zmm4, %zmm27
	vfnmadd213pd	%zmm31, %zmm4, %zmm28
; │││└
; │││ @ exp.jl:188 within `exp'
; │││┌ @ exp.jl:161 within `exp_kernel'
; ││││┌ @ math.jl:101 within `macro expansion'
; │││││┌ @ float.jl:404 within `muladd'
	vmovapd	%zmm1, %zmm29
	vfmadd213pd	%zmm6, %zmm25, %zmm29
	vmovapd	%zmm1, %zmm30
	vfmadd213pd	%zmm6, %zmm26, %zmm30
	vmovapd	%zmm1, %zmm31
	vfmadd213pd	%zmm6, %zmm27, %zmm31
	vmovapd	%zmm1, %zmm5
	vfmadd213pd	%zmm6, %zmm28, %zmm5
	vfmadd213pd	%zmm7, %zmm25, %zmm29
	vfmadd213pd	%zmm7, %zmm26, %zmm30
	vfmadd213pd	%zmm7, %zmm27, %zmm31
	vfmadd213pd	%zmm7, %zmm28, %zmm5
	vfmadd213pd	%zmm8, %zmm25, %zmm29
	vfmadd213pd	%zmm8, %zmm26, %zmm30
	vfmadd213pd	%zmm8, %zmm27, %zmm31
	vfmadd213pd	%zmm8, %zmm28, %zmm5
	vfmadd213pd	%zmm9, %zmm25, %zmm29
	vfmadd213pd	%zmm9, %zmm26, %zmm30
	vfmadd213pd	%zmm9, %zmm27, %zmm31
	vfmadd213pd	%zmm9, %zmm28, %zmm5
	vfmadd213pd	%zmm10, %zmm25, %zmm29
	vfmadd213pd	%zmm10, %zmm26, %zmm30
	vfmadd213pd	%zmm10, %zmm27, %zmm31
	vfmadd213pd	%zmm10, %zmm28, %zmm5
	vfmadd213pd	%zmm11, %zmm25, %zmm29
	vfmadd213pd	%zmm11, %zmm26, %zmm30
	vfmadd213pd	%zmm11, %zmm27, %zmm31
	vfmadd213pd	%zmm11, %zmm28, %zmm5
	vfmadd213pd	%zmm12, %zmm25, %zmm29
	vfmadd213pd	%zmm12, %zmm26, %zmm30
	vfmadd213pd	%zmm12, %zmm27, %zmm31
	vfmadd213pd	%zmm12, %zmm28, %zmm5
	vfmadd213pd	%zmm13, %zmm25, %zmm29
	vfmadd213pd	%zmm13, %zmm26, %zmm30
	vfmadd213pd	%zmm13, %zmm27, %zmm31
	vfmadd213pd	%zmm13, %zmm28, %zmm5
	vfmadd213pd	%zmm14, %zmm25, %zmm29
	vfmadd213pd	%zmm14, %zmm26, %zmm30
	vfmadd213pd	%zmm14, %zmm27, %zmm31
	vfmadd213pd	%zmm14, %zmm28, %zmm5
	vfmadd213pd	%zmm15, %zmm25, %zmm29
	vfmadd213pd	%zmm15, %zmm26, %zmm30
	vfmadd213pd	%zmm15, %zmm27, %zmm31
	vfmadd213pd	%zmm15, %zmm28, %zmm5
; ││└└└└
; ││┌ @ float.jl:399 within `exp'
	vmulpd	%zmm25, %zmm25, %zmm2
	vmulpd	%zmm29, %zmm2, %zmm2
	vmulpd	%zmm26, %zmm26, %zmm29
	vmulpd	%zmm30, %zmm29, %zmm29
	vmulpd	%zmm27, %zmm27, %zmm30
	vmulpd	%zmm31, %zmm30, %zmm30
	vmulpd	%zmm28, %zmm28, %zmm31
	vmulpd	%zmm5, %zmm31, %zmm5
; ││└
; ││┌ @ exp.jl:189 within `exp'
; │││┌ @ operators.jl:529 within `+' @ float.jl:395
	vaddpd	%zmm2, %zmm25, %zmm2
	vaddpd	%zmm29, %zmm26, %zmm25
	vaddpd	%zmm30, %zmm27, %zmm26
	vaddpd	%zmm5, %zmm28, %zmm5
; │││└
; │││┌ @ float.jl:395 within `+'
	vaddpd	%zmm16, %zmm2, %zmm2
	vaddpd	%zmm16, %zmm25, %zmm25
	vaddpd	%zmm16, %zmm26, %zmm26
	vaddpd	%zmm16, %zmm5, %zmm5
; │││└
; │││ @ exp.jl:190 within `exp'
; │││┌ @ priv.jl:51 within `ldexp2k'
; ││││┌ @ int.jl:444 within `>>' @ int.jl:437
	vpsraq	$1, %zmm21, %zmm27
	vpsraq	$1, %zmm22, %zmm28
	vpsraq	$1, %zmm23, %zmm29
	vpsraq	$1, %zmm24, %zmm30
; ││││└
; ││││ @ priv.jl:52 within `ldexp2k'
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
	vpsllq	$52, %zmm27, %zmm31
	vpaddq	%zmm17, %zmm31, %zmm31
; │││└└└└
; │││┌ @ float.jl:399 within `ldexp2k'
	vmulpd	%zmm31, %zmm2, %zmm2
; │││└
; │││┌ @ priv.jl:52 within `ldexp2k'
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
	vpsllq	$52, %zmm28, %zmm31
	vpaddq	%zmm17, %zmm31, %zmm31
; ││││└└└
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
	vmulpd	%zmm31, %zmm25, %zmm25
; ││││└└
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
	vpsllq	$52, %zmm29, %zmm31
	vpaddq	%zmm17, %zmm31, %zmm31
; ││││└└└
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
	vmulpd	%zmm31, %zmm26, %zmm26
; ││││└└
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
	vpsllq	$52, %zmm30, %zmm31
	vpaddq	%zmm17, %zmm31, %zmm31
; ││││└└└
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
	vmulpd	%zmm31, %zmm5, %zmm5
; ││││└└
; ││││┌ @ int.jl:52 within `-'
	vpsubq	%zmm27, %zmm21, %zmm21
	vpsubq	%zmm28, %zmm22, %zmm22
	vpsubq	%zmm29, %zmm23, %zmm23
	vpsubq	%zmm30, %zmm24, %zmm24
; │││└└
; │││┌ @ int.jl:439 within `ldexp2k'
	vpsllq	$52, %zmm21, %zmm21
	vpaddq	%zmm17, %zmm21, %zmm21
; │││└
; │││┌ @ priv.jl:52 within `ldexp2k'
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
	vmulpd	%zmm21, %zmm2, %zmm2
; ││││└└
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
	vpsllq	$52, %zmm22, %zmm21
	vpaddq	%zmm17, %zmm21, %zmm21
; ││││└└└
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
	vmulpd	%zmm21, %zmm25, %zmm21
; ││││└└
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
	vpsllq	$52, %zmm23, %zmm22
	vpaddq	%zmm17, %zmm22, %zmm22
; ││││└└└
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
	vmulpd	%zmm22, %zmm26, %zmm22
; ││││└└
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
	vpsllq	$52, %zmm24, %zmm23
	vpaddq	%zmm17, %zmm23, %zmm23
; ││││└└└
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
	vmulpd	%zmm23, %zmm5, %zmm5
; ││└└└└
; ││ @ simdloop.jl:77 within `macro expansion' @ array.jl:766
	vmovupd	%zmm2, (%rdx,%rdi,8)
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:9
; ││┌ @ float.jl:395 within `+'
	vaddpd	%zmm2, %zmm0, %zmm0
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:8
; ││┌ @ array.jl:766 within `setindex!'
	vmovupd	%zmm21, 64(%rdx,%rdi,8)
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:9
; ││┌ @ float.jl:395 within `+'
	vaddpd	%zmm21, %zmm18, %zmm18
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:8
; ││┌ @ array.jl:766 within `setindex!'
	vmovupd	%zmm22, 128(%rdx,%rdi,8)
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:9
; ││┌ @ float.jl:395 within `+'
	vaddpd	%zmm22, %zmm19, %zmm19
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:8
; ││┌ @ array.jl:766 within `setindex!'
	vmovupd	%zmm5, 192(%rdx,%rdi,8)
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:9
; ││┌ @ float.jl:395 within `+'
	vaddpd	%zmm5, %zmm20, %zmm20
; │└└
; │┌ @ int.jl:53 within `macro expansion'
	addq	$32, %rdi
	cmpq	%rdi, %rsi
	jne	L448
; │└
; │┌ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:9
; ││┌ @ float.jl:395 within `+'
	vaddpd	%zmm0, %zmm18, %zmm0
	vaddpd	%zmm0, %zmm19, %zmm0
	vaddpd	%zmm0, %zmm20, %zmm0
	vextractf64x4	$1, %zmm0, %ymm1
	vaddpd	%zmm1, %zmm0, %zmm0
	vextractf128	$1, %ymm0, %xmm1
	vaddpd	%zmm1, %zmm0, %zmm0
	vpermilpd	$1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
	vaddpd	%zmm1, %zmm0, %zmm19
	cmpq	%rsi, %r8
	vmovapd	32(%rsp), %xmm0
; └└└
; ┌ @ simdloop.jl:75 within `logsumexp_sleefpirates!'
	je	L1757
L1345:
	movabsq	$4607182418800017408, %rdi # imm = 0x3FF0000000000000
	movabsq	$139756740332128, %rbx  # imm = 0x7F1BA6DCCA60
	vmovsd	(%rbx), %xmm8           # xmm8 = mem[0],zero
	movabsq	$139756740332248, %rbx  # imm = 0x7F1BA6DCCAD8
	vmovsd	(%rbx), %xmm9           # xmm9 = mem[0],zero
	movabsq	$139756740332256, %rbx  # imm = 0x7F1BA6DCCAE0
	vmovsd	(%rbx), %xmm10          # xmm10 = mem[0],zero
	movabsq	$139756740332152, %rbx  # imm = 0x7F1BA6DCCA78
	vmovsd	(%rbx), %xmm11          # xmm11 = mem[0],zero
	movabsq	$139756740332160, %rbx  # imm = 0x7F1BA6DCCA80
	vmovsd	(%rbx), %xmm12          # xmm12 = mem[0],zero
	movabsq	$139756740332168, %rbx  # imm = 0x7F1BA6DCCA88
	vmovsd	(%rbx), %xmm13          # xmm13 = mem[0],zero
	movabsq	$139756740332176, %rbx  # imm = 0x7F1BA6DCCA90
	vmovsd	(%rbx), %xmm14          # xmm14 = mem[0],zero
	movabsq	$139756740332184, %rbx  # imm = 0x7F1BA6DCCA98
	vmovsd	(%rbx), %xmm15          # xmm15 = mem[0],zero
	movabsq	$139756740332192, %rbx  # imm = 0x7F1BA6DCCAA0
	vmovsd	(%rbx), %xmm16          # xmm16 = mem[0],zero
	movabsq	$139756740332200, %rbx  # imm = 0x7F1BA6DCCAA8
	vmovsd	(%rbx), %xmm17          # xmm17 = mem[0],zero
	movabsq	$139756740332208, %rbx  # imm = 0x7F1BA6DCCAB0
	vmovsd	(%rbx), %xmm18          # xmm18 = mem[0],zero
	movabsq	$139756740332216, %rbx  # imm = 0x7F1BA6DCCAB8
	vmovsd	(%rbx), %xmm3           # xmm3 = mem[0],zero
	movabsq	$139756740332224, %rbx  # imm = 0x7F1BA6DCCAC0
	vmovsd	(%rbx), %xmm4           # xmm4 = mem[0],zero
	movabsq	$139756740332232, %rbx  # imm = 0x7F1BA6DCCAC8
	vmovsd	(%rbx), %xmm5           # xmm5 = mem[0],zero
	movabsq	$139756740332240, %rbx  # imm = 0x7F1BA6DCCAD0
	vmovsd	(%rbx), %xmm6           # xmm6 = mem[0],zero
	nopw	%cs:(%rax,%rax)
; └
; ┌ @ REPL[43]:5 within `logsumexp_sleefpirates!'
; │┌ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:6
; ││┌ @ array.jl:728 within `getindex'
L1584:
	vmovsd	(%rcx,%rsi,8), %xmm7    # xmm7 = mem[0],zero
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ float.jl:397
	vsubsd	%xmm0, %xmm7, %xmm7
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:7
; ││┌ @ exp.jl:181 within `exp'
; │││┌ @ float.jl:399 within `*'
	vmulsd	%xmm8, %xmm7, %xmm1
; │││└
; │││┌ @ floatfuncs.jl:129 within `round' @ float.jl:370
	vrndscalesd	$4, %xmm1, %xmm1, %xmm1
; ││└└
; ││┌ @ float.jl:304 within `exp'
	vcvttsd2si	%xmm1, %rax
; ││└
; ││┌ @ exp.jl:185 within `exp'
; │││┌ @ float.jl:404 within `muladd'
	vfmadd231sd	%xmm9, %xmm1, %xmm7
; │││└
; │││ @ exp.jl:186 within `exp'
; │││┌ @ float.jl:404 within `muladd'
	vfmadd231sd	%xmm10, %xmm1, %xmm7
; │││└
; │││ @ exp.jl:188 within `exp'
; │││┌ @ exp.jl:161 within `exp_kernel'
; ││││┌ @ math.jl:101 within `macro expansion'
; │││││┌ @ float.jl:404 within `muladd'
	vmovapd	%xmm11, %xmm2
	vfmadd213sd	%xmm12, %xmm7, %xmm2
	vfmadd213sd	%xmm13, %xmm7, %xmm2
	vfmadd213sd	%xmm14, %xmm7, %xmm2
	vfmadd213sd	%xmm15, %xmm7, %xmm2
	vfmadd213sd	%xmm16, %xmm7, %xmm2
	vfmadd213sd	%xmm17, %xmm7, %xmm2
	vfmadd213sd	%xmm18, %xmm7, %xmm2
	vfmadd213sd	%xmm3, %xmm7, %xmm2
	vfmadd213sd	%xmm4, %xmm7, %xmm2
	vfmadd213sd	%xmm5, %xmm7, %xmm2
; ││└└└└
; ││┌ @ float.jl:399 within `exp'
	vmulsd	%xmm7, %xmm7, %xmm1
	vmulsd	%xmm2, %xmm1, %xmm1
; ││└
; ││┌ @ exp.jl:189 within `exp'
; │││┌ @ operators.jl:529 within `+' @ float.jl:395
	vaddsd	%xmm1, %xmm7, %xmm1
; │││└
; │││┌ @ float.jl:395 within `+'
	vaddsd	%xmm6, %xmm1, %xmm2
; ││└└
; ││┌ @ int.jl:437 within `exp'
	movq	%rax, %rbx
	sarq	%rbx
; ││└
; ││┌ @ exp.jl:190 within `exp'
; │││┌ @ int.jl:52 within `ldexp2k'
	subl	%ebx, %eax
; │││└
; │││┌ @ priv.jl:52 within `ldexp2k'
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
	shlq	$52, %rbx
	addq	%rdi, %rbx
; │││││└└
; │││││┌ @ essentials.jl:417 within `integer2float'
	vmovq	%rbx, %xmm1
; ││││└└
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
	vmulsd	%xmm1, %xmm2, %xmm2
; ││││└└
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
	shlq	$52, %rax
	addq	%rdi, %rax
; ││││││└
; ││││││┌ @ essentials.jl:417 within `reinterpret'
	vmovq	%rax, %xmm1
; ││││└└└
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
	vmulsd	%xmm1, %xmm2, %xmm1
; ││└└└└
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:8
; ││┌ @ array.jl:766 within `setindex!'
	vmovsd	%xmm1, (%rdx,%rsi,8)
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ float.jl:395
	vaddsd	%xmm1, %xmm19, %xmm19
; ││ @ simdloop.jl:78 within `macro expansion'
; ││┌ @ int.jl:53 within `+'
	addq	$1, %rsi
; ││└
; ││ @ simdloop.jl:75 within `macro expansion'
; ││┌ @ int.jl:49 within `<'
	cmpq	%r9, %rsi
; ││└
	jb	L1584
; │└
; │ @ REPL[43]:11 within `logsumexp_sleefpirates!'
L1757:
	movabsq	$"<;", %rax
	vmovupd	%zmm19, 64(%rsp)
	vmovapd	%xmm19, %xmm0
	vzeroupper
	callq	*%rax
; │┌ @ broadcast.jl:801 within `materialize!'
; ││┌ @ abstractarray.jl:75 within `axes'
; │││┌ @ array.jl:155 within `size'
	movq	24(%r15), %rdx
; ││└└
; ││┌ @ promotion.jl:414 within `axes'
	testq	%rdx, %rdx
; ││└
; ││┌ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; │││┌ @ simdloop.jl:72 within `macro expansion'
	jle	L2039
; └└└└
; ┌ @ simdloop.jl within `logsumexp_sleefpirates!'
	movq	%rdx, %rax
	sarq	$63, %rax
	andnq	%rdx, %rax, %rax
; └
; ┌ @ REPL[43]:11 within `logsumexp_sleefpirates!'
; │┌ @ float.jl:395 within `+'
	vaddsd	32(%rsp), %xmm0, %xmm0
	movq	(%r15), %rcx
; │└
; │┌ @ broadcast.jl:801 within `materialize!'
; ││┌ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:886
; │││┌ @ broadcast.jl:869 within `preprocess'
; ││││┌ @ broadcast.jl:872 within `preprocess_args'
; │││││┌ @ broadcast.jl:870 within `preprocess'
; ││││││┌ @ broadcast.jl:592 within `extrude'
; │││││││┌ @ broadcast.jl:547 within `newindexer'
; ││││││││┌ @ broadcast.jl:548 within `shapeindexer'
; │││││││││┌ @ broadcast.jl:553 within `_newindexer'
; ││││││││││┌ @ operators.jl:193 within `!='
; │││││││││││┌ @ promotion.jl:403 within `=='
	cmpq	$1, %rdx
; │││└└└└└└└└└
; │││ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; │││┌ @ simdloop.jl:75 within `macro expansion'
	jne	L1867
	xorl	%edx, %edx
	nopw	%cs:(%rax,%rax)
; ││││ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; ││││┌ @ broadcast.jl:558 within `getindex'
; │││││┌ @ broadcast.jl:597 within `_broadcast_getindex'
; ││││││┌ @ broadcast.jl:621 within `_getindex'
; │││││││┌ @ broadcast.jl:591 within `_broadcast_getindex'
; ││││││││┌ @ array.jl:728 within `getindex'
L1840:
	vmovsd	(%rcx), %xmm1           # xmm1 = mem[0],zero
; ││││││└└└
; ││││││ @ broadcast.jl:598 within `_broadcast_getindex'
; ││││││┌ @ broadcast.jl:625 within `_broadcast_getindex_evalf'
; │││││││┌ @ float.jl:397 within `-'
	vsubsd	%xmm0, %xmm1, %xmm1
; ││││└└└└
; ││││┌ @ array.jl:766 within `setindex!'
	vmovsd	%xmm1, (%rcx,%rdx,8)
; │││└└
; │││┌ @ int.jl:53 within `macro expansion'
	addq	$1, %rdx
; │││└
; │││┌ @ simdloop.jl:75 within `macro expansion'
; ││││┌ @ int.jl:49 within `<'
	cmpq	%rax, %rdx
; ││││└
	jb	L1840
	jmp	L2039
L1867:
	cmpq	$32, %rax
	jae	L1880
	xorl	%edx, %edx
	jmp	L2016
; ││││ @ simdloop.jl:75 within `macro expansion'
L1880:
	movabsq	$9223372036854775776, %rdx # imm = 0x7FFFFFFFFFFFFFE0
	andq	%rax, %rdx
	vbroadcastsd	%xmm0, %zmm1
	leaq	192(%rcx), %rsi
; ││││ @ simdloop.jl:78 within `macro expansion'
; ││││┌ @ int.jl:53 within `+'
	movq	%rdx, %rdi
	nopw	%cs:(%rax,%rax)
; ││││└
; ││││ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; ││││┌ @ broadcast.jl:558 within `getindex'
; │││││┌ @ broadcast.jl:597 within `_broadcast_getindex'
; ││││││┌ @ broadcast.jl:621 within `_getindex'
; │││││││┌ @ broadcast.jl:591 within `_broadcast_getindex'
; ││││││││┌ @ array.jl:728 within `getindex'
L1920:
	vmovupd	-192(%rsi), %zmm2
	vmovupd	-128(%rsi), %zmm3
	vmovupd	-64(%rsi), %zmm4
	vmovupd	(%rsi), %zmm5
; ││││││└└└
; ││││││ @ broadcast.jl:598 within `_broadcast_getindex'
; ││││││┌ @ broadcast.jl:625 within `_broadcast_getindex_evalf'
; │││││││┌ @ float.jl:397 within `-'
	vsubpd	%zmm1, %zmm2, %zmm2
	vsubpd	%zmm1, %zmm3, %zmm3
	vsubpd	%zmm1, %zmm4, %zmm4
	vsubpd	%zmm1, %zmm5, %zmm5
; ││││└└└└
; ││││ @ simdloop.jl:77 within `macro expansion' @ array.jl:766
	vmovupd	%zmm2, -192(%rsi)
	vmovupd	%zmm3, -128(%rsi)
	vmovupd	%zmm4, -64(%rsi)
	vmovupd	%zmm5, (%rsi)
; ││││ @ simdloop.jl:78 within `macro expansion'
; ││││┌ @ int.jl:53 within `+'
	addq	$256, %rsi              # imm = 0x100
	addq	$-32, %rdi
	jne	L1920
; └└└└└
; ┌ @ int.jl within `logsumexp_sleefpirates!'
	cmpq	%rdx, %rax
; └
; ┌ @ simdloop.jl:75 within `logsumexp_sleefpirates!'
	je	L2039
; └
; ┌ @ REPL[43]:11 within `logsumexp_sleefpirates!'
; │┌ @ broadcast.jl:801 within `materialize!'
; ││┌ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; │││┌ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; ││││┌ @ broadcast.jl:558 within `getindex'
; │││││┌ @ broadcast.jl:597 within `_broadcast_getindex'
; ││││││┌ @ broadcast.jl:621 within `_getindex'
; │││││││┌ @ broadcast.jl:591 within `_broadcast_getindex'
; ││││││││┌ @ array.jl:728 within `getindex'
L2016:
	vmovsd	(%rcx,%rdx,8), %xmm1    # xmm1 = mem[0],zero
; │││││└└└└
; │││││┌ @ float.jl:397 within `_broadcast_getindex'
	vsubsd	%xmm0, %xmm1, %xmm1
; ││││└└
; ││││┌ @ array.jl:766 within `setindex!'
	vmovsd	%xmm1, (%rcx,%rdx,8)
; ││││└
; ││││ @ simdloop.jl:78 within `macro expansion'
; ││││┌ @ int.jl:53 within `+'
	addq	$1, %rdx
; ││││└
; ││││ @ simdloop.jl:75 within `macro expansion'
; ││││┌ @ int.jl:49 within `<'
	cmpq	%rax, %rdx
	jb	L2016
; │└└└└
; │ @ REPL[43]:12 within `logsumexp_sleefpirates!'
; │┌ @ broadcast.jl:801 within `materialize!'
; ││┌ @ abstractarray.jl:75 within `axes'
; │││┌ @ array.jl:155 within `size'
L2039:
	movq	24(%r14), %rdx
; ││└└
; ││┌ @ promotion.jl:414 within `axes'
	testq	%rdx, %rdx
; ││└
; ││┌ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; │││┌ @ simdloop.jl:72 within `macro expansion'
	jle	L2259
; └└└└
; ┌ @ simdloop.jl within `logsumexp_sleefpirates!'
	movq	%rdx, %rax
	sarq	$63, %rax
	andnq	%rdx, %rax, %rax
	movabsq	$139756740332240, %rcx  # imm = 0x7F1BA6DCCAD0
; └
; ┌ @ REPL[43]:12 within `logsumexp_sleefpirates!'
; │┌ @ promotion.jl:316 within `/' @ float.jl:401
	vmovsd	(%rcx), %xmm0           # xmm0 = mem[0],zero
	vdivsd	64(%rsp), %xmm0, %xmm0
	movq	(%r14), %rcx
; │└
; │┌ @ broadcast.jl:801 within `materialize!'
; ││┌ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:886
; │││┌ @ broadcast.jl:869 within `preprocess'
; ││││┌ @ broadcast.jl:872 within `preprocess_args'
; │││││┌ @ broadcast.jl:870 within `preprocess'
; ││││││┌ @ broadcast.jl:592 within `extrude'
; │││││││┌ @ broadcast.jl:547 within `newindexer'
; ││││││││┌ @ broadcast.jl:548 within `shapeindexer'
; │││││││││┌ @ broadcast.jl:553 within `_newindexer'
; ││││││││││┌ @ operators.jl:193 within `!='
; │││││││││││┌ @ promotion.jl:403 within `=='
	cmpq	$1, %rdx
; │││└└└└└└└└└
; │││ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; │││┌ @ simdloop.jl:75 within `macro expansion'
	jne	L2119
	xorl	%edx, %edx
	nop
; ││││ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; ││││┌ @ broadcast.jl:558 within `getindex'
; │││││┌ @ broadcast.jl:598 within `_broadcast_getindex'
; ││││││┌ @ broadcast.jl:625 within `_broadcast_getindex_evalf'
; │││││││┌ @ float.jl:399 within `*'
L2096:
	vmulsd	(%rcx), %xmm0, %xmm1
; ││││└└└└
; ││││┌ @ array.jl:766 within `setindex!'
	vmovsd	%xmm1, (%rcx,%rdx,8)
; │││└└
; │││┌ @ int.jl:53 within `macro expansion'
	addq	$1, %rdx
; │││└
; │││┌ @ simdloop.jl:75 within `macro expansion'
; ││││┌ @ int.jl:49 within `<'
	cmpq	%rax, %rdx
; ││││└
	jb	L2096
	jmp	L2259
L2119:
	cmpq	$32, %rax
	jae	L2129
	xorl	%edx, %edx
	jmp	L2240
; ││││ @ simdloop.jl:75 within `macro expansion'
L2129:
	movabsq	$9223372036854775776, %rdx # imm = 0x7FFFFFFFFFFFFFE0
	andq	%rax, %rdx
	vbroadcastsd	%xmm0, %zmm1
	leaq	192(%rcx), %rsi
; ││││ @ simdloop.jl:78 within `macro expansion'
; ││││┌ @ int.jl:53 within `+'
	movq	%rdx, %rdi
	nop
; ││││└
; ││││ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; ││││┌ @ broadcast.jl:558 within `getindex'
; │││││┌ @ broadcast.jl:598 within `_broadcast_getindex'
; ││││││┌ @ broadcast.jl:625 within `_broadcast_getindex_evalf'
; │││││││┌ @ float.jl:399 within `*'
L2160:
	vmulpd	-192(%rsi), %zmm1, %zmm2
	vmulpd	-128(%rsi), %zmm1, %zmm3
	vmulpd	-64(%rsi), %zmm1, %zmm4
	vmulpd	(%rsi), %zmm1, %zmm5
; ││││└└└└
; ││││ @ simdloop.jl:77 within `macro expansion' @ array.jl:766
	vmovupd	%zmm2, -192(%rsi)
	vmovupd	%zmm3, -128(%rsi)
	vmovupd	%zmm4, -64(%rsi)
	vmovupd	%zmm5, (%rsi)
; ││││ @ simdloop.jl:78 within `macro expansion'
; ││││┌ @ int.jl:53 within `+'
	addq	$256, %rsi              # imm = 0x100
	addq	$-32, %rdi
	jne	L2160
; └└└└└
; ┌ @ int.jl within `logsumexp_sleefpirates!'
	cmpq	%rdx, %rax
; └
; ┌ @ simdloop.jl:75 within `logsumexp_sleefpirates!'
	je	L2259
	nopl	(%rax,%rax)
; └
; ┌ @ REPL[43]:12 within `logsumexp_sleefpirates!'
; │┌ @ broadcast.jl:801 within `materialize!'
; ││┌ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; │││┌ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; ││││┌ @ broadcast.jl:558 within `getindex'
; │││││┌ @ broadcast.jl:598 within `_broadcast_getindex'
; ││││││┌ @ broadcast.jl:625 within `_broadcast_getindex_evalf'
; │││││││┌ @ float.jl:399 within `*'
L2240:
	vmulsd	(%rcx,%rdx,8), %xmm0, %xmm1
; ││││└└└└
; ││││ @ simdloop.jl:77 within `macro expansion' @ array.jl:766
	vmovsd	%xmm1, (%rcx,%rdx,8)
; ││││ @ simdloop.jl:78 within `macro expansion'
; ││││┌ @ int.jl:53 within `+'
	addq	$1, %rdx
; ││││└
; ││││ @ simdloop.jl:75 within `macro expansion'
; ││││┌ @ int.jl:49 within `<'
	cmpq	%rax, %rdx
	jb	L2240
; │└└└└
L2259:
	movq	%r14, %rax
	addq	$224, %rsp
	popq	%rbx
	popq	%r14
	popq	%r15
	vzeroupper
	retq
; │ @ REPL[43]:5 within `logsumexp_sleefpirates!'
; │┌ @ simdloop.jl:71 within `macro expansion'
; ││┌ @ simdloop.jl:51 within `simd_inner_length'
; │││┌ @ range.jl:541 within `length'
; ││││┌ @ checked.jl:166 within `checked_add'
L2278:
	movabsq	$throw_overflowerr_binaryop, %rax
	movabsq	$139757138773328, %rdi  # imm = 0x7F1BBE9C8550
	movl	$1, %edx
	callq	*%rax
	ud2
	nopw	%cs:(%rax,%rax)
; └└└└└

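As the listing shows, the loop calling `SLEEFPirates.exp` is vectorized: the main loop body works on 512-bit `zmm` registers and advances 32 `Float64` elements per iteration.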

9 Likes