LLVM. GCC will vectorize log/exp/sin/etc. when given the appropriate optimization flags, but LLVM additionally needs -fveclib=SVML
(or some other vector library) on top of those flags.
using SIMDPirates, SLEEFPirates, LoopVectorization
"""
    logsumexp_simdpirates!(w::Vector{T}, we) where T

Normalize the log-weights `w` in place using an explicitly vectorized (4-lane)
log-sum-exp, and fill `we` with the corresponding normalized weights.

With `offset = maximum(w)` and `s = sum(exp.(w .- offset))` computed from the
incoming `w`, the function leaves

- `w[i]  == w_in[i] - (log(s) + offset)`
- `we[i] == exp(w_in[i] - offset) / s`

and returns `we`.

`we` must provide at least `length(w)` elements: the loops run under `@inbounds`,
so a shorter `we` is not detected.
"""
function logsumexp_simdpirates!(w::Vector{T}, we) where T
    # Subtracting the maximum keeps every exponent <= 0, so `exp` cannot overflow.
    offset = maximum(w)
    N = length(w)
    # Each vector iteration touches 4 consecutive elements, so only the leading
    # `nvec` elements (largest multiple of 4 not exceeding N) may be processed that
    # way. Iterating `1:4:N` directly would load from `w` and store to `we` past
    # index N whenever N is not a multiple of 4.
    nvec = N - N % 4
    sl = SIMDPirates.Vec{4,T}((0.,0.,0.,0.))
    @inbounds @simd for i = 1:4:nvec
        wl = SIMDPirates.vload(SIMDPirates.Vec{4,T}, w, i)
        @pirate wel = SLEEFPirates.exp(wl-offset)
        @pirate sl += wel
        SIMDPirates.vstore!(we,wel,i)
    end
    # Horizontal reduction of the 4 partial sums.
    s = vsum(sl)
    # Scalar tail: the remaining N - nvec (0 to 3) elements.
    @inbounds for j = nvec+1:N
        e = SLEEFPirates.exp(w[j] - offset)
        we[j] = e
        s += e
    end
    w .-= log(s) + offset
    we .*= 1/s
    return we
end
# Normalize the log-weights `w` in place via log-sum-exp, with the main loop handed to
# LoopVectorization's `@vectorize` macro; `we` receives the normalized weights.
# With offset = maximum(w) and s = sum over i of exp(w[i] - offset), on exit:
#   w[i]  is the original w[i] minus (log(s) + offset)
#   we[i] is exp(original w[i] - offset) / s
# The function's value is that of its last statement, `we .*= 1/s`.
# `we` is written at indices 1:length(w), so it needs at least that many elements.
function logsumexp_loopvectorization!(w::Vector{T},we) where T
# Subtracting the maximum keeps every exponent <= 0, so `exp` cannot overflow.
offset = maximum(w)
N = length(w)
# Accumulator for the sum of shifted exponentials, in the element type of `w`.
s = zero(T)
# NOTE(review): `@vectorize` presumably rewrites this loop (including the `exp` call)
# into SIMD code and handles any remainder itself — confirm against the
# LoopVectorization version in use.
@vectorize for i = 1:N
wl = w[i]
wel = exp(wl-offset)
we[i] = wel
s += wel
end
# log(s) + offset is the log-sum-exp of the original `w`.
w .-= log(s) + offset
# Scale the stored exponentials so that they sum to one.
we .*= 1/s
end
"""
    logsumexp_sleefpirates!(w::Vector{T}, we) where T

Normalize the log-weights `w` in place via log-sum-exp, evaluating the exponential
with `SLEEFPirates.exp` inside a plain `@inbounds @simd` loop, and fill `we` with
the corresponding normalized weights.

With `shift = maximum(w)` and `total = sum(exp.(w .- shift))` taken from the incoming
`w`, the function subtracts `log(total) + shift` from every element of `w`, stores
`exp(w_in[k] - shift) / total` in `we[k]`, and returns `we`.

`we` is indexed with the indices of `w` under `@inbounds`, so it must be at least
as long as `w`; this is not checked.
"""
function logsumexp_sleefpirates!(w::Vector{T}, we) where T
    # Shifting by the maximum keeps every exponent <= 0, so `exp` cannot overflow.
    shift = maximum(w)
    total = zero(T)
    @inbounds @simd for k in eachindex(w)
        e = SLEEFPirates.exp(w[k] - shift)
        we[k] = e
        total += e
    end
    # log(total) + shift is the log-sum-exp of the original `w`.
    w .-= log(total) + shift
    # Rescale the stored exponentials so that they sum to one.
    we .*= 1 / total
    return we
end
A few results:
julia> @btime logsumexp!($w,$we);
7.844 μs (0 allocations: 0 bytes)
julia> @btime logsumexp_yeppp!($w,$we);
9.393 μs (0 allocations: 0 bytes)
julia> @btime logsumexp_simdpirates!($w,$we);
2.576 μs (0 allocations: 0 bytes)
julia> @btime logsumexp_loopvectorization!($w,$we);
1.825 μs (0 allocations: 0 bytes)
julia> @btime logsumexp_sleefpirates!($w,$we);
1.705 μs (0 allocations: 0 bytes)
julia> @code_native logsumexp_sleefpirates!(w,we)
.text
; ┌ @ REPL[43]:2 within `logsumexp_sleefpirates!'
pushq %r15
pushq %r14
pushq %rbx
subq $224, %rsp
movq %rsi, 56(%rsp)
movq (%rsi), %r15
movq 8(%rsi), %r14
; │┌ @ reducedim.jl:652 within `maximum'
; ││┌ @ reducedim.jl:652 within `#maximum#562'
; │││┌ @ reducedim.jl:656 within `_maximum' @ reducedim.jl:657
; ││││┌ @ reducedim.jl:307 within `mapreduce'
; │││││┌ @ reducedim.jl:307 within `#mapreduce#555'
; ││││││┌ @ reducedim.jl:312 within `_mapreduce_dim'
movabsq $"size;", %rax
movq %r15, %rdi
callq *%rax
; │└└└└└└
; │ @ REPL[43]:3 within `logsumexp_sleefpirates!'
; │┌ @ array.jl:200 within `length'
movq 8(%r15), %rax
; │└
; │ @ REPL[43]:5 within `logsumexp_sleefpirates!'
; │┌ @ simdloop.jl:69 within `macro expansion'
; ││┌ @ range.jl:5 within `Colon'
; │││┌ @ range.jl:275 within `Type'
; ││││┌ @ range.jl:280 within `unitrange_last'
movq %rax, %rcx
sarq $63, %rcx
andnq %rax, %rcx, %r8
; │└└└└
; │┌ @ checked.jl:194 within `macro expansion'
leaq -1(%r8), %rsi
; │└
; │┌ @ simdloop.jl:71 within `macro expansion'
; ││┌ @ simdloop.jl:51 within `simd_inner_length'
; │││┌ @ range.jl:541 within `length'
; ││││┌ @ checked.jl:165 within `checked_add'
; │││││┌ @ checked.jl:132 within `add_with_overflow'
movq %rsi, %r9
incq %r9
; │││││└
; │││││ @ checked.jl:166 within `checked_add'
jo L2278
; │└└└└
; │┌ @ int.jl:49 within `macro expansion'
testq %r9, %r9
vmovapd %xmm0, 32(%rsp)
; └└
; ┌ @ simdloop.jl:72 within `logsumexp_sleefpirates!'
jle L107
movq (%r15), %rcx
movq (%r14), %rdx
vxorpd %xmm19, %xmm19, %xmm19
; │ @ simdloop.jl:75 within `logsumexp_sleefpirates!'
cmpq $32, %r8
jae L118
xorl %esi, %esi
jmp L1345
L107:
vxorpd %xmm19, %xmm19, %xmm19
jmp L1757
; │ @ simdloop.jl:75 within `logsumexp_sleefpirates!'
L118:
leaq (%rcx,%r8,8), %rsi
cmpq %rsi, %rdx
jae L143
leaq (%rdx,%r8,8), %rsi
; │ @ simdloop.jl:75 within `logsumexp_sleefpirates!'
cmpq %rsi, %rcx
jae L143
xorl %esi, %esi
jmp L1345
; │ @ simdloop.jl:75 within `logsumexp_sleefpirates!'
L143:
movabsq $9223372036854775776, %rsi # imm = 0x7FFFFFFFFFFFFFE0
andq %r8, %rsi
vbroadcastsd %xmm0, %zmm0
vmovupd %zmm0, 64(%rsp)
vxorpd %xmm0, %xmm0, %xmm0
xorl %edi, %edi
movabsq $139756740332128, %rbx # imm = 0x7F1BA6DCCA60
vbroadcastsd (%rbx), %zmm1
vmovups %zmm1, 128(%rsp)
movabsq $139756740332136, %rbx # imm = 0x7F1BA6DCCA68
vbroadcastsd (%rbx), %zmm3
movabsq $139756740332144, %rbx # imm = 0x7F1BA6DCCA70
vbroadcastsd (%rbx), %zmm4
movabsq $139756740332152, %rbx # imm = 0x7F1BA6DCCA78
vbroadcastsd (%rbx), %zmm1
movabsq $139756740332160, %rbx # imm = 0x7F1BA6DCCA80
vbroadcastsd (%rbx), %zmm6
movabsq $139756740332168, %rbx # imm = 0x7F1BA6DCCA88
vbroadcastsd (%rbx), %zmm7
movabsq $139756740332176, %rbx # imm = 0x7F1BA6DCCA90
vbroadcastsd (%rbx), %zmm8
movabsq $139756740332184, %rbx # imm = 0x7F1BA6DCCA98
vbroadcastsd (%rbx), %zmm9
movabsq $139756740332192, %rbx # imm = 0x7F1BA6DCCAA0
vbroadcastsd (%rbx), %zmm10
movabsq $139756740332200, %rbx # imm = 0x7F1BA6DCCAA8
vbroadcastsd (%rbx), %zmm11
movabsq $139756740332208, %rbx # imm = 0x7F1BA6DCCAB0
vbroadcastsd (%rbx), %zmm12
movabsq $139756740332216, %rbx # imm = 0x7F1BA6DCCAB8
vbroadcastsd (%rbx), %zmm13
movabsq $139756740332224, %rbx # imm = 0x7F1BA6DCCAC0
vbroadcastsd (%rbx), %zmm14
movabsq $139756740332232, %rbx # imm = 0x7F1BA6DCCAC8
vbroadcastsd (%rbx), %zmm15
movabsq $139756740332240, %rbx # imm = 0x7F1BA6DCCAD0
vbroadcastsd (%rbx), %zmm16
vpbroadcastq (%rbx), %zmm17
vxorpd %xmm18, %xmm18, %xmm18
vxorpd %xmm19, %xmm19, %xmm19
vxorpd %xmm20, %xmm20, %xmm20
; └
; ┌ @ REPL[43]:5 within `logsumexp_sleefpirates!'
; │┌ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:6
; ││┌ @ array.jl:728 within `getindex'
L448:
vmovupd (%rcx,%rdi,8), %zmm5
vmovupd 64(%rcx,%rdi,8), %zmm21
vmovupd 128(%rcx,%rdi,8), %zmm22
vmovupd 192(%rcx,%rdi,8), %zmm23
vmovupd 64(%rsp), %zmm2
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ float.jl:397
vsubpd %zmm2, %zmm5, %zmm5
vsubpd %zmm2, %zmm21, %zmm29
vsubpd %zmm2, %zmm22, %zmm30
vsubpd %zmm2, %zmm23, %zmm31
vmovupd 128(%rsp), %zmm2
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:7
; ││┌ @ exp.jl:181 within `exp'
; │││┌ @ float.jl:399 within `*'
vmulpd %zmm2, %zmm5, %zmm21
vmulpd %zmm2, %zmm29, %zmm22
vmulpd %zmm2, %zmm30, %zmm23
vmulpd %zmm2, %zmm31, %zmm24
; ││└└
; ││┌ @ float.jl:370 within `exp'
vrndscalepd $4, %zmm21, %zmm25
vrndscalepd $4, %zmm22, %zmm26
vrndscalepd $4, %zmm23, %zmm27
vrndscalepd $4, %zmm24, %zmm28
; ││└
; ││┌ @ exp.jl:183 within `exp'
; │││┌ @ float.jl:304 within `unsafe_trunc'
vcvttpd2qq %zmm25, %zmm21
vcvttpd2qq %zmm26, %zmm22
vcvttpd2qq %zmm27, %zmm23
vcvttpd2qq %zmm28, %zmm24
; ││└└
; ││┌ @ float.jl:404 within `exp'
vfnmadd231pd %zmm3, %zmm25, %zmm5
vfnmadd231pd %zmm3, %zmm26, %zmm29
vfnmadd231pd %zmm3, %zmm27, %zmm30
vfnmadd231pd %zmm3, %zmm28, %zmm31
; ││└
; ││┌ @ exp.jl:186 within `exp'
; │││┌ @ float.jl:404 within `muladd'
vfnmadd213pd %zmm5, %zmm4, %zmm25
vfnmadd213pd %zmm29, %zmm4, %zmm26
vfnmadd213pd %zmm30, %zmm4, %zmm27
vfnmadd213pd %zmm31, %zmm4, %zmm28
; │││└
; │││ @ exp.jl:188 within `exp'
; │││┌ @ exp.jl:161 within `exp_kernel'
; ││││┌ @ math.jl:101 within `macro expansion'
; │││││┌ @ float.jl:404 within `muladd'
vmovapd %zmm1, %zmm29
vfmadd213pd %zmm6, %zmm25, %zmm29
vmovapd %zmm1, %zmm30
vfmadd213pd %zmm6, %zmm26, %zmm30
vmovapd %zmm1, %zmm31
vfmadd213pd %zmm6, %zmm27, %zmm31
vmovapd %zmm1, %zmm5
vfmadd213pd %zmm6, %zmm28, %zmm5
vfmadd213pd %zmm7, %zmm25, %zmm29
vfmadd213pd %zmm7, %zmm26, %zmm30
vfmadd213pd %zmm7, %zmm27, %zmm31
vfmadd213pd %zmm7, %zmm28, %zmm5
vfmadd213pd %zmm8, %zmm25, %zmm29
vfmadd213pd %zmm8, %zmm26, %zmm30
vfmadd213pd %zmm8, %zmm27, %zmm31
vfmadd213pd %zmm8, %zmm28, %zmm5
vfmadd213pd %zmm9, %zmm25, %zmm29
vfmadd213pd %zmm9, %zmm26, %zmm30
vfmadd213pd %zmm9, %zmm27, %zmm31
vfmadd213pd %zmm9, %zmm28, %zmm5
vfmadd213pd %zmm10, %zmm25, %zmm29
vfmadd213pd %zmm10, %zmm26, %zmm30
vfmadd213pd %zmm10, %zmm27, %zmm31
vfmadd213pd %zmm10, %zmm28, %zmm5
vfmadd213pd %zmm11, %zmm25, %zmm29
vfmadd213pd %zmm11, %zmm26, %zmm30
vfmadd213pd %zmm11, %zmm27, %zmm31
vfmadd213pd %zmm11, %zmm28, %zmm5
vfmadd213pd %zmm12, %zmm25, %zmm29
vfmadd213pd %zmm12, %zmm26, %zmm30
vfmadd213pd %zmm12, %zmm27, %zmm31
vfmadd213pd %zmm12, %zmm28, %zmm5
vfmadd213pd %zmm13, %zmm25, %zmm29
vfmadd213pd %zmm13, %zmm26, %zmm30
vfmadd213pd %zmm13, %zmm27, %zmm31
vfmadd213pd %zmm13, %zmm28, %zmm5
vfmadd213pd %zmm14, %zmm25, %zmm29
vfmadd213pd %zmm14, %zmm26, %zmm30
vfmadd213pd %zmm14, %zmm27, %zmm31
vfmadd213pd %zmm14, %zmm28, %zmm5
vfmadd213pd %zmm15, %zmm25, %zmm29
vfmadd213pd %zmm15, %zmm26, %zmm30
vfmadd213pd %zmm15, %zmm27, %zmm31
vfmadd213pd %zmm15, %zmm28, %zmm5
; ││└└└└
; ││┌ @ float.jl:399 within `exp'
vmulpd %zmm25, %zmm25, %zmm2
vmulpd %zmm29, %zmm2, %zmm2
vmulpd %zmm26, %zmm26, %zmm29
vmulpd %zmm30, %zmm29, %zmm29
vmulpd %zmm27, %zmm27, %zmm30
vmulpd %zmm31, %zmm30, %zmm30
vmulpd %zmm28, %zmm28, %zmm31
vmulpd %zmm5, %zmm31, %zmm5
; ││└
; ││┌ @ exp.jl:189 within `exp'
; │││┌ @ operators.jl:529 within `+' @ float.jl:395
vaddpd %zmm2, %zmm25, %zmm2
vaddpd %zmm29, %zmm26, %zmm25
vaddpd %zmm30, %zmm27, %zmm26
vaddpd %zmm5, %zmm28, %zmm5
; │││└
; │││┌ @ float.jl:395 within `+'
vaddpd %zmm16, %zmm2, %zmm2
vaddpd %zmm16, %zmm25, %zmm25
vaddpd %zmm16, %zmm26, %zmm26
vaddpd %zmm16, %zmm5, %zmm5
; │││└
; │││ @ exp.jl:190 within `exp'
; │││┌ @ priv.jl:51 within `ldexp2k'
; ││││┌ @ int.jl:444 within `>>' @ int.jl:437
vpsraq $1, %zmm21, %zmm27
vpsraq $1, %zmm22, %zmm28
vpsraq $1, %zmm23, %zmm29
vpsraq $1, %zmm24, %zmm30
; ││││└
; ││││ @ priv.jl:52 within `ldexp2k'
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
vpsllq $52, %zmm27, %zmm31
vpaddq %zmm17, %zmm31, %zmm31
; │││└└└└
; │││┌ @ float.jl:399 within `ldexp2k'
vmulpd %zmm31, %zmm2, %zmm2
; │││└
; │││┌ @ priv.jl:52 within `ldexp2k'
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
vpsllq $52, %zmm28, %zmm31
vpaddq %zmm17, %zmm31, %zmm31
; ││││└└└
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
vmulpd %zmm31, %zmm25, %zmm25
; ││││└└
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
vpsllq $52, %zmm29, %zmm31
vpaddq %zmm17, %zmm31, %zmm31
; ││││└└└
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
vmulpd %zmm31, %zmm26, %zmm26
; ││││└└
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
vpsllq $52, %zmm30, %zmm31
vpaddq %zmm17, %zmm31, %zmm31
; ││││└└└
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
vmulpd %zmm31, %zmm5, %zmm5
; ││││└└
; ││││┌ @ int.jl:52 within `-'
vpsubq %zmm27, %zmm21, %zmm21
vpsubq %zmm28, %zmm22, %zmm22
vpsubq %zmm29, %zmm23, %zmm23
vpsubq %zmm30, %zmm24, %zmm24
; │││└└
; │││┌ @ int.jl:439 within `ldexp2k'
vpsllq $52, %zmm21, %zmm21
vpaddq %zmm17, %zmm21, %zmm21
; │││└
; │││┌ @ priv.jl:52 within `ldexp2k'
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
vmulpd %zmm21, %zmm2, %zmm2
; ││││└└
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
vpsllq $52, %zmm22, %zmm21
vpaddq %zmm17, %zmm21, %zmm21
; ││││└└└
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
vmulpd %zmm21, %zmm25, %zmm21
; ││││└└
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
vpsllq $52, %zmm23, %zmm22
vpaddq %zmm17, %zmm22, %zmm22
; ││││└└└
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
vmulpd %zmm22, %zmm26, %zmm22
; ││││└└
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
vpsllq $52, %zmm24, %zmm23
vpaddq %zmm17, %zmm23, %zmm23
; ││││└└└
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
vmulpd %zmm23, %zmm5, %zmm5
; ││└└└└
; ││ @ simdloop.jl:77 within `macro expansion' @ array.jl:766
vmovupd %zmm2, (%rdx,%rdi,8)
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:9
; ││┌ @ float.jl:395 within `+'
vaddpd %zmm2, %zmm0, %zmm0
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:8
; ││┌ @ array.jl:766 within `setindex!'
vmovupd %zmm21, 64(%rdx,%rdi,8)
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:9
; ││┌ @ float.jl:395 within `+'
vaddpd %zmm21, %zmm18, %zmm18
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:8
; ││┌ @ array.jl:766 within `setindex!'
vmovupd %zmm22, 128(%rdx,%rdi,8)
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:9
; ││┌ @ float.jl:395 within `+'
vaddpd %zmm22, %zmm19, %zmm19
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:8
; ││┌ @ array.jl:766 within `setindex!'
vmovupd %zmm5, 192(%rdx,%rdi,8)
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:9
; ││┌ @ float.jl:395 within `+'
vaddpd %zmm5, %zmm20, %zmm20
; │└└
; │┌ @ int.jl:53 within `macro expansion'
addq $32, %rdi
cmpq %rdi, %rsi
jne L448
; │└
; │┌ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:9
; ││┌ @ float.jl:395 within `+'
vaddpd %zmm0, %zmm18, %zmm0
vaddpd %zmm0, %zmm19, %zmm0
vaddpd %zmm0, %zmm20, %zmm0
vextractf64x4 $1, %zmm0, %ymm1
vaddpd %zmm1, %zmm0, %zmm0
vextractf128 $1, %ymm0, %xmm1
vaddpd %zmm1, %zmm0, %zmm0
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddpd %zmm1, %zmm0, %zmm19
cmpq %rsi, %r8
vmovapd 32(%rsp), %xmm0
; └└└
; ┌ @ simdloop.jl:75 within `logsumexp_sleefpirates!'
je L1757
L1345:
movabsq $4607182418800017408, %rdi # imm = 0x3FF0000000000000
movabsq $139756740332128, %rbx # imm = 0x7F1BA6DCCA60
vmovsd (%rbx), %xmm8 # xmm8 = mem[0],zero
movabsq $139756740332248, %rbx # imm = 0x7F1BA6DCCAD8
vmovsd (%rbx), %xmm9 # xmm9 = mem[0],zero
movabsq $139756740332256, %rbx # imm = 0x7F1BA6DCCAE0
vmovsd (%rbx), %xmm10 # xmm10 = mem[0],zero
movabsq $139756740332152, %rbx # imm = 0x7F1BA6DCCA78
vmovsd (%rbx), %xmm11 # xmm11 = mem[0],zero
movabsq $139756740332160, %rbx # imm = 0x7F1BA6DCCA80
vmovsd (%rbx), %xmm12 # xmm12 = mem[0],zero
movabsq $139756740332168, %rbx # imm = 0x7F1BA6DCCA88
vmovsd (%rbx), %xmm13 # xmm13 = mem[0],zero
movabsq $139756740332176, %rbx # imm = 0x7F1BA6DCCA90
vmovsd (%rbx), %xmm14 # xmm14 = mem[0],zero
movabsq $139756740332184, %rbx # imm = 0x7F1BA6DCCA98
vmovsd (%rbx), %xmm15 # xmm15 = mem[0],zero
movabsq $139756740332192, %rbx # imm = 0x7F1BA6DCCAA0
vmovsd (%rbx), %xmm16 # xmm16 = mem[0],zero
movabsq $139756740332200, %rbx # imm = 0x7F1BA6DCCAA8
vmovsd (%rbx), %xmm17 # xmm17 = mem[0],zero
movabsq $139756740332208, %rbx # imm = 0x7F1BA6DCCAB0
vmovsd (%rbx), %xmm18 # xmm18 = mem[0],zero
movabsq $139756740332216, %rbx # imm = 0x7F1BA6DCCAB8
vmovsd (%rbx), %xmm3 # xmm3 = mem[0],zero
movabsq $139756740332224, %rbx # imm = 0x7F1BA6DCCAC0
vmovsd (%rbx), %xmm4 # xmm4 = mem[0],zero
movabsq $139756740332232, %rbx # imm = 0x7F1BA6DCCAC8
vmovsd (%rbx), %xmm5 # xmm5 = mem[0],zero
movabsq $139756740332240, %rbx # imm = 0x7F1BA6DCCAD0
vmovsd (%rbx), %xmm6 # xmm6 = mem[0],zero
nopw %cs:(%rax,%rax)
; └
; ┌ @ REPL[43]:5 within `logsumexp_sleefpirates!'
; │┌ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:6
; ││┌ @ array.jl:728 within `getindex'
L1584:
vmovsd (%rcx,%rsi,8), %xmm7 # xmm7 = mem[0],zero
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ float.jl:397
vsubsd %xmm0, %xmm7, %xmm7
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:7
; ││┌ @ exp.jl:181 within `exp'
; │││┌ @ float.jl:399 within `*'
vmulsd %xmm8, %xmm7, %xmm1
; │││└
; │││┌ @ floatfuncs.jl:129 within `round' @ float.jl:370
vrndscalesd $4, %xmm1, %xmm1, %xmm1
; ││└└
; ││┌ @ float.jl:304 within `exp'
vcvttsd2si %xmm1, %rax
; ││└
; ││┌ @ exp.jl:185 within `exp'
; │││┌ @ float.jl:404 within `muladd'
vfmadd231sd %xmm9, %xmm1, %xmm7
; │││└
; │││ @ exp.jl:186 within `exp'
; │││┌ @ float.jl:404 within `muladd'
vfmadd231sd %xmm10, %xmm1, %xmm7
; │││└
; │││ @ exp.jl:188 within `exp'
; │││┌ @ exp.jl:161 within `exp_kernel'
; ││││┌ @ math.jl:101 within `macro expansion'
; │││││┌ @ float.jl:404 within `muladd'
vmovapd %xmm11, %xmm2
vfmadd213sd %xmm12, %xmm7, %xmm2
vfmadd213sd %xmm13, %xmm7, %xmm2
vfmadd213sd %xmm14, %xmm7, %xmm2
vfmadd213sd %xmm15, %xmm7, %xmm2
vfmadd213sd %xmm16, %xmm7, %xmm2
vfmadd213sd %xmm17, %xmm7, %xmm2
vfmadd213sd %xmm18, %xmm7, %xmm2
vfmadd213sd %xmm3, %xmm7, %xmm2
vfmadd213sd %xmm4, %xmm7, %xmm2
vfmadd213sd %xmm5, %xmm7, %xmm2
; ││└└└└
; ││┌ @ float.jl:399 within `exp'
vmulsd %xmm7, %xmm7, %xmm1
vmulsd %xmm2, %xmm1, %xmm1
; ││└
; ││┌ @ exp.jl:189 within `exp'
; │││┌ @ operators.jl:529 within `+' @ float.jl:395
vaddsd %xmm1, %xmm7, %xmm1
; │││└
; │││┌ @ float.jl:395 within `+'
vaddsd %xmm6, %xmm1, %xmm2
; ││└└
; ││┌ @ int.jl:437 within `exp'
movq %rax, %rbx
sarq %rbx
; ││└
; ││┌ @ exp.jl:190 within `exp'
; │││┌ @ int.jl:52 within `ldexp2k'
subl %ebx, %eax
; │││└
; │││┌ @ priv.jl:52 within `ldexp2k'
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
shlq $52, %rbx
addq %rdi, %rbx
; │││││└└
; │││││┌ @ essentials.jl:417 within `integer2float'
vmovq %rbx, %xmm1
; ││││└└
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
vmulsd %xmm1, %xmm2, %xmm2
; ││││└└
; ││││┌ @ utils.jl:51 within `pow2i'
; │││││┌ @ utils.jl:20 within `integer2float'
; ││││││┌ @ int.jl:446 within `<<' @ int.jl:439
shlq $52, %rax
addq %rdi, %rax
; ││││││└
; ││││││┌ @ essentials.jl:417 within `reinterpret'
vmovq %rax, %xmm1
; ││││└└└
; ││││┌ @ floating_point_arithmetic.jl:62 within `evmul'
; │││││┌ @ float.jl:399 within `*'
vmulsd %xmm1, %xmm2, %xmm1
; ││└└└└
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:8
; ││┌ @ array.jl:766 within `setindex!'
vmovsd %xmm1, (%rdx,%rsi,8)
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ float.jl:395
vaddsd %xmm1, %xmm19, %xmm19
; ││ @ simdloop.jl:78 within `macro expansion'
; ││┌ @ int.jl:53 within `+'
addq $1, %rsi
; ││└
; ││ @ simdloop.jl:75 within `macro expansion'
; ││┌ @ int.jl:49 within `<'
cmpq %r9, %rsi
; ││└
jb L1584
; │└
; │ @ REPL[43]:11 within `logsumexp_sleefpirates!'
L1757:
movabsq $"<;", %rax
vmovupd %zmm19, 64(%rsp)
vmovapd %xmm19, %xmm0
vzeroupper
callq *%rax
; │┌ @ broadcast.jl:801 within `materialize!'
; ││┌ @ abstractarray.jl:75 within `axes'
; │││┌ @ array.jl:155 within `size'
movq 24(%r15), %rdx
; ││└└
; ││┌ @ promotion.jl:414 within `axes'
testq %rdx, %rdx
; ││└
; ││┌ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; │││┌ @ simdloop.jl:72 within `macro expansion'
jle L2039
; └└└└
; ┌ @ simdloop.jl within `logsumexp_sleefpirates!'
movq %rdx, %rax
sarq $63, %rax
andnq %rdx, %rax, %rax
; └
; ┌ @ REPL[43]:11 within `logsumexp_sleefpirates!'
; │┌ @ float.jl:395 within `+'
vaddsd 32(%rsp), %xmm0, %xmm0
movq (%r15), %rcx
; │└
; │┌ @ broadcast.jl:801 within `materialize!'
; ││┌ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:886
; │││┌ @ broadcast.jl:869 within `preprocess'
; ││││┌ @ broadcast.jl:872 within `preprocess_args'
; │││││┌ @ broadcast.jl:870 within `preprocess'
; ││││││┌ @ broadcast.jl:592 within `extrude'
; │││││││┌ @ broadcast.jl:547 within `newindexer'
; ││││││││┌ @ broadcast.jl:548 within `shapeindexer'
; │││││││││┌ @ broadcast.jl:553 within `_newindexer'
; ││││││││││┌ @ operators.jl:193 within `!='
; │││││││││││┌ @ promotion.jl:403 within `=='
cmpq $1, %rdx
; │││└└└└└└└└└
; │││ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; │││┌ @ simdloop.jl:75 within `macro expansion'
jne L1867
xorl %edx, %edx
nopw %cs:(%rax,%rax)
; ││││ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; ││││┌ @ broadcast.jl:558 within `getindex'
; │││││┌ @ broadcast.jl:597 within `_broadcast_getindex'
; ││││││┌ @ broadcast.jl:621 within `_getindex'
; │││││││┌ @ broadcast.jl:591 within `_broadcast_getindex'
; ││││││││┌ @ array.jl:728 within `getindex'
L1840:
vmovsd (%rcx), %xmm1 # xmm1 = mem[0],zero
; ││││││└└└
; ││││││ @ broadcast.jl:598 within `_broadcast_getindex'
; ││││││┌ @ broadcast.jl:625 within `_broadcast_getindex_evalf'
; │││││││┌ @ float.jl:397 within `-'
vsubsd %xmm0, %xmm1, %xmm1
; ││││└└└└
; ││││┌ @ array.jl:766 within `setindex!'
vmovsd %xmm1, (%rcx,%rdx,8)
; │││└└
; │││┌ @ int.jl:53 within `macro expansion'
addq $1, %rdx
; │││└
; │││┌ @ simdloop.jl:75 within `macro expansion'
; ││││┌ @ int.jl:49 within `<'
cmpq %rax, %rdx
; ││││└
jb L1840
jmp L2039
L1867:
cmpq $32, %rax
jae L1880
xorl %edx, %edx
jmp L2016
; ││││ @ simdloop.jl:75 within `macro expansion'
L1880:
movabsq $9223372036854775776, %rdx # imm = 0x7FFFFFFFFFFFFFE0
andq %rax, %rdx
vbroadcastsd %xmm0, %zmm1
leaq 192(%rcx), %rsi
; ││││ @ simdloop.jl:78 within `macro expansion'
; ││││┌ @ int.jl:53 within `+'
movq %rdx, %rdi
nopw %cs:(%rax,%rax)
; ││││└
; ││││ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; ││││┌ @ broadcast.jl:558 within `getindex'
; │││││┌ @ broadcast.jl:597 within `_broadcast_getindex'
; ││││││┌ @ broadcast.jl:621 within `_getindex'
; │││││││┌ @ broadcast.jl:591 within `_broadcast_getindex'
; ││││││││┌ @ array.jl:728 within `getindex'
L1920:
vmovupd -192(%rsi), %zmm2
vmovupd -128(%rsi), %zmm3
vmovupd -64(%rsi), %zmm4
vmovupd (%rsi), %zmm5
; ││││││└└└
; ││││││ @ broadcast.jl:598 within `_broadcast_getindex'
; ││││││┌ @ broadcast.jl:625 within `_broadcast_getindex_evalf'
; │││││││┌ @ float.jl:397 within `-'
vsubpd %zmm1, %zmm2, %zmm2
vsubpd %zmm1, %zmm3, %zmm3
vsubpd %zmm1, %zmm4, %zmm4
vsubpd %zmm1, %zmm5, %zmm5
; ││││└└└└
; ││││ @ simdloop.jl:77 within `macro expansion' @ array.jl:766
vmovupd %zmm2, -192(%rsi)
vmovupd %zmm3, -128(%rsi)
vmovupd %zmm4, -64(%rsi)
vmovupd %zmm5, (%rsi)
; ││││ @ simdloop.jl:78 within `macro expansion'
; ││││┌ @ int.jl:53 within `+'
addq $256, %rsi # imm = 0x100
addq $-32, %rdi
jne L1920
; └└└└└
; ┌ @ int.jl within `logsumexp_sleefpirates!'
cmpq %rdx, %rax
; └
; ┌ @ simdloop.jl:75 within `logsumexp_sleefpirates!'
je L2039
; └
; ┌ @ REPL[43]:11 within `logsumexp_sleefpirates!'
; │┌ @ broadcast.jl:801 within `materialize!'
; ││┌ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; │││┌ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; ││││┌ @ broadcast.jl:558 within `getindex'
; │││││┌ @ broadcast.jl:597 within `_broadcast_getindex'
; ││││││┌ @ broadcast.jl:621 within `_getindex'
; │││││││┌ @ broadcast.jl:591 within `_broadcast_getindex'
; ││││││││┌ @ array.jl:728 within `getindex'
L2016:
vmovsd (%rcx,%rdx,8), %xmm1 # xmm1 = mem[0],zero
; │││││└└└└
; │││││┌ @ float.jl:397 within `_broadcast_getindex'
vsubsd %xmm0, %xmm1, %xmm1
; ││││└└
; ││││┌ @ array.jl:766 within `setindex!'
vmovsd %xmm1, (%rcx,%rdx,8)
; ││││└
; ││││ @ simdloop.jl:78 within `macro expansion'
; ││││┌ @ int.jl:53 within `+'
addq $1, %rdx
; ││││└
; ││││ @ simdloop.jl:75 within `macro expansion'
; ││││┌ @ int.jl:49 within `<'
cmpq %rax, %rdx
jb L2016
; │└└└└
; │ @ REPL[43]:12 within `logsumexp_sleefpirates!'
; │┌ @ broadcast.jl:801 within `materialize!'
; ││┌ @ abstractarray.jl:75 within `axes'
; │││┌ @ array.jl:155 within `size'
L2039:
movq 24(%r14), %rdx
; ││└└
; ││┌ @ promotion.jl:414 within `axes'
testq %rdx, %rdx
; ││└
; ││┌ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; │││┌ @ simdloop.jl:72 within `macro expansion'
jle L2259
; └└└└
; ┌ @ simdloop.jl within `logsumexp_sleefpirates!'
movq %rdx, %rax
sarq $63, %rax
andnq %rdx, %rax, %rax
movabsq $139756740332240, %rcx # imm = 0x7F1BA6DCCAD0
; └
; ┌ @ REPL[43]:12 within `logsumexp_sleefpirates!'
; │┌ @ promotion.jl:316 within `/' @ float.jl:401
vmovsd (%rcx), %xmm0 # xmm0 = mem[0],zero
vdivsd 64(%rsp), %xmm0, %xmm0
movq (%r14), %rcx
; │└
; │┌ @ broadcast.jl:801 within `materialize!'
; ││┌ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:886
; │││┌ @ broadcast.jl:869 within `preprocess'
; ││││┌ @ broadcast.jl:872 within `preprocess_args'
; │││││┌ @ broadcast.jl:870 within `preprocess'
; ││││││┌ @ broadcast.jl:592 within `extrude'
; │││││││┌ @ broadcast.jl:547 within `newindexer'
; ││││││││┌ @ broadcast.jl:548 within `shapeindexer'
; │││││││││┌ @ broadcast.jl:553 within `_newindexer'
; ││││││││││┌ @ operators.jl:193 within `!='
; │││││││││││┌ @ promotion.jl:403 within `=='
cmpq $1, %rdx
; │││└└└└└└└└└
; │││ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; │││┌ @ simdloop.jl:75 within `macro expansion'
jne L2119
xorl %edx, %edx
nop
; ││││ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; ││││┌ @ broadcast.jl:558 within `getindex'
; │││││┌ @ broadcast.jl:598 within `_broadcast_getindex'
; ││││││┌ @ broadcast.jl:625 within `_broadcast_getindex_evalf'
; │││││││┌ @ float.jl:399 within `*'
L2096:
vmulsd (%rcx), %xmm0, %xmm1
; ││││└└└└
; ││││┌ @ array.jl:766 within `setindex!'
vmovsd %xmm1, (%rcx,%rdx,8)
; │││└└
; │││┌ @ int.jl:53 within `macro expansion'
addq $1, %rdx
; │││└
; │││┌ @ simdloop.jl:75 within `macro expansion'
; ││││┌ @ int.jl:49 within `<'
cmpq %rax, %rdx
; ││││└
jb L2096
jmp L2259
L2119:
cmpq $32, %rax
jae L2129
xorl %edx, %edx
jmp L2240
; ││││ @ simdloop.jl:75 within `macro expansion'
L2129:
movabsq $9223372036854775776, %rdx # imm = 0x7FFFFFFFFFFFFFE0
andq %rax, %rdx
vbroadcastsd %xmm0, %zmm1
leaq 192(%rcx), %rsi
; ││││ @ simdloop.jl:78 within `macro expansion'
; ││││┌ @ int.jl:53 within `+'
movq %rdx, %rdi
nop
; ││││└
; ││││ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; ││││┌ @ broadcast.jl:558 within `getindex'
; │││││┌ @ broadcast.jl:598 within `_broadcast_getindex'
; ││││││┌ @ broadcast.jl:625 within `_broadcast_getindex_evalf'
; │││││││┌ @ float.jl:399 within `*'
L2160:
vmulpd -192(%rsi), %zmm1, %zmm2
vmulpd -128(%rsi), %zmm1, %zmm3
vmulpd -64(%rsi), %zmm1, %zmm4
vmulpd (%rsi), %zmm1, %zmm5
; ││││└└└└
; ││││ @ simdloop.jl:77 within `macro expansion' @ array.jl:766
vmovupd %zmm2, -192(%rsi)
vmovupd %zmm3, -128(%rsi)
vmovupd %zmm4, -64(%rsi)
vmovupd %zmm5, (%rsi)
; ││││ @ simdloop.jl:78 within `macro expansion'
; ││││┌ @ int.jl:53 within `+'
addq $256, %rsi # imm = 0x100
addq $-32, %rdi
jne L2160
; └└└└└
; ┌ @ int.jl within `logsumexp_sleefpirates!'
cmpq %rdx, %rax
; └
; ┌ @ simdloop.jl:75 within `logsumexp_sleefpirates!'
je L2259
nopl (%rax,%rax)
; └
; ┌ @ REPL[43]:12 within `logsumexp_sleefpirates!'
; │┌ @ broadcast.jl:801 within `materialize!'
; ││┌ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; │││┌ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; ││││┌ @ broadcast.jl:558 within `getindex'
; │││││┌ @ broadcast.jl:598 within `_broadcast_getindex'
; ││││││┌ @ broadcast.jl:625 within `_broadcast_getindex_evalf'
; │││││││┌ @ float.jl:399 within `*'
L2240:
vmulsd (%rcx,%rdx,8), %xmm0, %xmm1
; ││││└└└└
; ││││ @ simdloop.jl:77 within `macro expansion' @ array.jl:766
vmovsd %xmm1, (%rcx,%rdx,8)
; ││││ @ simdloop.jl:78 within `macro expansion'
; ││││┌ @ int.jl:53 within `+'
addq $1, %rdx
; ││││└
; ││││ @ simdloop.jl:75 within `macro expansion'
; ││││┌ @ int.jl:49 within `<'
cmpq %rax, %rdx
jb L2240
; │└└└└
L2259:
movq %r14, %rax
addq $224, %rsp
popq %rbx
popq %r14
popq %r15
vzeroupper
retq
; │ @ REPL[43]:5 within `logsumexp_sleefpirates!'
; │┌ @ simdloop.jl:71 within `macro expansion'
; ││┌ @ simdloop.jl:51 within `simd_inner_length'
; │││┌ @ range.jl:541 within `length'
; ││││┌ @ checked.jl:166 within `checked_add'
L2278:
movabsq $throw_overflowerr_binaryop, %rax
movabsq $139757138773328, %rdi # imm = 0x7F1BBE9C8550
movl $1, %edx
callq *%rax
ud2
nopw %cs:(%rax,%rax)
; └└└└└
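SLEEFPirates is vectorized: the main loop above (label L448) evaluates `exp` entirely with packed-double instructions on 512-bit zmm registers, processing four 8-element vectors (32 values) per iteration.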