LLVM. GCC will vectorize log/exp/sin/etc with the appropriate optimization flags, but LLVM needs those in addition to -fveclib=SVML
or some other vector library.
using SIMDPirates, SLEEFPirates, LoopVectorization
function logsumexp_simdpirates!(w::Vector{T},we) where T
offset = maximum(w)
N = length(w)
sl = SIMDPirates.Vec{4,T}((0.,0.,0.,0.))
@inbounds @simd for i = 1:4:N
wl = SIMDPirates.vload(SIMDPirates.Vec{4,T}, w, i)
@pirate wel = SLEEFPirates.exp(wl-offset)
@pirate sl += wel
SIMDPirates.vstore!(we,wel,i)
end
s = vsum(sl)
w .-= log(s) + offset
we .*= 1/s
end
function logsumexp_loopvectorization!(w::Vector{T},we) where T
offset = maximum(w)
N = length(w)
s = zero(T)
@vectorize for i = 1:N
wl = w[i]
wel = exp(wl-offset)
we[i] = wel
s += wel
end
w .-= log(s) + offset
we .*= 1/s
end
function logsumexp_sleefpirates!(w::Vector{T},we) where T
offset = maximum(w)
N = length(w)
s = zero(T)
@inbounds @simd for i = 1:N
wl = w[i]
wel = SLEEFPirates.exp(wl-offset)
we[i] = wel
s += wel
end
w .-= log(s) + offset
we .*= 1/s
end
A few results:
julia> @btime logsumexp!($w,$we);
7.844 ΞΌs (0 allocations: 0 bytes)
julia> @btime logsumexp_yeppp!($w,$we);
9.393 ΞΌs (0 allocations: 0 bytes)
julia> @btime logsumexp_simdpirates!($w,$we);
2.576 ΞΌs (0 allocations: 0 bytes)
julia> @btime logsumexp_loopvectorization!($w,$we);
1.825 ΞΌs (0 allocations: 0 bytes)
julia> @btime logsumexp_sleefpirates!($w,$we);
1.705 ΞΌs (0 allocations: 0 bytes)
julia> @code_native logsumexp_sleefpirates!(w,we)
.text
; β @ REPL[43]:2 within `logsumexp_sleefpirates!'
pushq %r15
pushq %r14
pushq %rbx
subq $224, %rsp
movq %rsi, 56(%rsp)
movq (%rsi), %r15
movq 8(%rsi), %r14
; ββ @ reducedim.jl:652 within `maximum'
; βββ @ reducedim.jl:652 within `#maximum#562'
; ββββ @ reducedim.jl:656 within `_maximum' @ reducedim.jl:657
; βββββ @ reducedim.jl:307 within `mapreduce'
; ββββββ @ reducedim.jl:307 within `#mapreduce#555'
; βββββββ @ reducedim.jl:312 within `_mapreduce_dim'
movabsq $"size;", %rax
movq %r15, %rdi
callq *%rax
; βββββββ
; β @ REPL[43]:3 within `logsumexp_sleefpirates!'
; ββ @ array.jl:200 within `length'
movq 8(%r15), %rax
; ββ
; β @ REPL[43]:5 within `logsumexp_sleefpirates!'
; ββ @ simdloop.jl:69 within `macro expansion'
; βββ @ range.jl:5 within `Colon'
; ββββ @ range.jl:275 within `Type'
; βββββ @ range.jl:280 within `unitrange_last'
movq %rax, %rcx
sarq $63, %rcx
andnq %rax, %rcx, %r8
; βββββ
; ββ @ checked.jl:194 within `macro expansion'
leaq -1(%r8), %rsi
; ββ
; ββ @ simdloop.jl:71 within `macro expansion'
; βββ @ simdloop.jl:51 within `simd_inner_length'
; ββββ @ range.jl:541 within `length'
; βββββ @ checked.jl:165 within `checked_add'
; ββββββ @ checked.jl:132 within `add_with_overflow'
movq %rsi, %r9
incq %r9
; ββββββ
; βββββ @ checked.jl:166 within `checked_add'
jo L2278
; βββββ
; ββ @ int.jl:49 within `macro expansion'
testq %r9, %r9
vmovapd %xmm0, 32(%rsp)
; ββ
; β @ simdloop.jl:72 within `logsumexp_sleefpirates!'
jle L107
movq (%r15), %rcx
movq (%r14), %rdx
vxorpd %xmm19, %xmm19, %xmm19
; β @ simdloop.jl:75 within `logsumexp_sleefpirates!'
cmpq $32, %r8
jae L118
xorl %esi, %esi
jmp L1345
L107:
vxorpd %xmm19, %xmm19, %xmm19
jmp L1757
; β @ simdloop.jl:75 within `logsumexp_sleefpirates!'
L118:
leaq (%rcx,%r8,8), %rsi
cmpq %rsi, %rdx
jae L143
leaq (%rdx,%r8,8), %rsi
; β @ simdloop.jl:75 within `logsumexp_sleefpirates!'
cmpq %rsi, %rcx
jae L143
xorl %esi, %esi
jmp L1345
; β @ simdloop.jl:75 within `logsumexp_sleefpirates!'
L143:
movabsq $9223372036854775776, %rsi # imm = 0x7FFFFFFFFFFFFFE0
andq %r8, %rsi
vbroadcastsd %xmm0, %zmm0
vmovupd %zmm0, 64(%rsp)
vxorpd %xmm0, %xmm0, %xmm0
xorl %edi, %edi
movabsq $139756740332128, %rbx # imm = 0x7F1BA6DCCA60
vbroadcastsd (%rbx), %zmm1
vmovups %zmm1, 128(%rsp)
movabsq $139756740332136, %rbx # imm = 0x7F1BA6DCCA68
vbroadcastsd (%rbx), %zmm3
movabsq $139756740332144, %rbx # imm = 0x7F1BA6DCCA70
vbroadcastsd (%rbx), %zmm4
movabsq $139756740332152, %rbx # imm = 0x7F1BA6DCCA78
vbroadcastsd (%rbx), %zmm1
movabsq $139756740332160, %rbx # imm = 0x7F1BA6DCCA80
vbroadcastsd (%rbx), %zmm6
movabsq $139756740332168, %rbx # imm = 0x7F1BA6DCCA88
vbroadcastsd (%rbx), %zmm7
movabsq $139756740332176, %rbx # imm = 0x7F1BA6DCCA90
vbroadcastsd (%rbx), %zmm8
movabsq $139756740332184, %rbx # imm = 0x7F1BA6DCCA98
vbroadcastsd (%rbx), %zmm9
movabsq $139756740332192, %rbx # imm = 0x7F1BA6DCCAA0
vbroadcastsd (%rbx), %zmm10
movabsq $139756740332200, %rbx # imm = 0x7F1BA6DCCAA8
vbroadcastsd (%rbx), %zmm11
movabsq $139756740332208, %rbx # imm = 0x7F1BA6DCCAB0
vbroadcastsd (%rbx), %zmm12
movabsq $139756740332216, %rbx # imm = 0x7F1BA6DCCAB8
vbroadcastsd (%rbx), %zmm13
movabsq $139756740332224, %rbx # imm = 0x7F1BA6DCCAC0
vbroadcastsd (%rbx), %zmm14
movabsq $139756740332232, %rbx # imm = 0x7F1BA6DCCAC8
vbroadcastsd (%rbx), %zmm15
movabsq $139756740332240, %rbx # imm = 0x7F1BA6DCCAD0
vbroadcastsd (%rbx), %zmm16
vpbroadcastq (%rbx), %zmm17
vxorpd %xmm18, %xmm18, %xmm18
vxorpd %xmm19, %xmm19, %xmm19
vxorpd %xmm20, %xmm20, %xmm20
; β
; β @ REPL[43]:5 within `logsumexp_sleefpirates!'
; ββ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:6
; βββ @ array.jl:728 within `getindex'
L448:
vmovupd (%rcx,%rdi,8), %zmm5
vmovupd 64(%rcx,%rdi,8), %zmm21
vmovupd 128(%rcx,%rdi,8), %zmm22
vmovupd 192(%rcx,%rdi,8), %zmm23
vmovupd 64(%rsp), %zmm2
; βββ
; ββ @ simdloop.jl:77 within `macro expansion' @ float.jl:397
vsubpd %zmm2, %zmm5, %zmm5
vsubpd %zmm2, %zmm21, %zmm29
vsubpd %zmm2, %zmm22, %zmm30
vsubpd %zmm2, %zmm23, %zmm31
vmovupd 128(%rsp), %zmm2
; ββ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:7
; βββ @ exp.jl:181 within `exp'
; ββββ @ float.jl:399 within `*'
vmulpd %zmm2, %zmm5, %zmm21
vmulpd %zmm2, %zmm29, %zmm22
vmulpd %zmm2, %zmm30, %zmm23
vmulpd %zmm2, %zmm31, %zmm24
; ββββ
; βββ @ float.jl:370 within `exp'
vrndscalepd $4, %zmm21, %zmm25
vrndscalepd $4, %zmm22, %zmm26
vrndscalepd $4, %zmm23, %zmm27
vrndscalepd $4, %zmm24, %zmm28
; βββ
; βββ @ exp.jl:183 within `exp'
; ββββ @ float.jl:304 within `unsafe_trunc'
vcvttpd2qq %zmm25, %zmm21
vcvttpd2qq %zmm26, %zmm22
vcvttpd2qq %zmm27, %zmm23
vcvttpd2qq %zmm28, %zmm24
; ββββ
; βββ @ float.jl:404 within `exp'
vfnmadd231pd %zmm3, %zmm25, %zmm5
vfnmadd231pd %zmm3, %zmm26, %zmm29
vfnmadd231pd %zmm3, %zmm27, %zmm30
vfnmadd231pd %zmm3, %zmm28, %zmm31
; βββ
; βββ @ exp.jl:186 within `exp'
; ββββ @ float.jl:404 within `muladd'
vfnmadd213pd %zmm5, %zmm4, %zmm25
vfnmadd213pd %zmm29, %zmm4, %zmm26
vfnmadd213pd %zmm30, %zmm4, %zmm27
vfnmadd213pd %zmm31, %zmm4, %zmm28
; ββββ
; βββ @ exp.jl:188 within `exp'
; ββββ @ exp.jl:161 within `exp_kernel'
; βββββ @ math.jl:101 within `macro expansion'
; ββββββ @ float.jl:404 within `muladd'
vmovapd %zmm1, %zmm29
vfmadd213pd %zmm6, %zmm25, %zmm29
vmovapd %zmm1, %zmm30
vfmadd213pd %zmm6, %zmm26, %zmm30
vmovapd %zmm1, %zmm31
vfmadd213pd %zmm6, %zmm27, %zmm31
vmovapd %zmm1, %zmm5
vfmadd213pd %zmm6, %zmm28, %zmm5
vfmadd213pd %zmm7, %zmm25, %zmm29
vfmadd213pd %zmm7, %zmm26, %zmm30
vfmadd213pd %zmm7, %zmm27, %zmm31
vfmadd213pd %zmm7, %zmm28, %zmm5
vfmadd213pd %zmm8, %zmm25, %zmm29
vfmadd213pd %zmm8, %zmm26, %zmm30
vfmadd213pd %zmm8, %zmm27, %zmm31
vfmadd213pd %zmm8, %zmm28, %zmm5
vfmadd213pd %zmm9, %zmm25, %zmm29
vfmadd213pd %zmm9, %zmm26, %zmm30
vfmadd213pd %zmm9, %zmm27, %zmm31
vfmadd213pd %zmm9, %zmm28, %zmm5
vfmadd213pd %zmm10, %zmm25, %zmm29
vfmadd213pd %zmm10, %zmm26, %zmm30
vfmadd213pd %zmm10, %zmm27, %zmm31
vfmadd213pd %zmm10, %zmm28, %zmm5
vfmadd213pd %zmm11, %zmm25, %zmm29
vfmadd213pd %zmm11, %zmm26, %zmm30
vfmadd213pd %zmm11, %zmm27, %zmm31
vfmadd213pd %zmm11, %zmm28, %zmm5
vfmadd213pd %zmm12, %zmm25, %zmm29
vfmadd213pd %zmm12, %zmm26, %zmm30
vfmadd213pd %zmm12, %zmm27, %zmm31
vfmadd213pd %zmm12, %zmm28, %zmm5
vfmadd213pd %zmm13, %zmm25, %zmm29
vfmadd213pd %zmm13, %zmm26, %zmm30
vfmadd213pd %zmm13, %zmm27, %zmm31
vfmadd213pd %zmm13, %zmm28, %zmm5
vfmadd213pd %zmm14, %zmm25, %zmm29
vfmadd213pd %zmm14, %zmm26, %zmm30
vfmadd213pd %zmm14, %zmm27, %zmm31
vfmadd213pd %zmm14, %zmm28, %zmm5
vfmadd213pd %zmm15, %zmm25, %zmm29
vfmadd213pd %zmm15, %zmm26, %zmm30
vfmadd213pd %zmm15, %zmm27, %zmm31
vfmadd213pd %zmm15, %zmm28, %zmm5
; ββββββ
; βββ @ float.jl:399 within `exp'
vmulpd %zmm25, %zmm25, %zmm2
vmulpd %zmm29, %zmm2, %zmm2
vmulpd %zmm26, %zmm26, %zmm29
vmulpd %zmm30, %zmm29, %zmm29
vmulpd %zmm27, %zmm27, %zmm30
vmulpd %zmm31, %zmm30, %zmm30
vmulpd %zmm28, %zmm28, %zmm31
vmulpd %zmm5, %zmm31, %zmm5
; βββ
; βββ @ exp.jl:189 within `exp'
; ββββ @ operators.jl:529 within `+' @ float.jl:395
vaddpd %zmm2, %zmm25, %zmm2
vaddpd %zmm29, %zmm26, %zmm25
vaddpd %zmm30, %zmm27, %zmm26
vaddpd %zmm5, %zmm28, %zmm5
; ββββ
; ββββ @ float.jl:395 within `+'
vaddpd %zmm16, %zmm2, %zmm2
vaddpd %zmm16, %zmm25, %zmm25
vaddpd %zmm16, %zmm26, %zmm26
vaddpd %zmm16, %zmm5, %zmm5
; ββββ
; βββ @ exp.jl:190 within `exp'
; ββββ @ priv.jl:51 within `ldexp2k'
; βββββ @ int.jl:444 within `>>' @ int.jl:437
vpsraq $1, %zmm21, %zmm27
vpsraq $1, %zmm22, %zmm28
vpsraq $1, %zmm23, %zmm29
vpsraq $1, %zmm24, %zmm30
; βββββ
; ββββ @ priv.jl:52 within `ldexp2k'
; βββββ @ utils.jl:51 within `pow2i'
; ββββββ @ utils.jl:20 within `integer2float'
; βββββββ @ int.jl:446 within `<<' @ int.jl:439
vpsllq $52, %zmm27, %zmm31
vpaddq %zmm17, %zmm31, %zmm31
; βββββββ
; ββββ @ float.jl:399 within `ldexp2k'
vmulpd %zmm31, %zmm2, %zmm2
; ββββ
; ββββ @ priv.jl:52 within `ldexp2k'
; βββββ @ utils.jl:51 within `pow2i'
; ββββββ @ utils.jl:20 within `integer2float'
; βββββββ @ int.jl:446 within `<<' @ int.jl:439
vpsllq $52, %zmm28, %zmm31
vpaddq %zmm17, %zmm31, %zmm31
; βββββββ
; βββββ @ floating_point_arithmetic.jl:62 within `evmul'
; ββββββ @ float.jl:399 within `*'
vmulpd %zmm31, %zmm25, %zmm25
; ββββββ
; βββββ @ utils.jl:51 within `pow2i'
; ββββββ @ utils.jl:20 within `integer2float'
; βββββββ @ int.jl:446 within `<<' @ int.jl:439
vpsllq $52, %zmm29, %zmm31
vpaddq %zmm17, %zmm31, %zmm31
; βββββββ
; βββββ @ floating_point_arithmetic.jl:62 within `evmul'
; ββββββ @ float.jl:399 within `*'
vmulpd %zmm31, %zmm26, %zmm26
; ββββββ
; βββββ @ utils.jl:51 within `pow2i'
; ββββββ @ utils.jl:20 within `integer2float'
; βββββββ @ int.jl:446 within `<<' @ int.jl:439
vpsllq $52, %zmm30, %zmm31
vpaddq %zmm17, %zmm31, %zmm31
; βββββββ
; βββββ @ floating_point_arithmetic.jl:62 within `evmul'
; ββββββ @ float.jl:399 within `*'
vmulpd %zmm31, %zmm5, %zmm5
; ββββββ
; βββββ @ int.jl:52 within `-'
vpsubq %zmm27, %zmm21, %zmm21
vpsubq %zmm28, %zmm22, %zmm22
vpsubq %zmm29, %zmm23, %zmm23
vpsubq %zmm30, %zmm24, %zmm24
; βββββ
; ββββ @ int.jl:439 within `ldexp2k'
vpsllq $52, %zmm21, %zmm21
vpaddq %zmm17, %zmm21, %zmm21
; ββββ
; ββββ @ priv.jl:52 within `ldexp2k'
; βββββ @ floating_point_arithmetic.jl:62 within `evmul'
; ββββββ @ float.jl:399 within `*'
vmulpd %zmm21, %zmm2, %zmm2
; ββββββ
; βββββ @ utils.jl:51 within `pow2i'
; ββββββ @ utils.jl:20 within `integer2float'
; βββββββ @ int.jl:446 within `<<' @ int.jl:439
vpsllq $52, %zmm22, %zmm21
vpaddq %zmm17, %zmm21, %zmm21
; βββββββ
; βββββ @ floating_point_arithmetic.jl:62 within `evmul'
; ββββββ @ float.jl:399 within `*'
vmulpd %zmm21, %zmm25, %zmm21
; ββββββ
; βββββ @ utils.jl:51 within `pow2i'
; ββββββ @ utils.jl:20 within `integer2float'
; βββββββ @ int.jl:446 within `<<' @ int.jl:439
vpsllq $52, %zmm23, %zmm22
vpaddq %zmm17, %zmm22, %zmm22
; βββββββ
; βββββ @ floating_point_arithmetic.jl:62 within `evmul'
; ββββββ @ float.jl:399 within `*'
vmulpd %zmm22, %zmm26, %zmm22
; ββββββ
; βββββ @ utils.jl:51 within `pow2i'
; ββββββ @ utils.jl:20 within `integer2float'
; βββββββ @ int.jl:446 within `<<' @ int.jl:439
vpsllq $52, %zmm24, %zmm23
vpaddq %zmm17, %zmm23, %zmm23
; βββββββ
; βββββ @ floating_point_arithmetic.jl:62 within `evmul'
; ββββββ @ float.jl:399 within `*'
vmulpd %zmm23, %zmm5, %zmm5
; ββββββ
; ββ @ simdloop.jl:77 within `macro expansion' @ array.jl:766
vmovupd %zmm2, (%rdx,%rdi,8)
; ββ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:9
; βββ @ float.jl:395 within `+'
vaddpd %zmm2, %zmm0, %zmm0
; βββ
; ββ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:8
; βββ @ array.jl:766 within `setindex!'
vmovupd %zmm21, 64(%rdx,%rdi,8)
; βββ
; ββ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:9
; βββ @ float.jl:395 within `+'
vaddpd %zmm21, %zmm18, %zmm18
; βββ
; ββ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:8
; βββ @ array.jl:766 within `setindex!'
vmovupd %zmm22, 128(%rdx,%rdi,8)
; βββ
; ββ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:9
; βββ @ float.jl:395 within `+'
vaddpd %zmm22, %zmm19, %zmm19
; βββ
; ββ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:8
; βββ @ array.jl:766 within `setindex!'
vmovupd %zmm5, 192(%rdx,%rdi,8)
; βββ
; ββ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:9
; βββ @ float.jl:395 within `+'
vaddpd %zmm5, %zmm20, %zmm20
; βββ
; ββ @ int.jl:53 within `macro expansion'
addq $32, %rdi
cmpq %rdi, %rsi
jne L448
; ββ
; ββ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:9
; βββ @ float.jl:395 within `+'
vaddpd %zmm0, %zmm18, %zmm0
vaddpd %zmm0, %zmm19, %zmm0
vaddpd %zmm0, %zmm20, %zmm0
vextractf64x4 $1, %zmm0, %ymm1
vaddpd %zmm1, %zmm0, %zmm0
vextractf128 $1, %ymm0, %xmm1
vaddpd %zmm1, %zmm0, %zmm0
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddpd %zmm1, %zmm0, %zmm19
cmpq %rsi, %r8
vmovapd 32(%rsp), %xmm0
; βββ
; β @ simdloop.jl:75 within `logsumexp_sleefpirates!'
je L1757
L1345:
movabsq $4607182418800017408, %rdi # imm = 0x3FF0000000000000
movabsq $139756740332128, %rbx # imm = 0x7F1BA6DCCA60
vmovsd (%rbx), %xmm8 # xmm8 = mem[0],zero
movabsq $139756740332248, %rbx # imm = 0x7F1BA6DCCAD8
vmovsd (%rbx), %xmm9 # xmm9 = mem[0],zero
movabsq $139756740332256, %rbx # imm = 0x7F1BA6DCCAE0
vmovsd (%rbx), %xmm10 # xmm10 = mem[0],zero
movabsq $139756740332152, %rbx # imm = 0x7F1BA6DCCA78
vmovsd (%rbx), %xmm11 # xmm11 = mem[0],zero
movabsq $139756740332160, %rbx # imm = 0x7F1BA6DCCA80
vmovsd (%rbx), %xmm12 # xmm12 = mem[0],zero
movabsq $139756740332168, %rbx # imm = 0x7F1BA6DCCA88
vmovsd (%rbx), %xmm13 # xmm13 = mem[0],zero
movabsq $139756740332176, %rbx # imm = 0x7F1BA6DCCA90
vmovsd (%rbx), %xmm14 # xmm14 = mem[0],zero
movabsq $139756740332184, %rbx # imm = 0x7F1BA6DCCA98
vmovsd (%rbx), %xmm15 # xmm15 = mem[0],zero
movabsq $139756740332192, %rbx # imm = 0x7F1BA6DCCAA0
vmovsd (%rbx), %xmm16 # xmm16 = mem[0],zero
movabsq $139756740332200, %rbx # imm = 0x7F1BA6DCCAA8
vmovsd (%rbx), %xmm17 # xmm17 = mem[0],zero
movabsq $139756740332208, %rbx # imm = 0x7F1BA6DCCAB0
vmovsd (%rbx), %xmm18 # xmm18 = mem[0],zero
movabsq $139756740332216, %rbx # imm = 0x7F1BA6DCCAB8
vmovsd (%rbx), %xmm3 # xmm3 = mem[0],zero
movabsq $139756740332224, %rbx # imm = 0x7F1BA6DCCAC0
vmovsd (%rbx), %xmm4 # xmm4 = mem[0],zero
movabsq $139756740332232, %rbx # imm = 0x7F1BA6DCCAC8
vmovsd (%rbx), %xmm5 # xmm5 = mem[0],zero
movabsq $139756740332240, %rbx # imm = 0x7F1BA6DCCAD0
vmovsd (%rbx), %xmm6 # xmm6 = mem[0],zero
nopw %cs:(%rax,%rax)
; β
; β @ REPL[43]:5 within `logsumexp_sleefpirates!'
; ββ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:6
; βββ @ array.jl:728 within `getindex'
L1584:
vmovsd (%rcx,%rsi,8), %xmm7 # xmm7 = mem[0],zero
; βββ
; ββ @ simdloop.jl:77 within `macro expansion' @ float.jl:397
vsubsd %xmm0, %xmm7, %xmm7
; ββ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:7
; βββ @ exp.jl:181 within `exp'
; ββββ @ float.jl:399 within `*'
vmulsd %xmm8, %xmm7, %xmm1
; ββββ
; ββββ @ floatfuncs.jl:129 within `round' @ float.jl:370
vrndscalesd $4, %xmm1, %xmm1, %xmm1
; ββββ
; βββ @ float.jl:304 within `exp'
vcvttsd2si %xmm1, %rax
; βββ
; βββ @ exp.jl:185 within `exp'
; ββββ @ float.jl:404 within `muladd'
vfmadd231sd %xmm9, %xmm1, %xmm7
; ββββ
; βββ @ exp.jl:186 within `exp'
; ββββ @ float.jl:404 within `muladd'
vfmadd231sd %xmm10, %xmm1, %xmm7
; ββββ
; βββ @ exp.jl:188 within `exp'
; ββββ @ exp.jl:161 within `exp_kernel'
; βββββ @ math.jl:101 within `macro expansion'
; ββββββ @ float.jl:404 within `muladd'
vmovapd %xmm11, %xmm2
vfmadd213sd %xmm12, %xmm7, %xmm2
vfmadd213sd %xmm13, %xmm7, %xmm2
vfmadd213sd %xmm14, %xmm7, %xmm2
vfmadd213sd %xmm15, %xmm7, %xmm2
vfmadd213sd %xmm16, %xmm7, %xmm2
vfmadd213sd %xmm17, %xmm7, %xmm2
vfmadd213sd %xmm18, %xmm7, %xmm2
vfmadd213sd %xmm3, %xmm7, %xmm2
vfmadd213sd %xmm4, %xmm7, %xmm2
vfmadd213sd %xmm5, %xmm7, %xmm2
; ββββββ
; βββ @ float.jl:399 within `exp'
vmulsd %xmm7, %xmm7, %xmm1
vmulsd %xmm2, %xmm1, %xmm1
; βββ
; βββ @ exp.jl:189 within `exp'
; ββββ @ operators.jl:529 within `+' @ float.jl:395
vaddsd %xmm1, %xmm7, %xmm1
; ββββ
; ββββ @ float.jl:395 within `+'
vaddsd %xmm6, %xmm1, %xmm2
; ββββ
; βββ @ int.jl:437 within `exp'
movq %rax, %rbx
sarq %rbx
; βββ
; βββ @ exp.jl:190 within `exp'
; ββββ @ int.jl:52 within `ldexp2k'
subl %ebx, %eax
; ββββ
; ββββ @ priv.jl:52 within `ldexp2k'
; βββββ @ utils.jl:51 within `pow2i'
; ββββββ @ utils.jl:20 within `integer2float'
; βββββββ @ int.jl:446 within `<<' @ int.jl:439
shlq $52, %rbx
addq %rdi, %rbx
; βββββββ
; ββββββ @ essentials.jl:417 within `integer2float'
vmovq %rbx, %xmm1
; ββββββ
; βββββ @ floating_point_arithmetic.jl:62 within `evmul'
; ββββββ @ float.jl:399 within `*'
vmulsd %xmm1, %xmm2, %xmm2
; ββββββ
; βββββ @ utils.jl:51 within `pow2i'
; ββββββ @ utils.jl:20 within `integer2float'
; βββββββ @ int.jl:446 within `<<' @ int.jl:439
shlq $52, %rax
addq %rdi, %rax
; βββββββ
; βββββββ @ essentials.jl:417 within `reinterpret'
vmovq %rax, %xmm1
; βββββββ
; βββββ @ floating_point_arithmetic.jl:62 within `evmul'
; ββββββ @ float.jl:399 within `*'
vmulsd %xmm1, %xmm2, %xmm1
; ββββββ
; ββ @ simdloop.jl:77 within `macro expansion' @ REPL[43]:8
; βββ @ array.jl:766 within `setindex!'
vmovsd %xmm1, (%rdx,%rsi,8)
; βββ
; ββ @ simdloop.jl:77 within `macro expansion' @ float.jl:395
vaddsd %xmm1, %xmm19, %xmm19
; ββ @ simdloop.jl:78 within `macro expansion'
; βββ @ int.jl:53 within `+'
addq $1, %rsi
; βββ
; ββ @ simdloop.jl:75 within `macro expansion'
; βββ @ int.jl:49 within `<'
cmpq %r9, %rsi
; βββ
jb L1584
; ββ
; β @ REPL[43]:11 within `logsumexp_sleefpirates!'
L1757:
movabsq $"<;", %rax
vmovupd %zmm19, 64(%rsp)
vmovapd %xmm19, %xmm0
vzeroupper
callq *%rax
; ββ @ broadcast.jl:801 within `materialize!'
; βββ @ abstractarray.jl:75 within `axes'
; ββββ @ array.jl:155 within `size'
movq 24(%r15), %rdx
; ββββ
; βββ @ promotion.jl:414 within `axes'
testq %rdx, %rdx
; βββ
; βββ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; ββββ @ simdloop.jl:72 within `macro expansion'
jle L2039
; ββββ
; β @ simdloop.jl within `logsumexp_sleefpirates!'
movq %rdx, %rax
sarq $63, %rax
andnq %rdx, %rax, %rax
; β
; β @ REPL[43]:11 within `logsumexp_sleefpirates!'
; ββ @ float.jl:395 within `+'
vaddsd 32(%rsp), %xmm0, %xmm0
movq (%r15), %rcx
; ββ
; ββ @ broadcast.jl:801 within `materialize!'
; βββ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:886
; ββββ @ broadcast.jl:869 within `preprocess'
; βββββ @ broadcast.jl:872 within `preprocess_args'
; ββββββ @ broadcast.jl:870 within `preprocess'
; βββββββ @ broadcast.jl:592 within `extrude'
; ββββββββ @ broadcast.jl:547 within `newindexer'
; βββββββββ @ broadcast.jl:548 within `shapeindexer'
; ββββββββββ @ broadcast.jl:553 within `_newindexer'
; βββββββββββ @ operators.jl:193 within `!='
; ββββββββββββ @ promotion.jl:403 within `=='
cmpq $1, %rdx
; ββββββββββββ
; βββ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; ββββ @ simdloop.jl:75 within `macro expansion'
jne L1867
xorl %edx, %edx
nopw %cs:(%rax,%rax)
; ββββ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; βββββ @ broadcast.jl:558 within `getindex'
; ββββββ @ broadcast.jl:597 within `_broadcast_getindex'
; βββββββ @ broadcast.jl:621 within `_getindex'
; ββββββββ @ broadcast.jl:591 within `_broadcast_getindex'
; βββββββββ @ array.jl:728 within `getindex'
L1840:
vmovsd (%rcx), %xmm1 # xmm1 = mem[0],zero
; βββββββββ
; ββββββ @ broadcast.jl:598 within `_broadcast_getindex'
; βββββββ @ broadcast.jl:625 within `_broadcast_getindex_evalf'
; ββββββββ @ float.jl:397 within `-'
vsubsd %xmm0, %xmm1, %xmm1
; ββββββββ
; βββββ @ array.jl:766 within `setindex!'
vmovsd %xmm1, (%rcx,%rdx,8)
; βββββ
; ββββ @ int.jl:53 within `macro expansion'
addq $1, %rdx
; ββββ
; ββββ @ simdloop.jl:75 within `macro expansion'
; βββββ @ int.jl:49 within `<'
cmpq %rax, %rdx
; βββββ
jb L1840
jmp L2039
L1867:
cmpq $32, %rax
jae L1880
xorl %edx, %edx
jmp L2016
; ββββ @ simdloop.jl:75 within `macro expansion'
L1880:
movabsq $9223372036854775776, %rdx # imm = 0x7FFFFFFFFFFFFFE0
andq %rax, %rdx
vbroadcastsd %xmm0, %zmm1
leaq 192(%rcx), %rsi
; ββββ @ simdloop.jl:78 within `macro expansion'
; βββββ @ int.jl:53 within `+'
movq %rdx, %rdi
nopw %cs:(%rax,%rax)
; βββββ
; ββββ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; βββββ @ broadcast.jl:558 within `getindex'
; ββββββ @ broadcast.jl:597 within `_broadcast_getindex'
; βββββββ @ broadcast.jl:621 within `_getindex'
; ββββββββ @ broadcast.jl:591 within `_broadcast_getindex'
; βββββββββ @ array.jl:728 within `getindex'
L1920:
vmovupd -192(%rsi), %zmm2
vmovupd -128(%rsi), %zmm3
vmovupd -64(%rsi), %zmm4
vmovupd (%rsi), %zmm5
; βββββββββ
; ββββββ @ broadcast.jl:598 within `_broadcast_getindex'
; βββββββ @ broadcast.jl:625 within `_broadcast_getindex_evalf'
; ββββββββ @ float.jl:397 within `-'
vsubpd %zmm1, %zmm2, %zmm2
vsubpd %zmm1, %zmm3, %zmm3
vsubpd %zmm1, %zmm4, %zmm4
vsubpd %zmm1, %zmm5, %zmm5
; ββββββββ
; ββββ @ simdloop.jl:77 within `macro expansion' @ array.jl:766
vmovupd %zmm2, -192(%rsi)
vmovupd %zmm3, -128(%rsi)
vmovupd %zmm4, -64(%rsi)
vmovupd %zmm5, (%rsi)
; ββββ @ simdloop.jl:78 within `macro expansion'
; βββββ @ int.jl:53 within `+'
addq $256, %rsi # imm = 0x100
addq $-32, %rdi
jne L1920
; βββββ
; β @ int.jl within `logsumexp_sleefpirates!'
cmpq %rdx, %rax
; β
; β @ simdloop.jl:75 within `logsumexp_sleefpirates!'
je L2039
; β
; β @ REPL[43]:11 within `logsumexp_sleefpirates!'
; ββ @ broadcast.jl:801 within `materialize!'
; βββ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; ββββ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; βββββ @ broadcast.jl:558 within `getindex'
; ββββββ @ broadcast.jl:597 within `_broadcast_getindex'
; βββββββ @ broadcast.jl:621 within `_getindex'
; ββββββββ @ broadcast.jl:591 within `_broadcast_getindex'
; βββββββββ @ array.jl:728 within `getindex'
L2016:
vmovsd (%rcx,%rdx,8), %xmm1 # xmm1 = mem[0],zero
; βββββββββ
; ββββββ @ float.jl:397 within `_broadcast_getindex'
vsubsd %xmm0, %xmm1, %xmm1
; ββββββ
; βββββ @ array.jl:766 within `setindex!'
vmovsd %xmm1, (%rcx,%rdx,8)
; βββββ
; ββββ @ simdloop.jl:78 within `macro expansion'
; βββββ @ int.jl:53 within `+'
addq $1, %rdx
; βββββ
; ββββ @ simdloop.jl:75 within `macro expansion'
; βββββ @ int.jl:49 within `<'
cmpq %rax, %rdx
jb L2016
; βββββ
; β @ REPL[43]:12 within `logsumexp_sleefpirates!'
; ββ @ broadcast.jl:801 within `materialize!'
; βββ @ abstractarray.jl:75 within `axes'
; ββββ @ array.jl:155 within `size'
L2039:
movq 24(%r14), %rdx
; ββββ
; βββ @ promotion.jl:414 within `axes'
testq %rdx, %rdx
; βββ
; βββ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; ββββ @ simdloop.jl:72 within `macro expansion'
jle L2259
; ββββ
; β @ simdloop.jl within `logsumexp_sleefpirates!'
movq %rdx, %rax
sarq $63, %rax
andnq %rdx, %rax, %rax
movabsq $139756740332240, %rcx # imm = 0x7F1BA6DCCAD0
; β
; β @ REPL[43]:12 within `logsumexp_sleefpirates!'
; ββ @ promotion.jl:316 within `/' @ float.jl:401
vmovsd (%rcx), %xmm0 # xmm0 = mem[0],zero
vdivsd 64(%rsp), %xmm0, %xmm0
movq (%r14), %rcx
; ββ
; ββ @ broadcast.jl:801 within `materialize!'
; βββ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:886
; ββββ @ broadcast.jl:869 within `preprocess'
; βββββ @ broadcast.jl:872 within `preprocess_args'
; ββββββ @ broadcast.jl:870 within `preprocess'
; βββββββ @ broadcast.jl:592 within `extrude'
; ββββββββ @ broadcast.jl:547 within `newindexer'
; βββββββββ @ broadcast.jl:548 within `shapeindexer'
; ββββββββββ @ broadcast.jl:553 within `_newindexer'
; βββββββββββ @ operators.jl:193 within `!='
; ββββββββββββ @ promotion.jl:403 within `=='
cmpq $1, %rdx
; ββββββββββββ
; βββ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; ββββ @ simdloop.jl:75 within `macro expansion'
jne L2119
xorl %edx, %edx
nop
; ββββ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; βββββ @ broadcast.jl:558 within `getindex'
; ββββββ @ broadcast.jl:598 within `_broadcast_getindex'
; βββββββ @ broadcast.jl:625 within `_broadcast_getindex_evalf'
; ββββββββ @ float.jl:399 within `*'
L2096:
vmulsd (%rcx), %xmm0, %xmm1
; ββββββββ
; βββββ @ array.jl:766 within `setindex!'
vmovsd %xmm1, (%rcx,%rdx,8)
; βββββ
; ββββ @ int.jl:53 within `macro expansion'
addq $1, %rdx
; ββββ
; ββββ @ simdloop.jl:75 within `macro expansion'
; βββββ @ int.jl:49 within `<'
cmpq %rax, %rdx
; βββββ
jb L2096
jmp L2259
L2119:
cmpq $32, %rax
jae L2129
xorl %edx, %edx
jmp L2240
; ββββ @ simdloop.jl:75 within `macro expansion'
L2129:
movabsq $9223372036854775776, %rdx # imm = 0x7FFFFFFFFFFFFFE0
andq %rax, %rdx
vbroadcastsd %xmm0, %zmm1
leaq 192(%rcx), %rsi
; ββββ @ simdloop.jl:78 within `macro expansion'
; βββββ @ int.jl:53 within `+'
movq %rdx, %rdi
nop
; βββββ
; ββββ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; βββββ @ broadcast.jl:558 within `getindex'
; ββββββ @ broadcast.jl:598 within `_broadcast_getindex'
; βββββββ @ broadcast.jl:625 within `_broadcast_getindex_evalf'
; ββββββββ @ float.jl:399 within `*'
L2160:
vmulpd -192(%rsi), %zmm1, %zmm2
vmulpd -128(%rsi), %zmm1, %zmm3
vmulpd -64(%rsi), %zmm1, %zmm4
vmulpd (%rsi), %zmm1, %zmm5
; ββββββββ
; ββββ @ simdloop.jl:77 within `macro expansion' @ array.jl:766
vmovupd %zmm2, -192(%rsi)
vmovupd %zmm3, -128(%rsi)
vmovupd %zmm4, -64(%rsi)
vmovupd %zmm5, (%rsi)
; ββββ @ simdloop.jl:78 within `macro expansion'
; βββββ @ int.jl:53 within `+'
addq $256, %rsi # imm = 0x100
addq $-32, %rdi
jne L2160
; βββββ
; β @ int.jl within `logsumexp_sleefpirates!'
cmpq %rdx, %rax
; β
; β @ simdloop.jl:75 within `logsumexp_sleefpirates!'
je L2259
nopl (%rax,%rax)
; β
; β @ REPL[43]:12 within `logsumexp_sleefpirates!'
; ββ @ broadcast.jl:801 within `materialize!'
; βββ @ broadcast.jl:842 within `copyto!' @ broadcast.jl:887
; ββββ @ simdloop.jl:77 within `macro expansion' @ broadcast.jl:888
; βββββ @ broadcast.jl:558 within `getindex'
; ββββββ @ broadcast.jl:598 within `_broadcast_getindex'
; βββββββ @ broadcast.jl:625 within `_broadcast_getindex_evalf'
; ββββββββ @ float.jl:399 within `*'
L2240:
vmulsd (%rcx,%rdx,8), %xmm0, %xmm1
; ββββββββ
; ββββ @ simdloop.jl:77 within `macro expansion' @ array.jl:766
vmovsd %xmm1, (%rcx,%rdx,8)
; ββββ @ simdloop.jl:78 within `macro expansion'
; βββββ @ int.jl:53 within `+'
addq $1, %rdx
; βββββ
; ββββ @ simdloop.jl:75 within `macro expansion'
; βββββ @ int.jl:49 within `<'
cmpq %rax, %rdx
jb L2240
; βββββ
L2259:
movq %r14, %rax
addq $224, %rsp
popq %rbx
popq %r14
popq %r15
vzeroupper
retq
; β @ REPL[43]:5 within `logsumexp_sleefpirates!'
; ββ @ simdloop.jl:71 within `macro expansion'
; βββ @ simdloop.jl:51 within `simd_inner_length'
; ββββ @ range.jl:541 within `length'
; βββββ @ checked.jl:166 within `checked_add'
L2278:
movabsq $throw_overflowerr_binaryop, %rax
movabsq $139757138773328, %rdi # imm = 0x7F1BBE9C8550
movl $1, %edx
callq *%rax
ud2
nopw %cs:(%rax,%rax)
; βββββ
SLEEFPirates is vectorized.