Note that, IIUC, LLVM already vectorizes `unsafe_roundtest!` above quite efficiently: looking for VCVTTPS2DQ instructions on ymm registers (this is on an AVX2 machine), it looks like the loop was not only vectorized but also unrolled by a factor of 4, so the main loop converts 32 elements per iteration; see the `@code_native` output below.
```julia
x = rand(Float32, 10)
out = similar(x, Int16)
@code_native unsafe_roundtest!(out, x)
```

Output:

```
.text
; ┌ @ essai.jl:8 within `unsafe_roundtest!'
movq %rsi, -8(%rsp)
movq 8(%rsi), %rcx
; │┌ @ simdloop.jl:69 within `macro expansion'
; ││┌ @ abstractarray.jl:212 within `eachindex'
; │││┌ @ abstractarray.jl:95 within `axes1'
; ││││┌ @ abstractarray.jl:75 within `axes'
; │││││┌ @ array.jl:155 within `size'
movq 24(%rcx), %rax
; │││││└
; │││││┌ @ tuple.jl:157 within `map'
; ││││││┌ @ range.jl:320 within `OneTo' @ range.jl:311
; │││││││┌ @ promotion.jl:409 within `max'
testq %rax, %rax
; ││└└└└└└
; ││ @ simdloop.jl:72 within `macro expansion'
jle L226
movq %rax, %rdx
sarq $63, %rdx
andnq %rax, %rdx, %rax
movq (%rsi), %rdx
movq (%rcx), %rcx
movq (%rdx), %rdx
; ││ @ simdloop.jl:75 within `macro expansion'
cmpq $32, %rax
jae L56
xorl %esi, %esi
jmp L208
; ││ @ simdloop.jl:75 within `macro expansion'
L56:
leaq (%rcx,%rax,4), %rsi
cmpq %rsi, %rdx
jae L81
leaq (%rdx,%rax,2), %rsi
; ││ @ simdloop.jl:75 within `macro expansion'
cmpq %rsi, %rcx
jae L81
xorl %esi, %esi
jmp L208
L81:
movabsq $9223372036854775776, %rsi # imm = 0x7FFFFFFFFFFFFFE0
; ││ @ simdloop.jl:75 within `macro expansion'
andq %rax, %rsi
xorl %edi, %edi
; ││ @ simdloop.jl:77 within `macro expansion' @ essai.jl:9
; ││┌ @ float.jl:309 within `unsafe_trunc'
L96:
vcvttps2dq (%rcx,%rdi,4), %ymm0
vextracti128 $1, %ymm0, %xmm1
vpackssdw %xmm1, %xmm0, %xmm0
vcvttps2dq 32(%rcx,%rdi,4), %ymm1
vextracti128 $1, %ymm1, %xmm2
vpackssdw %xmm2, %xmm1, %xmm1
vcvttps2dq 64(%rcx,%rdi,4), %ymm2
vextracti128 $1, %ymm2, %xmm3
vpackssdw %xmm3, %xmm2, %xmm2
vcvttps2dq 96(%rcx,%rdi,4), %ymm3
vextracti128 $1, %ymm3, %xmm4
vpackssdw %xmm4, %xmm3, %xmm3
; ││└
; ││┌ @ array.jl:826 within `setindex!'
vmovdqu %xmm0, (%rdx,%rdi,2)
vmovdqu %xmm1, 16(%rdx,%rdi,2)
vmovdqu %xmm2, 32(%rdx,%rdi,2)
vmovdqu %xmm3, 48(%rdx,%rdi,2)
; ││└
; ││ @ simdloop.jl:78 within `macro expansion'
; ││┌ @ int.jl:53 within `+'
addq $32, %rdi
cmpq %rdi, %rsi
jne L96
; │└└
; │┌ @ int.jl within `macro expansion'
cmpq %rsi, %rax
; │└
; │┌ @ simdloop.jl:75 within `macro expansion'
je L226
nopw %cs:(%rax,%rax)
nop
; ││ @ simdloop.jl:77 within `macro expansion' @ essai.jl:9
; ││┌ @ float.jl:309 within `unsafe_trunc'
L208:
vcvttss2si (%rcx,%rsi,4), %edi
; ││└
; ││┌ @ array.jl:826 within `setindex!'
movw %di, (%rdx,%rsi,2)
; ││└
; ││ @ simdloop.jl:78 within `macro expansion'
; ││┌ @ int.jl:53 within `+'
addq $1, %rsi
; ││└
; ││ @ simdloop.jl:75 within `macro expansion'
; ││┌ @ int.jl:49 within `<'
cmpq %rax, %rsi
; ││└
jb L208
; └└
; ┌ @ simdloop.jl within `unsafe_roundtest!'
L226:
movabsq $jl_system_image_data, %rax
; └
; ┌ @ essai.jl:8 within `unsafe_roundtest!'
vzeroupper
retq
; └
```
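As a side note, the same conclusion can be double-checked at the LLVM IR level, which is often easier to scan than the native code. This is just the standard reflection macro applied to the same call as above; look for `<8 x float>`-style vector types in the loop body:

```julia
# LLVM IR for the same call, without debug-info clutter; a vectorized loop
# body contains <8 x float> / <8 x i32>-style vector operations.
@code_llvm debuginfo=:none unsafe_roundtest!(out, x)
```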
(I guess all I'm saying here is that I wouldn't know how to get faster than that. But here on Discourse, you never know: someone might very well prove me wrong in the next post.)
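If someone does want to take a shot at it, a baseline number is easy to get; the array size below is arbitrary, and this assumes BenchmarkTools is installed and `unsafe_roundtest!` is defined as sketched above:

```julia
# Baseline timing for the loop above; pick whatever size you care about.
using BenchmarkTools

x = rand(Float32, 1_000_000)
out = similar(x, Int16)
@btime unsafe_roundtest!($out, $x)
```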