Well spotted! It appears that people were under the misconception that LLVM would figure that out (I am surprised by that failure as well).
So we need an extra method for all signed BitInteger types (Int8, Int16, Int32, Int64, Int128). Note:
julia> @code_llvm sign(1)
; Function Signature: sign(Int64)
; @ number.jl:162 within `sign`
define i64 @julia_sign_5068(i64 signext %"x::Int64") #0 {
top:
; ┌ @ essentials.jl:797 within `ifelse`
%0 = call i64 @llvm.smin.i64(i64 %"x::Int64", i64 1)
%1 = call i64 @llvm.smax.i64(i64 %0, i64 -1)
ret i64 %1
; └
}
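Reading the IR: LLVM has turned the branches in `sign` into a clamp, i.e. the function body is equivalent to the following paraphrase (`clamp_sign` is my name for it, not anything from Base):

julia> clamp_sign(x::Int64) = clamp(x, -1, 1)  # smin against 1, then smax against -1
clamp_sign (generic function with 1 method)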
This does not make it obvious that the resulting machine code sucks on Intel.
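The definition of `_sign` from REPL[3] is not quoted here, but judging from the IR below it is presumably the classic comparison trick. A sketch that would cover all five signed types at once might look like this (`Base.BitSigned` is exactly that union; the `% T` conversions, which keep the result type stable, are my addition):

julia> _sign(x::T) where {T<:Base.BitSigned} = ((x > 0) % T) - ((x < 0) % T)
_sign (generic function with 1 method)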
The other code is
julia> @code_llvm _sign(1)
; Function Signature: _sign(Int64)
; @ REPL[3]:1 within `_sign`
define i64 @julia__sign_5070(i64 signext %"x::Int64") #0 {
top:
; ┌ @ bool.jl:167 within `-`
; │┌ @ boot.jl:954 within `Int64`
; ││┌ @ boot.jl:881 within `toInt64`
%"x::Int64.lobit.neg" = ashr i64 %"x::Int64", 63
; │└└
; │ @ bool.jl:167 within `-` @ int.jl:86
%isnotnull = icmp ne i64 %"x::Int64", 0
%isnotnull.zext = zext i1 %isnotnull to i64
%0 = or i64 %"x::Int64.lobit.neg", %isnotnull.zext
ret i64 %0
; └
}
In other words, the branch-free `(x >> 63) | (x != 0)`: the arithmetic shift smears the sign bit into -1 for negative inputs, and or-ing in the `x != 0` bit yields 1 for positive ones.
julia> @code_native _sign(1)
.text
.file "_sign"
.section .ltext,"axl",@progbits
.globl julia__sign_5072 # -- Begin function julia__sign_5072
.p2align 4, 0x90
.type julia__sign_5072,@function
julia__sign_5072: # @julia__sign_5072
; Function Signature: _sign(Int64)
; ┌ @ REPL[3]:1 within `_sign`
# %bb.0: # %top
#DEBUG_VALUE: _sign:x <- $rdi
push rbp
mov rbp, rsp
; │┌ @ bool.jl:167 within `-`
; ││┌ @ boot.jl:954 within `Int64`
; │││┌ @ boot.jl:881 within `toInt64`
mov rcx, rdi
sar rcx, 63
; ││└└
; ││ @ bool.jl:167 within `-` @ int.jl:86
xor eax, eax
test rdi, rdi
setne al
or rax, rcx
pop rbp
ret
julia> @code_native sign(1)
.text
.file "sign"
.section .ltext,"axl",@progbits
.globl julia_sign_5074 # -- Begin function julia_sign_5074
.p2align 4, 0x90
.type julia_sign_5074,@function
julia_sign_5074: # @julia_sign_5074
; Function Signature: sign(Int64)
; ┌ @ number.jl:162 within `sign`
# %bb.0: # %top
#DEBUG_VALUE: sign:x <- $rdi
push rbp
mov rbp, rsp
; │┌ @ essentials.jl:797 within `ifelse`
test rdi, rdi
mov ecx, 1
cmovle rcx, rdi
test rcx, rcx
mov rax, -1
cmovns rax, rcx
pop rbp
ret
.Lfunc_end0:
.size julia_sign_5074, .Lfunc_end0-julia_sign_5074
; └└
# -- End function
.section ".note.GNU-stack","",@progbits
PS. I guess the critical part here is how this looks inside a tight SIMD loop (which is what you are benchmarking with map!). The hot loop of your fast version is
.LBB0_18: # %vector.body
# =>This Inner Loop Header: Depth=1
; │││┌ @ simdloop.jl:77 within `macro expansion` @ broadcast.jl:995
; ││││┌ @ broadcast.jl:616 within `getindex`
; │││││┌ @ broadcast.jl:620 within `_getindex`
; ││││││┌ @ broadcast.jl:671 within `_broadcast_getindex`
; │││││││┌ @ broadcast.jl:696 within `_getindex`
; ││││││││┌ @ broadcast.jl:665 within `_broadcast_getindex`
; │││││││││┌ @ essentials.jl:918 within `getindex`
vmovdqu ymm2, ymmword ptr [rax + 8*rdx]
vmovdqu ymm3, ymmword ptr [rax + 8*rdx + 32]
vmovdqu ymm4, ymmword ptr [rax + 8*rdx + 64]
vmovdqu ymm5, ymmword ptr [rax + 8*rdx + 96]
; ││││││└└└└
; ││││││┌ @ broadcast.jl:672 within `_broadcast_getindex`
; │││││││┌ @ broadcast.jl:699 within `_broadcast_getindex_evalf`
; ││││││││┌ @ REPL[3]:1 within `_sign`
; │││││││││┌ @ bool.jl:167 within `-`
; ││││││││││┌ @ boot.jl:954 within `Int64`
; │││││││││││┌ @ boot.jl:881 within `toInt64`
vpcmpgtq ymm6, ymm0, ymm2
vpcmpgtq ymm7, ymm0, ymm3
vpcmpgtq ymm8, ymm0, ymm4
vpcmpgtq ymm9, ymm0, ymm5
; ││││││││││└└
; ││││││││││ @ bool.jl:167 within `-` @ int.jl:86
vpcmpeqq ymm2, ymm2, ymm0
vpandn ymm2, ymm2, ymm1
vpor ymm2, ymm6, ymm2
vpcmpeqq ymm3, ymm3, ymm0
vpandn ymm3, ymm3, ymm1
vpor ymm3, ymm7, ymm3
vpcmpeqq ymm4, ymm4, ymm0
vpandn ymm4, ymm4, ymm1
vpor ymm4, ymm8, ymm4
vpcmpeqq ymm5, ymm5, ymm0
vpandn ymm5, ymm5, ymm1
vpor ymm5, ymm9, ymm5
; ││││└└└└└└
; ││││┌ @ array.jl:986 within `setindex!`
; │││││┌ @ array.jl:991 within `_setindex!`
vmovdqu ymmword ptr [rcx + 8*rdx], ymm2
vmovdqu ymmword ptr [rcx + 8*rdx + 32], ymm3
vmovdqu ymmword ptr [rcx + 8*rdx + 64], ymm4
vmovdqu ymmword ptr [rcx + 8*rdx + 96], ymm5
; ││││└└
; ││││ @ simdloop.jl:78 within `macro expansion`
; ││││┌ @ int.jl:87 within `+`
add rdx, 16
cmp rsi, rdx
jne .LBB0_18
while the slower Base version has
.LBB0_18: # %vector.body
# =>This Inner Loop Header: Depth=1
; │││┌ @ simdloop.jl:77 within `macro expansion` @ broadcast.jl:995
; ││││┌ @ broadcast.jl:616 within `getindex`
; │││││┌ @ broadcast.jl:620 within `_getindex`
; ││││││┌ @ broadcast.jl:671 within `_broadcast_getindex`
; │││││││┌ @ broadcast.jl:696 within `_getindex`
; ││││││││┌ @ broadcast.jl:665 within `_broadcast_getindex`
; │││││││││┌ @ essentials.jl:918 within `getindex`
vmovdqu ymm2, ymmword ptr [rax + 8*rsi]
vmovdqu ymm3, ymmword ptr [rax + 8*rsi + 32]
vmovdqu ymm4, ymmword ptr [rax + 8*rsi + 64]
vmovdqu ymm5, ymmword ptr [rax + 8*rsi + 96]
; ││││││└└└└
; ││││││┌ @ broadcast.jl:672 within `_broadcast_getindex`
; │││││││┌ @ broadcast.jl:699 within `_broadcast_getindex_evalf`
; ││││││││┌ @ number.jl:162 within `sign`
; │││││││││┌ @ essentials.jl:797 within `ifelse`
vpcmpgtq ymm6, ymm0, ymm2
vblendvpd ymm2, ymm0, ymm2, ymm6
vpcmpgtq ymm6, ymm0, ymm3
vblendvpd ymm3, ymm0, ymm3, ymm6
vpcmpgtq ymm6, ymm0, ymm4
vblendvpd ymm4, ymm0, ymm4, ymm6
vpcmpgtq ymm6, ymm0, ymm5
vblendvpd ymm5, ymm0, ymm5, ymm6
vpcmpgtq ymm6, ymm1, ymm2
vpor ymm2, ymm6, ymm2
vpcmpgtq ymm6, ymm1, ymm3
vpor ymm3, ymm6, ymm3
vpcmpgtq ymm6, ymm1, ymm4
vpor ymm4, ymm6, ymm4
vpcmpgtq ymm6, ymm1, ymm5
vpor ymm5, ymm6, ymm5
; ││││└└└└└└
; ││││┌ @ array.jl:986 within `setindex!`
; │││││┌ @ array.jl:991 within `_setindex!`
vmovdqu ymmword ptr [rcx + 8*rsi], ymm2
vmovdqu ymmword ptr [rcx + 8*rsi + 32], ymm3
vmovdqu ymmword ptr [rcx + 8*rsi + 64], ymm4
vmovdqu ymmword ptr [rcx + 8*rsi + 96], ymm5
; ││││└└
; ││││ @ simdloop.jl:78 within `macro expansion`
; ││││┌ @ int.jl:87 within `+`
add rsi, 16
cmp rdx, rsi
jne .LBB0_18
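For anyone who wants to reproduce the comparison, a minimal harness along these lines should do (the vector length is my arbitrary choice; the original benchmark setup isn't shown in the thread):

julia> using BenchmarkTools

julia> x = rand(Int64, 2^20); out = similar(x);

julia> @btime map!(sign, $out, $x);

julia> @btime map!(_sign, $out, $x);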