Thanks to @kristoffer.carlsson, I fixed the immediate problem and can now make my PR better (more general). The possible “better compiler” point below still stands, though: Julia could have done this for the code as is.
In my PR for a better factorial, the Int64 case is good, but the generated code is worse for UInt64:
julia> @inline function factorial_lookup(n::Integer, table, lim) # table lookup keyed on n (presumably n!, judging by the names); `table` and `lim` are only forwarded to the helper in the disabled branch
if false # originally !(0 <= n <= 20); range check disabled for this minimal example, so this branch is never taken
return factorial_lookup_helper(n, table, lim) # unreachable while the condition above is the literal `false`
else
@inbounds f = _shifted_fact_table64[oftype(Int64, n)+1] # was _shifted_fact_table64[n+1]; reads the global table, not the `table` argument, and with the range check disabled nothing guards this @inbounds index. NOTE(review): oftype(Int64, n) is convert(typeof(Int64), n), i.e. convert(DataType, n), not a conversion to Int64 -- Int64(n) or n % Int64 was presumably intended; confirm
return oftype(n, f) # convert the looked-up value to the type of n
end
end
factorial_lookup (generic function with 1 method)
@inline factorial_new(n::Union{Int64,UInt64}) = factorial_lookup(n, _fact_table64, 20) # 64-bit entry point: forwards n to factorial_lookup with _fact_table64 and limit 20
[The range check `0 <= n <= 20` (now disabled) doesn’t help, but with a better compiler it should, since within that range there is no possibility of overflow.]
The output of @code_native factorial_new(20) is at the bottom, but for UInt I get more complex code:
[This is for the version of the line with oftype. Is there a better way to do this? Isn’t reinterpret the right tool here instead of oftype? There’s the @checked macro, but I want basically the opposite, a @nocheck, which I’ve not found.]
julia> @code_native factorial_new(UInt(20))
.text
; ┌ @ REPL[220]:1 within `factorial_new'
subq $56, %rsp
xorps %xmm0, %xmm0
movaps %xmm0, (%rsp)
movq $0, 16(%rsp)
movq %fs:0, %rax
; │┌ @ REPL[248]:5 within `factorial_lookup'
movq $2, (%rsp)
movq -15552(%rax), %rcx
movq %rcx, 8(%rsp)
movq %rsp, %rcx
movq %rcx, -15552(%rax)
movabsq $jl_box_uint64, %rax
callq *%rax
movq %rax, 16(%rsp)
movabsq $jl_system_image_data, %rcx
movq %rcx, 32(%rsp)
movabsq $139784872953840, %rcx # imm = 0x7F2233B287F0
movq %rcx, 40(%rsp)
movq %rax, 48(%rsp)
movabsq $jl_invoke, %rax
movabsq $139784880894096, %rdi # imm = 0x7F22342BB090
leaq 32(%rsp), %rsi
movl $3, %edx
callq *%rax
ud2
nopw %cs:(%rax,%rax)
; └└
Without oftype it is worse:
julia> @code_native factorial_new(UInt(20))
.text
; ┌ @ REPL[220]:1 within `factorial_new'
; │┌ @ REPL[244]:5 within `factorial_lookup'
; ││┌ @ multidimensional.jl:458 within `getindex' @ REPL[220]:1
pushq %rax
movabsq $139784873942800, %rax # imm = 0x7F2233C19F10
movq (%rax), %rax
movq (%rax,%rdi,8), %rax
; │└└
; │┌ @ boot.jl:587 within `factorial_lookup'
testq %rax, %rax
; │└
; │┌ @ REPL[244]:6 within `factorial_lookup'
; ││┌ @ essentials.jl:334 within `oftype'
; │││┌ @ number.jl:7 within `convert'
; ││││┌ @ boot.jl:738 within `Type'
; │││││┌ @ boot.jl:708 within `toUInt64'
; ││││││┌ @ boot.jl:597 within `check_top_bit'
js L25
; │└└└└└└
popq %rcx
retq
; │┌ @ REPL[244]:6 within `factorial_lookup'
; ││┌ @ essentials.jl:334 within `oftype'
; │││┌ @ number.jl:7 within `convert'
; ││││┌ @ boot.jl:738 within `Type'
; │││││┌ @ boot.jl:708 within `toUInt64'
; ││││││┌ @ boot.jl:597 within `check_top_bit'
L25:
movabsq $throw_inexacterror, %rcx
movabsq $139784870391424, %rdi # imm = 0x7F22338B6E80
movabsq $139784872953840, %rsi # imm = 0x7F2233B287F0
movq %rax, %rdx
callq *%rcx
ud2
nop
; └└└└└└└
For Int64 it is OK (without oftype):
julia> @code_native factorial_new(20)
.text
; ┌ @ REPL[221]:1 within `factorial_new'
; │┌ @ REPL[244]:5 within `factorial_lookup'
; ││┌ @ REPL[221]:1 within `getindex'
movabsq $139784873942800, %rax # imm = 0x7F2233C19F10
movq (%rax), %rax
movq (%rax,%rdi,8), %rax
; │└└
retq
nopw %cs:(%rax,%rax)
; └