They don’t detail contents that beat counting sort, I think they detail how to optimize radix sort to beat a basic application of radix sort. And they have some tricks to deal with partially sorted data better. I have read them as part of my research; again radix sort is nothing but applying counting sort multiple times; each time it iterates through all the elements and assign them to a new location; they can’t use counting sort in one go for 64 bit types as there are 2^64 possible unique values, so they iterate through all the elements using 8 bits (or 11 bits) at a time so each time there are only 2^8 (2^11) unique values to count.
In this particular case, there are only 1_000_000
unique values, and it’s known in advance, hence a straight counting sort will beat radix sort, if things like cache-performance and generated code don’t come into the picture. Theoretically, counting sort will always be faster, for the simple reason that it only needs one pass over the data verses, radix sort’s 64/8 or (64/11) passes, and that’s why Base uses counting sort in sort_int_range!
which is demonstrably faster than R’s radix sort as shown in the original post. It’s just that sortperm
that has poor performance but it’ using the same underlying “counting sort”-like algorithm.
Getting back into the real world, in practice, things like cache (which I don’t understand very well) and other CPU and other hardware related things matter, the low-level code being generated also matters. That’s why I am leaning towards checking out the generated code for any issues. This might even help towards closing the issue that was opened 5-6 years ago regarding poor sortperm
performance.
function sortperm_int_range2(a, rangelen, minval)
cnts = fill(0, rangelen)
# create a count first
@inbounds for ai in a
cnts[ai - minval + 1] += 1
end
# create cumulative sum
cumsum!(cnts, cnts)
la = length(a)
res = Vector{Int}(la)
@inbounds for i in la:-1:1
ai = a[i] - minval + 1
c = cnts[ai]
cnts[ai] -= 1
res[c] = i
end
res
end
@code_lowered @code_typed @code_llvm @code_native
Main> @code_lowered sortperm_int_range2(a, 1_000_000, 1)
CodeInfo(:(begin
nothing
NewvarNode(:(la))
NewvarNode(:(res))
cnts = (Main.fill)(0, rangelen) # line 4:
$(Expr(:inbounds, true))
SSAValue(0) = a
#temp#@_6 = (Base.start)(SSAValue(0))
9:
unless !((Base.done)(SSAValue(0), #temp#@_6)) goto 20
SSAValue(1) = (Base.next)(SSAValue(0), #temp#@_6)
ai@_5 = (Core.getfield)(SSAValue(1), 1)
#temp#@_6 = (Core.getfield)(SSAValue(1), 2) # line 5:
SSAValue(2) = (ai@_5 - minval) + 1
SSAValue(3) = (Main.getindex)(cnts, SSAValue(2)) + 1
(Main.setindex!)(cnts, SSAValue(3), SSAValue(2))
18:
goto 9
20:
$(Expr(:inbounds, :pop)) # line 8:
(Main.cumsum!)(cnts, cnts) # line 9:
la = (Main.length)(a) # line 10:
res = ((Core.apply_type)(Main.Vector, Main.Int))(la) # line 11:
$(Expr(:inbounds, true))
SSAValue(4) = (Main.colon)(la, -1, 1)
#temp#@_10 = (Base.start)(SSAValue(4))
32:
unless !((Base.done)(SSAValue(4), #temp#@_10)) goto 48
SSAValue(5) = (Base.next)(SSAValue(4), #temp#@_10)
i = (Core.getfield)(SSAValue(5), 1)
#temp#@_10 = (Core.getfield)(SSAValue(5), 2) # line 12:
ai@_9 = ((Main.getindex)(a, i) - minval) + 1 # line 13:
c = (Main.getindex)(cnts, ai@_9) # line 14:
SSAValue(6) = (Main.getindex)(cnts, ai@_9) - 1
(Main.setindex!)(cnts, SSAValue(6), ai@_9) # line 15:
(Main.setindex!)(res, i, c)
46:
goto 32
48:
$(Expr(:inbounds, :pop)) # line 17:
return res
end))
Main> @code_typed sortperm_int_range2(a, 1_000_000, 1)
CodeInfo(:(begin
NewvarNode(:(la))
NewvarNode(:(res))
cnts = $(Expr(:invoke, MethodInstance for fill!(::Array{Int64,1}, ::Int64), :(Base.fill!), :($(Expr(:foreigncall, :(:jl_alloc_array_1d), Array{Int64,1}, svec(Any, Int64), Array{Int64,1}, 0, :(rangelen), 0))), 0)) # line 4:
$(Expr(:inbounds, true))
#temp#@_6 = 1
7:
unless (Base.not_int)((#temp#@_6 === (Base.add_int)((Base.arraylen)(a)::Int64, 1)::Int64)::Bool)::Bool goto 19
SSAValue(10) = (Base.arrayref)(a, #temp#@_6)::Int64
SSAValue(11) = (Base.add_int)(#temp#@_6, 1)::Int64
ai@_5 = SSAValue(10)
#temp#@_6 = SSAValue(11) # line 5:
SSAValue(2) = (Base.add_int)((Base.sub_int)(ai@_5, minval)::Int64, 1)::Int64
SSAValue(3) = (Base.add_int)((Base.arrayref)(cnts, SSAValue(2))::Int64, 1)::Int64
(Base.arrayset)(cnts, SSAValue(3), SSAValue(2))::Array{Int64,1}
17:
goto 7
19:
$(Expr(:inbounds, :pop)) # line 8:
$(Expr(:invoke, MethodInstance for cumsum!(::Array{Int64,1}, ::Array{Int64,1}), :(Main.cumsum!), :(cnts), :(cnts))) # line 9:
la = (Base.arraylen)(a)::Int64 # line 10:
res = $(Expr(:foreigncall, :(:jl_alloc_array_1d), Array{Int64,1}, svec(Any, Int64), Array{Int64,1}, 0, :(la), 0)) # line 11:
$(Expr(:inbounds, true))
$(Expr(:inbounds, false))
# meta: location range.jl colon 30
$(Expr(:inbounds, false))
# meta: location range.jl _colon 32
# meta: location range.jl Type 145
# meta: location range.jl Type 93
SSAValue(9) = $(Expr(:invoke, MethodInstance for steprange_last(::Int64, ::Int64, ::Int64), :(Base.steprange_last), :(la), -1, 1))
# meta: pop location
# meta: pop location
# meta: pop location
$(Expr(:inbounds, :pop))
# meta: pop location
$(Expr(:inbounds, :pop))
SSAValue(12) = la
#temp#@_10 = SSAValue(12)
44:
unless (Base.not_int)((Base.or_int)((Base.and_int)((Base.not_int)((SSAValue(12) === SSAValue(9))::Bool)::Bool, (Base.not_int)(((Base.slt_int)(0, -1)::Bool === (Base.slt_int)(SSAValue(12), SSAValue(9))::Bool)::Bool)::Bool)::Bool, (#temp#@_10 === (Base.add_int)(SSAValue(9), -1)::Int64)::Bool)::Bool)::Bool goto 61
SSAValue(13) = #temp#@_10
SSAValue(14) = (Base.add_int)(#temp#@_10, -1)::Int64
i = SSAValue(13)
#temp#@_10 = SSAValue(14) # line 12:
ai@_9 = (Base.add_int)((Base.sub_int)((Base.arrayref)(a, i)::Int64, minval)::Int64, 1)::Int64 # line 13:
c = (Base.arrayref)(cnts, ai@_9)::Int64 # line 14:
SSAValue(6) = (Base.sub_int)((Base.arrayref)(cnts, ai@_9)::Int64, 1)::Int64
(Base.arrayset)(cnts, SSAValue(6), ai@_9)::Array{Int64,1} # line 15:
(Base.arrayset)(res, i, c)::Array{Int64,1}
59:
goto 44
61:
$(Expr(:inbounds, :pop)) # line 17:
return res
end))=>Array{Int64,1}
Main> @code_llvm sortperm_int_range2(a, 1_000_000, 1)
; Function Attrs: uwtable
define i8** @julia_sortperm_int_range2_64498(i8** dereferenceable(40), i64, i64) #0 !dbg !5 {
top:
%3 = call i8**** @jl_get_ptls_states() #4
%4 = alloca [13 x i8**], align 8
%.sub = getelementptr inbounds [13 x i8**], [13 x i8**]* %4, i64 0, i64 0
%5 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 2
%6 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 3
%7 = bitcast [13 x i8**]* %4 to i64*
%8 = bitcast i8*** %5 to i8*
call void @llvm.memset.p0i8.i64(i8* %8, i8 0, i64 88, i32 8, i1 false)
store i64 22, i64* %7, align 8
%9 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 1
%10 = bitcast i8**** %3 to i64*
%11 = load i64, i64* %10, align 8
%12 = bitcast i8*** %9 to i64*
store i64 %11, i64* %12, align 8
store i8*** %.sub, i8**** %3, align 8
%13 = call i8** inttoptr (i64 1801121808 to i8** (i8**, i64)*)(i8** inttoptr (i64 162829584 to i8**), i64 %1)
store i8** %13, i8*** %5, align 8
%14 = call i8** @"jlsys_fill!_42240"(i8** %13, i64 0)
store i8** %14, i8*** %6, align 8
%15 = getelementptr inbounds i8*, i8** %0, i64 1
%16 = bitcast i8** %15 to i64*
%17 = load i64, i64* %16, align 8
%18 = icmp eq i64 %17, 0
br i1 %18, label %L19, label %if.lr.ph
if.lr.ph: ; preds = %top
%19 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 4
%20 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 5
%21 = bitcast i8** %0 to i64**
%22 = load i64*, i64** %21, align 8
%23 = bitcast i8** %14 to i64**
%24 = load i64*, i64** %23, align 8
br label %if
if: ; preds = %if.lr.ph, %if
%"#temp#.07" = phi i64 [ 1, %if.lr.ph ], [ %28, %if ]
%25 = add i64 %"#temp#.07", -1
%26 = getelementptr i64, i64* %22, i64 %25
%27 = load i64, i64* %26, align 8
%28 = add i64 %"#temp#.07", 1
%29 = sub i64 %27, %2
%30 = getelementptr i64, i64* %24, i64 %29
%31 = load i64, i64* %30, align 8
%32 = add i64 %31, 1
store i64 %32, i64* %30, align 8
%33 = icmp eq i64 %"#temp#.07", %17
br i1 %33, label %L7.L19_crit_edge, label %if
L7.L19_crit_edge: ; preds = %if
store i8** %14, i8*** %19, align 8
store i8** %14, i8*** %20, align 8
br label %L19
L19: ; preds = %L7.L19_crit_edge, %top
%34 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 6
%35 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 7
%36 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 8
store i8** %14, i8*** %34, align 8
store i8** %14, i8*** %35, align 8
%37 = call i8** @"julia_cumsum!_64499"(i8** %14, i8** %14)
%38 = load i64, i64* %16, align 8
%39 = call i8** inttoptr (i64 1801121808 to i8** (i8**, i64)*)(i8** inttoptr (i64 162829584 to i8**), i64 %38)
store i8** %39, i8*** %36, align 8
%40 = call i64 @jlsys_steprange_last_58768(i64 %38, i64 -1, i64 1)
%41 = icmp slt i64 %38, %40
%42 = zext i1 %41 to i8
%43 = add i64 %40, -1
%44 = icmp eq i64 %38, %43
%45 = zext i1 %44 to i8
%46 = or i8 %45, %42
%47 = icmp eq i8 %46, 1
br i1 %47, label %L61, label %if3.lr.ph
if3.lr.ph: ; preds = %L19
%48 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 9
%49 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 10
%50 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 11
%51 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 12
%52 = bitcast i8** %0 to i64**
%53 = load i64*, i64** %52, align 8
%54 = bitcast i8** %14 to i64**
%55 = load i64*, i64** %54, align 8
%56 = bitcast i8** %39 to i64**
%57 = load i64*, i64** %56, align 8
br label %if3
if3: ; preds = %if3.lr.ph, %if3
%"#temp#2.06" = phi i64 [ %38, %if3.lr.ph ], [ %58, %if3 ]
%58 = add i64 %"#temp#2.06", -1
%59 = getelementptr i64, i64* %53, i64 %58
%60 = load i64, i64* %59, align 8
%61 = sub i64 %60, %2
%62 = getelementptr i64, i64* %55, i64 %61
%63 = load i64, i64* %62, align 8
%64 = add i64 %63, -1
store i64 %64, i64* %62, align 8
%65 = getelementptr i64, i64* %57, i64 %64
store i64 %"#temp#2.06", i64* %65, align 8
%66 = icmp eq i64 %58, %43
%67 = zext i1 %66 to i8
%68 = or i8 %67, %42
%69 = icmp eq i8 %68, 1
br i1 %69, label %L44.L61_crit_edge, label %if3
L44.L61_crit_edge: ; preds = %if3
store i8** %14, i8*** %48, align 8
store i8** %14, i8*** %49, align 8
store i8** %14, i8*** %50, align 8
store i8** %39, i8*** %51, align 8
br label %L61
L61: ; preds = %L44.L61_crit_edge, %L19
%70 = load i64, i64* %12, align 8
store i64 %70, i64* %10, align 8
ret i8** %39
}
Main> @code_native sortperm_int_range2(a, 1_000_000, 1)
.text
Filename: none
pushq %rbp
movq %rsp, %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rsi
pushq %rdi
pushq %rbx
subq $152, %rsp
movq %r8, %r14
movq %rdx, %rsi
movq %rcx, %r15
movabsq $jl_get_ptls_states, %rax
callq *%rax
xorps %xmm0, %xmm0
movups %xmm0, -88(%rbp)
movups %xmm0, -104(%rbp)
movups %xmm0, -120(%rbp)
movups %xmm0, -136(%rbp)
movups %xmm0, -152(%rbp)
movq $0, -72(%rbp)
movq $22, -168(%rbp)
movq (%rax), %rcx
movq %rcx, -160(%rbp)
leaq -168(%rbp), %rcx
movq %rax, -64(%rbp)
movq %rcx, (%rax)
Source line: 2
movl $jl_alloc_array_1d, %eax
movl $162829584, %ecx # imm = 0x9B49510
movq %rsi, %rdx
callq *%rax
movq %rax, -152(%rbp)
movabsq $"fill!", %rbx
xorl %edx, %edx
movq %rax, %rcx
callq *%rbx
movq %rax, %r13
movq %r13, -144(%rbp)
Source line: 4
movq 8(%r15), %rax
testq %rax, %rax
je L222
movq (%r15), %rcx
Source line: 5
movq (%r13), %rdx
nopw %cs:(%rax,%rax)
Source line: 4
L192:
movq (%rcx), %rbx
Source line: 5
subq %r14, %rbx
incq jl_system_image_data(%rdx,%rbx,8)
Source line: 4
addq $8, %rcx
decq %rax
jne L192
Source line: 5
movq %r13, -136(%rbp)
movq %r13, -128(%rbp)
Source line: 8
L222:
movq %r13, -120(%rbp)
movq %r13, -112(%rbp)
movabsq $"cumsum!", %rax
movq %r13, %rcx
movq %r13, %rdx
callq *%rax
Source line: 9
movq 8(%r15), %rbx
Source line: 10
movl $jl_alloc_array_1d, %eax
movl $162829584, %ecx # imm = 0x9B49510
movq %rbx, %rdx
callq *%rax
movq %rax, %r12
movq %r12, -104(%rbp)
Source line: 93
movabsq $steprange_last, %rax
movq $-1, %rdx
movl $1, %r8d
movq %rbx, %rcx
callq *%rax
Source line: 11
cmpq %rax, %rbx
setl %r9b
setge %r8b
leaq -1(%rax), %rdi
cmpq %rdi, %rbx
setne %dl
andb %r8b, %dl
cmpb $1, %dl
jne L411
Source line: 12
movq (%r15), %r8
Source line: 13
movq (%r13), %rsi
Source line: 15
movq (%r12), %r10
nopw %cs:(%rax,%rax)
Source line: 12
L352:
movq -8(%r8,%rbx,8), %rcx
subq %r14, %rcx
Source line: 13
movq (%rsi,%rcx,8), %rdx
Source line: 14
leaq -1(%rdx), %rdi
movq %rdi, (%rsi,%rcx,8)
Source line: 15
movq %rbx, -8(%r10,%rdx,8)
Source line: 11
cmpq %rbx, %rax
leaq -1(%rbx), %rbx
sete %cl
orb %r9b, %cl
cmpb $1, %cl
jne L352
Source line: 13
movq %r13, -96(%rbp)
Source line: 14
movq %r13, -88(%rbp)
movq %r13, -80(%rbp)
Source line: 15
movq %r12, -72(%rbp)
Source line: 17
L411:
movq -160(%rbp), %rax
movq -64(%rbp), %rcx
movq %rax, (%rcx)
movq %r12, %rax
addq $152, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
retq