They don’t detail contents that beat counting sort, I think they detail how to optimize radix sort to beat a basic application of radix sort. And they have some tricks to deal with partially sorted data better. I have read them as part of my research; again radix sort is nothing but applying counting sort multiple times; each time it iterates through all the elements and assign them to a new location; they can’t use counting sort in one go for 64 bit types as there are 2^64 possible unique values, so they iterate through all the elements using 8 bits (or 11 bits) at a time so each time there are only 2^8 (2^11) unique values to count.
In this particular case, there are only 1_000_000 unique values, and it’s known in advance, hence a straight counting sort will beat radix sort, if things like cache-performance and generated code don’t come into the picture. Theoretically, counting sort will always be faster, for the simple reason that it only needs one pass over the data verses, radix sort’s 64/8 or (64/11) passes, and that’s why Base uses counting sort in sort_int_range! which is demonstrably faster than R’s radix sort as shown in the original post. It’s just that sortperm that has poor performance but it’ using the same underlying “counting sort”-like algorithm.
Getting back into the real world, in practice, things like cache (which I don’t understand very well) and other CPU and other hardware related things matter, the low-level code being generated also matters. That’s why I am leaning towards checking out the generated code for any issues. This might even help towards closing the issue that was opened 5-6 years ago regarding poor sortperm performance.
function sortperm_int_range2(a, rangelen, minval)
    cnts = fill(0, rangelen)
    # create a count first
    @inbounds for ai in a
        cnts[ai - minval + 1] +=  1
    end
    # create cumulative sum
    cumsum!(cnts, cnts)
    la = length(a)
    res = Vector{Int}(la)
    @inbounds for i in la:-1:1
        ai = a[i] - minval + 1
        c = cnts[ai]
        cnts[ai] -= 1
        res[c] = i
    end
    res
end
@code_lowered @code_typed @code_llvm @code_native
Main> @code_lowered sortperm_int_range2(a, 1_000_000, 1)
CodeInfo(:(begin
        nothing
        NewvarNode(:(la))
        NewvarNode(:(res))
        cnts = (Main.fill)(0, rangelen) # line 4:
        $(Expr(:inbounds, true))
        SSAValue(0) = a
        #temp#@_6 = (Base.start)(SSAValue(0))
        9:
        unless !((Base.done)(SSAValue(0), #temp#@_6)) goto 20
        SSAValue(1) = (Base.next)(SSAValue(0), #temp#@_6)
        ai@_5 = (Core.getfield)(SSAValue(1), 1)
        #temp#@_6 = (Core.getfield)(SSAValue(1), 2) # line 5:
        SSAValue(2) = (ai@_5 - minval) + 1
        SSAValue(3) = (Main.getindex)(cnts, SSAValue(2)) + 1
        (Main.setindex!)(cnts, SSAValue(3), SSAValue(2))
        18:
        goto 9
        20:
        $(Expr(:inbounds, :pop)) # line 8:
        (Main.cumsum!)(cnts, cnts) # line 9:
        la = (Main.length)(a) # line 10:
        res = ((Core.apply_type)(Main.Vector, Main.Int))(la) # line 11:
        $(Expr(:inbounds, true))
        SSAValue(4) = (Main.colon)(la, -1, 1)
        #temp#@_10 = (Base.start)(SSAValue(4))
        32:
        unless !((Base.done)(SSAValue(4), #temp#@_10)) goto 48
        SSAValue(5) = (Base.next)(SSAValue(4), #temp#@_10)
        i = (Core.getfield)(SSAValue(5), 1)
        #temp#@_10 = (Core.getfield)(SSAValue(5), 2) # line 12:
        ai@_9 = ((Main.getindex)(a, i) - minval) + 1 # line 13:
        c = (Main.getindex)(cnts, ai@_9) # line 14:
        SSAValue(6) = (Main.getindex)(cnts, ai@_9) - 1
        (Main.setindex!)(cnts, SSAValue(6), ai@_9) # line 15:
        (Main.setindex!)(res, i, c)
        46:
        goto 32
        48:
        $(Expr(:inbounds, :pop)) # line 17:
        return res
    end))
Main> @code_typed sortperm_int_range2(a, 1_000_000, 1)
CodeInfo(:(begin
        NewvarNode(:(la))
        NewvarNode(:(res))
        cnts = $(Expr(:invoke, MethodInstance for fill!(::Array{Int64,1}, ::Int64), :(Base.fill!), :($(Expr(:foreigncall, :(:jl_alloc_array_1d), Array{Int64,1}, svec(Any, Int64), Array{Int64,1}, 0, :(rangelen), 0))), 0)) # line 4:
        $(Expr(:inbounds, true))
        #temp#@_6 = 1
        7:
        unless (Base.not_int)((#temp#@_6 === (Base.add_int)((Base.arraylen)(a)::Int64, 1)::Int64)::Bool)::Bool goto 19
        SSAValue(10) = (Base.arrayref)(a, #temp#@_6)::Int64
        SSAValue(11) = (Base.add_int)(#temp#@_6, 1)::Int64
        ai@_5 = SSAValue(10)
        #temp#@_6 = SSAValue(11) # line 5:
        SSAValue(2) = (Base.add_int)((Base.sub_int)(ai@_5, minval)::Int64, 1)::Int64
        SSAValue(3) = (Base.add_int)((Base.arrayref)(cnts, SSAValue(2))::Int64, 1)::Int64
        (Base.arrayset)(cnts, SSAValue(3), SSAValue(2))::Array{Int64,1}
        17:
        goto 7
        19:
        $(Expr(:inbounds, :pop)) # line 8:
        $(Expr(:invoke, MethodInstance for cumsum!(::Array{Int64,1}, ::Array{Int64,1}), :(Main.cumsum!), :(cnts), :(cnts))) # line 9:
        la = (Base.arraylen)(a)::Int64 # line 10:
        res = $(Expr(:foreigncall, :(:jl_alloc_array_1d), Array{Int64,1}, svec(Any, Int64), Array{Int64,1}, 0, :(la), 0)) # line 11:
        $(Expr(:inbounds, true))
        $(Expr(:inbounds, false))
        # meta: location range.jl colon 30
        $(Expr(:inbounds, false))
        # meta: location range.jl _colon 32
        # meta: location range.jl Type 145
        # meta: location range.jl Type 93
        SSAValue(9) = $(Expr(:invoke, MethodInstance for steprange_last(::Int64, ::Int64, ::Int64), :(Base.steprange_last), :(la), -1, 1))
        # meta: pop location
        # meta: pop location
        # meta: pop location
        $(Expr(:inbounds, :pop))
        # meta: pop location
        $(Expr(:inbounds, :pop))
        SSAValue(12) = la
        #temp#@_10 = SSAValue(12)
        44:
        unless (Base.not_int)((Base.or_int)((Base.and_int)((Base.not_int)((SSAValue(12) === SSAValue(9))::Bool)::Bool, (Base.not_int)(((Base.slt_int)(0, -1)::Bool === (Base.slt_int)(SSAValue(12), SSAValue(9))::Bool)::Bool)::Bool)::Bool, (#temp#@_10 === (Base.add_int)(SSAValue(9), -1)::Int64)::Bool)::Bool)::Bool goto 61
        SSAValue(13) = #temp#@_10
        SSAValue(14) = (Base.add_int)(#temp#@_10, -1)::Int64
        i = SSAValue(13)
        #temp#@_10 = SSAValue(14) # line 12:
        ai@_9 = (Base.add_int)((Base.sub_int)((Base.arrayref)(a, i)::Int64, minval)::Int64, 1)::Int64 # line 13:
        c = (Base.arrayref)(cnts, ai@_9)::Int64 # line 14:
        SSAValue(6) = (Base.sub_int)((Base.arrayref)(cnts, ai@_9)::Int64, 1)::Int64
        (Base.arrayset)(cnts, SSAValue(6), ai@_9)::Array{Int64,1} # line 15:
        (Base.arrayset)(res, i, c)::Array{Int64,1}
        59:
        goto 44
        61:
        $(Expr(:inbounds, :pop)) # line 17:
        return res
    end))=>Array{Int64,1}
Main> @code_llvm sortperm_int_range2(a, 1_000_000, 1)
; Function Attrs: uwtable
define i8** @julia_sortperm_int_range2_64498(i8** dereferenceable(40), i64, i64) #0 !dbg !5 {
top:
  %3 = call i8**** @jl_get_ptls_states() #4
  %4 = alloca [13 x i8**], align 8
  %.sub = getelementptr inbounds [13 x i8**], [13 x i8**]* %4, i64 0, i64 0
  %5 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 2
  %6 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 3
  %7 = bitcast [13 x i8**]* %4 to i64*
  %8 = bitcast i8*** %5 to i8*
  call void @llvm.memset.p0i8.i64(i8* %8, i8 0, i64 88, i32 8, i1 false)
  store i64 22, i64* %7, align 8
  %9 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 1
  %10 = bitcast i8**** %3 to i64*
  %11 = load i64, i64* %10, align 8
  %12 = bitcast i8*** %9 to i64*
  store i64 %11, i64* %12, align 8
  store i8*** %.sub, i8**** %3, align 8
  %13 = call i8** inttoptr (i64 1801121808 to i8** (i8**, i64)*)(i8** inttoptr (i64 162829584 to i8**), i64 %1)
  store i8** %13, i8*** %5, align 8
  %14 = call i8** @"jlsys_fill!_42240"(i8** %13, i64 0)
  store i8** %14, i8*** %6, align 8
  %15 = getelementptr inbounds i8*, i8** %0, i64 1
  %16 = bitcast i8** %15 to i64*
  %17 = load i64, i64* %16, align 8
  %18 = icmp eq i64 %17, 0
  br i1 %18, label %L19, label %if.lr.ph
if.lr.ph:                                         ; preds = %top
  %19 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 4
  %20 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 5
  %21 = bitcast i8** %0 to i64**
  %22 = load i64*, i64** %21, align 8
  %23 = bitcast i8** %14 to i64**
  %24 = load i64*, i64** %23, align 8
  br label %if
if:                                               ; preds = %if.lr.ph, %if
  %"#temp#.07" = phi i64 [ 1, %if.lr.ph ], [ %28, %if ]
  %25 = add i64 %"#temp#.07", -1
  %26 = getelementptr i64, i64* %22, i64 %25
  %27 = load i64, i64* %26, align 8
  %28 = add i64 %"#temp#.07", 1
  %29 = sub i64 %27, %2
  %30 = getelementptr i64, i64* %24, i64 %29
  %31 = load i64, i64* %30, align 8
  %32 = add i64 %31, 1
  store i64 %32, i64* %30, align 8
  %33 = icmp eq i64 %"#temp#.07", %17
  br i1 %33, label %L7.L19_crit_edge, label %if
L7.L19_crit_edge:                                 ; preds = %if
  store i8** %14, i8*** %19, align 8
  store i8** %14, i8*** %20, align 8
  br label %L19
L19:                                              ; preds = %L7.L19_crit_edge, %top
  %34 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 6
  %35 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 7
  %36 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 8
  store i8** %14, i8*** %34, align 8
  store i8** %14, i8*** %35, align 8
  %37 = call i8** @"julia_cumsum!_64499"(i8** %14, i8** %14)
  %38 = load i64, i64* %16, align 8
  %39 = call i8** inttoptr (i64 1801121808 to i8** (i8**, i64)*)(i8** inttoptr (i64 162829584 to i8**), i64 %38)
  store i8** %39, i8*** %36, align 8
  %40 = call i64 @jlsys_steprange_last_58768(i64 %38, i64 -1, i64 1)
  %41 = icmp slt i64 %38, %40
  %42 = zext i1 %41 to i8
  %43 = add i64 %40, -1
  %44 = icmp eq i64 %38, %43
  %45 = zext i1 %44 to i8
  %46 = or i8 %45, %42
  %47 = icmp eq i8 %46, 1
  br i1 %47, label %L61, label %if3.lr.ph
if3.lr.ph:                                        ; preds = %L19
  %48 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 9
  %49 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 10
  %50 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 11
  %51 = getelementptr [13 x i8**], [13 x i8**]* %4, i64 0, i64 12
  %52 = bitcast i8** %0 to i64**
  %53 = load i64*, i64** %52, align 8
  %54 = bitcast i8** %14 to i64**
  %55 = load i64*, i64** %54, align 8
  %56 = bitcast i8** %39 to i64**
  %57 = load i64*, i64** %56, align 8
  br label %if3
if3:                                              ; preds = %if3.lr.ph, %if3
  %"#temp#2.06" = phi i64 [ %38, %if3.lr.ph ], [ %58, %if3 ]
  %58 = add i64 %"#temp#2.06", -1
  %59 = getelementptr i64, i64* %53, i64 %58
  %60 = load i64, i64* %59, align 8
  %61 = sub i64 %60, %2
  %62 = getelementptr i64, i64* %55, i64 %61
  %63 = load i64, i64* %62, align 8
  %64 = add i64 %63, -1
  store i64 %64, i64* %62, align 8
  %65 = getelementptr i64, i64* %57, i64 %64
  store i64 %"#temp#2.06", i64* %65, align 8
  %66 = icmp eq i64 %58, %43
  %67 = zext i1 %66 to i8
  %68 = or i8 %67, %42
  %69 = icmp eq i8 %68, 1
  br i1 %69, label %L44.L61_crit_edge, label %if3
L44.L61_crit_edge:                                ; preds = %if3
  store i8** %14, i8*** %48, align 8
  store i8** %14, i8*** %49, align 8
  store i8** %14, i8*** %50, align 8
  store i8** %39, i8*** %51, align 8
  br label %L61
L61:                                              ; preds = %L44.L61_crit_edge, %L19
  %70 = load i64, i64* %12, align 8
  store i64 %70, i64* %10, align 8
  ret i8** %39
}
Main> @code_native sortperm_int_range2(a, 1_000_000, 1)
        .text
Filename: none
        pushq   %rbp
        movq    %rsp, %rbp
        pushq   %r15
        pushq   %r14
        pushq   %r13
        pushq   %r12
        pushq   %rsi
        pushq   %rdi
        pushq   %rbx
        subq    $152, %rsp
        movq    %r8, %r14
        movq    %rdx, %rsi
        movq    %rcx, %r15
        movabsq $jl_get_ptls_states, %rax
        callq   *%rax
        xorps   %xmm0, %xmm0
        movups  %xmm0, -88(%rbp)
        movups  %xmm0, -104(%rbp)
        movups  %xmm0, -120(%rbp)
        movups  %xmm0, -136(%rbp)
        movups  %xmm0, -152(%rbp)
        movq    $0, -72(%rbp)
        movq    $22, -168(%rbp)
        movq    (%rax), %rcx
        movq    %rcx, -160(%rbp)
        leaq    -168(%rbp), %rcx
        movq    %rax, -64(%rbp)
        movq    %rcx, (%rax)
Source line: 2
        movl    $jl_alloc_array_1d, %eax
        movl    $162829584, %ecx        # imm = 0x9B49510
        movq    %rsi, %rdx
        callq   *%rax
        movq    %rax, -152(%rbp)
        movabsq $"fill!", %rbx
        xorl    %edx, %edx
        movq    %rax, %rcx
        callq   *%rbx
        movq    %rax, %r13
        movq    %r13, -144(%rbp)
Source line: 4
        movq    8(%r15), %rax
        testq   %rax, %rax
        je      L222
        movq    (%r15), %rcx
Source line: 5
        movq    (%r13), %rdx
        nopw    %cs:(%rax,%rax)
Source line: 4
L192:
        movq    (%rcx), %rbx
Source line: 5
        subq    %r14, %rbx
        incq    jl_system_image_data(%rdx,%rbx,8)
Source line: 4
        addq    $8, %rcx
        decq    %rax
        jne     L192
Source line: 5
        movq    %r13, -136(%rbp)
        movq    %r13, -128(%rbp)
Source line: 8
L222:
        movq    %r13, -120(%rbp)
        movq    %r13, -112(%rbp)
        movabsq $"cumsum!", %rax
        movq    %r13, %rcx
        movq    %r13, %rdx
        callq   *%rax
Source line: 9
        movq    8(%r15), %rbx
Source line: 10
        movl    $jl_alloc_array_1d, %eax
        movl    $162829584, %ecx        # imm = 0x9B49510
        movq    %rbx, %rdx
        callq   *%rax
        movq    %rax, %r12
        movq    %r12, -104(%rbp)
Source line: 93
        movabsq $steprange_last, %rax
        movq    $-1, %rdx
        movl    $1, %r8d
        movq    %rbx, %rcx
        callq   *%rax
Source line: 11
        cmpq    %rax, %rbx
        setl    %r9b
        setge   %r8b
        leaq    -1(%rax), %rdi
        cmpq    %rdi, %rbx
        setne   %dl
        andb    %r8b, %dl
        cmpb    $1, %dl
        jne     L411
Source line: 12
        movq    (%r15), %r8
Source line: 13
        movq    (%r13), %rsi
Source line: 15
        movq    (%r12), %r10
        nopw    %cs:(%rax,%rax)
Source line: 12
L352:
        movq    -8(%r8,%rbx,8), %rcx
        subq    %r14, %rcx
Source line: 13
        movq    (%rsi,%rcx,8), %rdx
Source line: 14
        leaq    -1(%rdx), %rdi
        movq    %rdi, (%rsi,%rcx,8)
Source line: 15
        movq    %rbx, -8(%r10,%rdx,8)
Source line: 11
        cmpq    %rbx, %rax
        leaq    -1(%rbx), %rbx
        sete    %cl
        orb     %r9b, %cl
        cmpb    $1, %cl
        jne     L352
Source line: 13
        movq    %r13, -96(%rbp)
Source line: 14
        movq    %r13, -88(%rbp)
        movq    %r13, -80(%rbp)
Source line: 15
        movq    %r12, -72(%rbp)
Source line: 17
L411:
        movq    -160(%rbp), %rax
        movq    -64(%rbp), %rcx
        movq    %rax, (%rcx)
        movq    %r12, %rax
        addq    $152, %rsp
        popq    %rbx
        popq    %rdi
        popq    %rsi
        popq    %r12
        popq    %r13
        popq    %r14
        popq    %r15
        popq    %rbp
        retq