Is the triple `@inbounds @fastmath @simd` necessary for absolute peak performance?

This is not always the case, unfortunately. Only in simple cases can the compiler elide bounds checks when iterating with `eachindex`. For example:

julia> using LinearAlgebra

julia> function f(D)  # sum the elements of D by explicit indexing over eachindex(D)
       s = zero(eltype(D))  # accumulator starts at the zero of D's element type
       for i in eachindex(D)  # visit every index that eachindex yields for D
           s += D[i]  # this getindex call is where the bounds check survives (see the LLVM output below)
       end
       s  # value of the function: the accumulated sum
       end
f (generic function with 1 method)

julia> D = Diagonal(1:3);

julia> @code_llvm f(D)

The generated LLVM IR (excerpt below) shows that the bounds checks are still present:

L28:                                              ; preds = %L65, %top
  %value_phi13 = phi i64 [ %value_phi51, %L65 ], [ 1, %top ]
  %value_phi14 = phi i64 [ %value_phi52, %L65 ], [ 1, %top ]
  %value_phi19 = phi i64 [ %9, %L65 ], [ 0, %top ]
;  @ REPL[2]:4 within `f`
; ┌ @ abstractarray.jl:1312 within `getindex`
; │┌ @ abstractarray.jl:1358 within `_getindex`
; ││┌ @ /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/LinearAlgebra/src/diagonal.jl:177 within `getindex`
; │││┌ @ abstractarray.jl:699 within `checkbounds` @ abstractarray.jl:681
; ││││┌ @ abstractarray.jl:725 within `checkbounds_indices`
; │││││┌ @ abstractarray.jl:754 within `checkindex`
; ││││││┌ @ int.jl:86 within `-`
         %3 = add i64 %value_phi13, -1
; ││││││└
; ││││││┌ @ int.jl:513 within `<`
         %4 = icmp uge i64 %3, %1
; │││││└└
; │││││ @ abstractarray.jl:725 within `checkbounds_indices` @ abstractarray.jl:725
; │││││┌ @ abstractarray.jl:754 within `checkindex`
; ││││││┌ @ int.jl:86 within `-`
         %5 = add i64 %value_phi14, -1
; ││││││└
; ││││││┌ @ int.jl:513 within `<`
         %6 = icmp uge i64 %5, %1
; ││││└└└
; ││││ @ abstractarray.jl:699 within `checkbounds`
      %.not70 = or i1 %4, %6
      br i1 %.not70, label %L62, label %L65

L62:                                              ; preds = %L28
      %7 = getelementptr inbounds [2 x i64], ptr %"new::Tuple", i64 0, i64 1
; ││││ @ abstractarray.jl:697 within `checkbounds`
      store i64 %value_phi13, ptr %"new::Tuple", align 8
      store i64 %value_phi14, ptr %7, align 8
; ││││ @ abstractarray.jl:699 within `checkbounds`
      call void @j_throw_boundserror_1320(ptr nocapture nonnull readonly %"D::Diagonal", ptr nocapture nonnull readonly %"new::Tuple") #8
      unreachable
2 Likes