If you look at the LLVM:
With `@inbounds`:
define double @julia_foo_5533(i64 signext %"n::Int64") #0 {
top:
%.not = icmp eq i64 %"n::Int64", 0
br i1 %.not, label %L3, label %L7
L3: ; preds = %top
%.instance = load atomic ptr, ptr getelementptr inbounds (ptr, ptr @"+Core.GenericMemory#5538.jit", i64 4) unordered, align 16
%.not79 = icmp eq ptr %.instance, null
br i1 %.not79, label %fail, label %L144
L7: ; preds = %top
%"Memory{Float64}[]" = call ptr @jl_alloc_genericmemory(ptr nonnull @"+Core.GenericMemory#5538.jit", i64 %"n::Int64")
%.data_ptr = getelementptr inbounds { i64, ptr }, ptr %"Memory{Float64}[]", i64 0, i32 1
%0 = load ptr, ptr %.data_ptr, align 8
%1 = icmp slt i64 %"n::Int64", 1
br i1 %1, label %pass21, label %L25.preheader
L25.preheader: ; preds = %L7
%2 = shl nuw i64 %"n::Int64", 3
call void @llvm.memset.p0.i64(ptr nonnull align 8 %0, i8 0, i64 %2, i1 false)
br label %pass21
L112.preheader: ; preds = %pass21
%min.iters.check = icmp ult i64 %11, 8
br i1 %min.iters.check, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %L112.preheader
%n.vec = and i64 %11, -8
%ind.end = or i64 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%3 = getelementptr inbounds double, ptr %0, i64 %index
store <2 x double> <double 1.000000e+00, double 1.000000e+00>, ptr %3, align 8
%4 = getelementptr inbounds double, ptr %3, i64 2
store <2 x double> <double 1.000000e+00, double 1.000000e+00>, ptr %4, align 8
%5 = getelementptr inbounds double, ptr %3, i64 4
store <2 x double> <double 1.000000e+00, double 1.000000e+00>, ptr %5, align 8
%6 = getelementptr inbounds double, ptr %3, i64 6
store <2 x double> <double 1.000000e+00, double 1.000000e+00>, ptr %6, align 8
%index.next = add nuw i64 %index, 8
%7 = icmp eq i64 %index.next, %n.vec
br i1 %7, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i64 %11, %n.vec
br i1 %cmp.n, label %L144, label %scalar.ph
scalar.ph: ; preds = %middle.block, %L112.preheader
%bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L112.preheader ]
br label %L112
L112: ; preds = %L112, %scalar.ph
%value_phi33 = phi i64 [ %10, %L112 ], [ %bc.resume.val, %scalar.ph ]
%8 = add nsw i64 %value_phi33, -1
%9 = getelementptr inbounds double, ptr %0, i64 %8
store double 1.000000e+00, ptr %9, align 8
%.not84.not = icmp eq i64 %value_phi33, %11
%10 = add nuw nsw i64 %value_phi33, 1
br i1 %.not84.not, label %L144, label %L112
L144: ; preds = %pass21, %L112, %middle.block, %L3
ret double 0.000000e+00
fail: ; preds = %L3
%jl_undefref_exception = load ptr, ptr @jl_undefref_exception, align 8
call void @ijl_throw(ptr nonnull %jl_undefref_exception)
unreachable
pass21: ; preds = %L25.preheader, %L7
%11 = sdiv i64 %"n::Int64", 2
%12 = and i64 %"n::Int64", -2
%.not83 = icmp eq i64 %12, 2
%13 = icmp sgt i64 %"n::Int64", 3
%or.cond = or i1 %.not83, %13
br i1 %or.cond, label %L112.preheader, label %L144
}
and without:
define double @julia_foo_bounds_check_5557(i64 signext %"n::Int64") #0 {
top:
%gcframe1 = alloca [3 x ptr], align 16
call void @llvm.memset.p0.i64(ptr align 16 %gcframe1, i8 0, i64 24, i1 true)
%"new::Tuple53" = alloca [1 x i64], align 8
%pgcstack = call ptr inttoptr (i64 6882921892 to ptr)(i64 261) #13
store i64 4, ptr %gcframe1, align 16
%task.gcstack = load ptr, ptr %pgcstack, align 8
%frame.prev = getelementptr inbounds ptr, ptr %gcframe1, i64 1
store ptr %task.gcstack, ptr %frame.prev, align 8
store ptr %gcframe1, ptr %pgcstack, align 8
%.not = icmp eq i64 %"n::Int64", 0
br i1 %.not, label %L3, label %L5
L3: ; preds = %top
%.instance = load atomic ptr, ptr getelementptr inbounds (ptr, ptr @"+Core.GenericMemory#5562.jit", i64 4) unordered, align 16
%.not79 = icmp eq ptr %.instance, null
br i1 %.not79, label %fail, label %L7
L5: ; preds = %top
%"Memory{Float64}[]" = call ptr @jl_alloc_genericmemory(ptr nonnull @"+Core.GenericMemory#5562.jit", i64 %"n::Int64")
br label %L7
L7: ; preds = %L5, %L3
%0 = phi ptr [ %"Memory{Float64}[]", %L5 ], [ %.instance, %L3 ]
%.data_ptr = getelementptr inbounds { i64, ptr }, ptr %0, i64 0, i32 1
%1 = load ptr, ptr %.data_ptr, align 8
%gc_slot_addr_0 = getelementptr inbounds ptr, ptr %gcframe1, i64 2
store ptr %0, ptr %gc_slot_addr_0, align 16
%ptls_field = getelementptr inbounds ptr, ptr %pgcstack, i64 2
%ptls_load = load ptr, ptr %ptls_field, align 8
%"new::Array" = call noalias nonnull align 8 dereferenceable(32) ptr @ijl_gc_pool_alloc_instrumented(ptr %ptls_load, i32 800, i32 32, i64 5284829520) #10
%"new::Array.tag_addr" = getelementptr inbounds i64, ptr %"new::Array", i64 -1
store atomic i64 5284829520, ptr %"new::Array.tag_addr" unordered, align 8
%2 = getelementptr inbounds ptr, ptr %"new::Array", i64 1
store ptr %1, ptr %"new::Array", align 8
store ptr %0, ptr %2, align 8
%"new::Array.size_ptr" = getelementptr inbounds i8, ptr %"new::Array", i64 16
store i64 %"n::Int64", ptr %"new::Array.size_ptr", align 8
%3 = icmp slt i64 %"n::Int64", 1
br i1 %3, label %pass21, label %L25.preheader
L25.preheader: ; preds = %L7
%4 = shl nuw i64 %"n::Int64", 3
call void @llvm.memset.p0.i64(ptr nonnull align 8 %1, i8 0, i64 %4, i1 false)
br label %pass21
L112.preheader89: ; preds = %pass21
%5 = load ptr, ptr %"new::Array", align 8
%exit.mainloop.at = call i64 @llvm.umin.i64(i64 %15, i64 %"n::Int64")
%umax = call i64 @llvm.umax.i64(i64 %exit.mainloop.at, i64 1)
%min.iters.check = icmp ult i64 %umax, 8
br i1 %min.iters.check, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %L112.preheader89
%n.vec = and i64 %umax, -8
%ind.end = or i64 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%6 = getelementptr inbounds double, ptr %5, i64 %index
store <2 x double> <double 1.000000e+00, double 1.000000e+00>, ptr %6, align 8
%7 = getelementptr inbounds double, ptr %6, i64 2
store <2 x double> <double 1.000000e+00, double 1.000000e+00>, ptr %7, align 8
%8 = getelementptr inbounds double, ptr %6, i64 4
store <2 x double> <double 1.000000e+00, double 1.000000e+00>, ptr %8, align 8
%9 = getelementptr inbounds double, ptr %6, i64 6
store <2 x double> <double 1.000000e+00, double 1.000000e+00>, ptr %9, align 8
%index.next = add nuw i64 %index, 8
%10 = icmp eq i64 %index.next, %n.vec
br i1 %10, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i64 %umax, %n.vec
br i1 %cmp.n, label %main.exit.selector, label %scalar.ph
scalar.ph: ; preds = %middle.block, %L112.preheader89
%bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L112.preheader89 ]
br label %L128
L125: ; preds = %L112.postloop
store i64 %value_phi33.postloop, ptr %"new::Tuple53", align 8
store ptr %"new::Array", ptr %gc_slot_addr_0, align 16
call void @j_throw_boundserror_5578(ptr nonnull %"new::Array", ptr nocapture nonnull readonly %"new::Tuple53") #3
unreachable
L128: ; preds = %L128, %scalar.ph
%value_phi33 = phi i64 [ %13, %L128 ], [ %bc.resume.val, %scalar.ph ]
%11 = add nsw i64 %value_phi33, -1
%12 = getelementptr inbounds double, ptr %5, i64 %11
store double 1.000000e+00, ptr %12, align 8
%13 = add nuw i64 %value_phi33, 1
%.not93 = icmp ult i64 %value_phi33, %exit.mainloop.at
br i1 %.not93, label %L128, label %main.exit.selector
main.exit.selector: ; preds = %L128, %middle.block
%value_phi33.lcssa = phi i64 [ %umax, %middle.block ], [ %value_phi33, %L128 ]
%.lcssa = phi i64 [ %ind.end, %middle.block ], [ %13, %L128 ]
%14 = icmp ult i64 %value_phi33.lcssa, %15
br i1 %14, label %L112.postloop, label %L144
L144: ; preds = %L128.postloop, %pass21, %main.exit.selector
%frame.prev106 = load ptr, ptr %frame.prev, align 8
store ptr %frame.prev106, ptr %pgcstack, align 8
ret double 0.000000e+00
fail: ; preds = %L3
%jl_undefref_exception = load ptr, ptr @jl_undefref_exception, align 8
call void @ijl_throw(ptr nonnull %jl_undefref_exception)
unreachable
pass21: ; preds = %L25.preheader, %L7
%15 = sdiv i64 %"n::Int64", 2
%16 = and i64 %"n::Int64", -2
%.not83 = icmp eq i64 %16, 2
%17 = icmp sgt i64 %"n::Int64", 3
%or.cond = or i1 %.not83, %17
br i1 %or.cond, label %L112.preheader89, label %L144
L112.postloop: ; preds = %L128.postloop, %main.exit.selector
%value_phi33.postloop = phi i64 [ %20, %L128.postloop ], [ %.lcssa, %main.exit.selector ]
%18 = add i64 %value_phi33.postloop, -1
%.not84.postloop = icmp ult i64 %18, %"n::Int64"
br i1 %.not84.postloop, label %L128.postloop, label %L125
L128.postloop: ; preds = %L112.postloop
%19 = getelementptr inbounds double, ptr %5, i64 %18
store double 1.000000e+00, ptr %19, align 8
%.not85.not.postloop = icmp eq i64 %value_phi33.postloop, %15
%20 = add i64 %value_phi33.postloop, 1
br i1 %.not85.not.postloop, label %L144, label %L112.postloop
}
You can see this extra bit at the top:
%gcframe1 = alloca [3 x ptr], align 16
call void @llvm.memset.p0.i64(ptr align 16 %gcframe1, i8 0, i64 24, i1 true)
%"new::Tuple53" = alloca [1 x i64], align 8
%pgcstack = call ptr inttoptr (i64 6882921892 to ptr)(i64 261) #13
store i64 4, ptr %gcframe1, align 16
%task.gcstack = load ptr, ptr %pgcstack, align 8
I suppose this is only because the compiler can’t (yet) prove the function won’t throw an error so sets up the storage for error metadata. And yeah I guess 1.12 is better at that somehow.
It’s fun to play with LLVM effect settings for this to provide some hints:
julia> Base.@assume_effects(
:consistent,
:terminates_globally,
:noub,
function foo(n)
x = zeros(n)
for i = 1:nĂ·2; x[i] = 1; end
return sum(x)
end
)
julia> @btime foo(30)
1.167 ns (0 allocations: 0 bytes)
15.0
(Though it appears to now be cheating the problem :P)