Also, you should be careful that the compiler may change the behavior in a loop. It didn’t in that case because bounds checking was left on and you didn’t use `@simd`, but see how much the code changes:
# Sum f(data[i]) for i in 1:cases, with bounds checks removed and
# SIMD vectorization enabled so LLVM can emit the vector code shown below.
function loop(f, data, cases)
    # Typed accumulator keeps the reduction concretely Int64.
    acc::Int64 = 0
    @inbounds @simd for idx in 1:cases
        acc += f(data[idx])
    end
    return acc
end
@code_llvm debuginfo=:none loop(f2, data, 800)
define i64 @julia_loop_18052(%jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40), i64) {
top:
%2 = icmp sgt i64 %1, 0
%3 = select i1 %2, i64 %1, i64 0
%4 = add nsw i64 %3, -1
%5 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %4, i64 1)
%6 = extractvalue { i64, i1 } %5, 1
br i1 %6, label %L16, label %L21
L16: ; preds = %top
call void @julia_throw_overflowerr_binaryop_12551(%jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 140605855187504 to %jl_value_t*) to %jl_value_t addrspace(10)*), i64 %4, i64 1)
call void @llvm.trap()
unreachable
L21: ; preds = %top
%7 = extractvalue { i64, i1 } %5, 0
%8 = icmp slt i64 %7, 1
br i1 %8, label %L63, label %L28.lr.ph
L28.lr.ph: ; preds = %L21
%9 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
%10 = bitcast %jl_value_t addrspace(11)* %9 to i64 addrspace(13)* addrspace(11)*
%11 = load i64 addrspace(13)*, i64 addrspace(13)* addrspace(11)* %10, align 8
%min.iters.check = icmp ult i64 %3, 32
br i1 %min.iters.check, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %L28.lr.ph
%n.vec = and i64 %3, 9223372036854775776
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <8 x i64> [ zeroinitializer, %vector.ph ], [ %36, %vector.body ]
%vec.phi19 = phi <8 x i64> [ zeroinitializer, %vector.ph ], [ %37, %vector.body ]
%vec.phi20 = phi <8 x i64> [ zeroinitializer, %vector.ph ], [ %38, %vector.body ]
%vec.phi21 = phi <8 x i64> [ zeroinitializer, %vector.ph ], [ %39, %vector.body ]
%12 = getelementptr inbounds i64, i64 addrspace(13)* %11, i64 %index
%13 = bitcast i64 addrspace(13)* %12 to <8 x i64> addrspace(13)*
%wide.load = load <8 x i64>, <8 x i64> addrspace(13)* %13, align 8
%14 = getelementptr inbounds i64, i64 addrspace(13)* %12, i64 8
%15 = bitcast i64 addrspace(13)* %14 to <8 x i64> addrspace(13)*
%wide.load22 = load <8 x i64>, <8 x i64> addrspace(13)* %15, align 8
%16 = getelementptr inbounds i64, i64 addrspace(13)* %12, i64 16
%17 = bitcast i64 addrspace(13)* %16 to <8 x i64> addrspace(13)*
%wide.load23 = load <8 x i64>, <8 x i64> addrspace(13)* %17, align 8
%18 = getelementptr inbounds i64, i64 addrspace(13)* %12, i64 24
%19 = bitcast i64 addrspace(13)* %18 to <8 x i64> addrspace(13)*
%wide.load24 = load <8 x i64>, <8 x i64> addrspace(13)* %19, align 8
%20 = ashr <8 x i64> %wide.load, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
%21 = ashr <8 x i64> %wide.load22, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
%22 = ashr <8 x i64> %wide.load23, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
%23 = ashr <8 x i64> %wide.load24, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
%24 = and <8 x i64> %20, <i64 248, i64 248, i64 248, i64 248, i64 248, i64 248, i64 248, i64 248>
%25 = and <8 x i64> %21, <i64 248, i64 248, i64 248, i64 248, i64 248, i64 248, i64 248, i64 248>
%26 = and <8 x i64> %22, <i64 248, i64 248, i64 248, i64 248, i64 248, i64 248, i64 248, i64 248>
%27 = and <8 x i64> %23, <i64 248, i64 248, i64 248, i64 248, i64 248, i64 248, i64 248, i64 248>
%28 = or <8 x i64> %24, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
%29 = or <8 x i64> %25, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
%30 = or <8 x i64> %26, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
%31 = or <8 x i64> %27, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
%32 = and <8 x i64> %28, %wide.load
%33 = and <8 x i64> %29, %wide.load22
%34 = and <8 x i64> %30, %wide.load23
%35 = and <8 x i64> %31, %wide.load24
%36 = add <8 x i64> %32, %vec.phi
%37 = add <8 x i64> %33, %vec.phi19
%38 = add <8 x i64> %34, %vec.phi20
%39 = add <8 x i64> %35, %vec.phi21
%index.next = add i64 %index, 32
%40 = icmp eq i64 %index.next, %n.vec
br i1 %40, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%bin.rdx = add <8 x i64> %37, %36
%bin.rdx25 = add <8 x i64> %38, %bin.rdx
%bin.rdx26 = add <8 x i64> %39, %bin.rdx25
%rdx.shuf = shufflevector <8 x i64> %bin.rdx26, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx27 = add <8 x i64> %bin.rdx26, %rdx.shuf
%rdx.shuf28 = shufflevector <8 x i64> %bin.rdx27, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx29 = add <8 x i64> %bin.rdx27, %rdx.shuf28
%rdx.shuf30 = shufflevector <8 x i64> %bin.rdx29, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx31 = add <8 x i64> %bin.rdx29, %rdx.shuf30
%41 = extractelement <8 x i64> %bin.rdx31, i32 0
%cmp.n = icmp eq i64 %3, %n.vec
br i1 %cmp.n, label %L63, label %scalar.ph
scalar.ph: ; preds = %middle.block, %L28.lr.ph
%bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %L28.lr.ph ]
%bc.merge.rdx = phi i64 [ %41, %middle.block ], [ 0, %L28.lr.ph ]
br label %L28
L28: ; preds = %scalar.ph, %L28
%value_phi215 = phi i64 [ %bc.resume.val, %scalar.ph ], [ %49, %L28 ]
%value_phi14 = phi i64 [ %bc.merge.rdx, %scalar.ph ], [ %48, %L28 ]
%42 = getelementptr inbounds i64, i64 addrspace(13)* %11, i64 %value_phi215
%43 = load i64, i64 addrspace(13)* %42, align 8
%44 = ashr i64 %43, 63
%45 = and i64 %44, 248
%46 = or i64 %45, 7
%47 = and i64 %46, %43
%48 = add i64 %47, %value_phi14
%49 = add nuw nsw i64 %value_phi215, 1
%50 = icmp ult i64 %49, %7
br i1 %50, label %L28, label %L63
L63: ; preds = %L28, %middle.block, %L21
%value_phi5 = phi i64 [ 0, %L21 ], [ %48, %L28 ], [ %41, %middle.block ]
ret i64 %value_phi5
}
The chunk corresponding to the snippet from your post:
%20 = ashr <8 x i64> %wide.load, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
%21 = ashr <8 x i64> %wide.load22, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
%22 = ashr <8 x i64> %wide.load23, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
%23 = ashr <8 x i64> %wide.load24, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
%24 = and <8 x i64> %20, <i64 248, i64 248, i64 248, i64 248, i64 248, i64 248, i64 248, i64 248>
%25 = and <8 x i64> %21, <i64 248, i64 248, i64 248, i64 248, i64 248, i64 248, i64 248, i64 248>
%26 = and <8 x i64> %22, <i64 248, i64 248, i64 248, i64 248, i64 248, i64 248, i64 248, i64 248>
%27 = and <8 x i64> %23, <i64 248, i64 248, i64 248, i64 248, i64 248, i64 248, i64 248, i64 248>
%28 = or <8 x i64> %24, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
%29 = or <8 x i64> %25, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
%30 = or <8 x i64> %26, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
%31 = or <8 x i64> %27, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
%32 = and <8 x i64> %28, %wide.load
%33 = and <8 x i64> %29, %wide.load22
%34 = and <8 x i64> %30, %wide.load23
%35 = and <8 x i64> %31, %wide.load24
So it’s definitely worth checking out what happens when your code is placed in a loop.
IMO, it’s a lot easier to make code fast if you hold performance in mind from the beginning than it is to optimize code as an afterthought. Like you say, while micro-optimization can be left to a compiler, it definitely helps to know what the compiler is capable of/likely to do, and then set up your data structures and code to encourage it.
`loopold` is the version without `@inbounds @simd`, and `loop` is the version with it; example of using BenchmarkTools:
julia> data = rand(Int, 800);
julia> using BenchmarkTools
julia> @benchmark loopold(f2, $data, 800)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 476.301 ns (0.00% GC)
median time: 479.168 ns (0.00% GC)
mean time: 489.404 ns (0.00% GC)
maximum time: 763.898 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 196
julia> @benchmark loop(f2, $data, 800)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 65.316 ns (0.00% GC)
median time: 65.432 ns (0.00% GC)
mean time: 66.471 ns (0.00% GC)
maximum time: 102.284 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 979