Yes, I do think that, when optimizing code, SoA is better much more often than not.
It’s a hammer you can apply whenever your arrays are of length > 4; all you have to do is make sure the compiler is actually figuring it out.
Whenever the number of loop iterations is greater than the number of operations that can happen in parallel within a loop iteration, SoA is probably better.
Here, within a loop iteration, when we add `tmp_x2` and `tmp_y2`, we can't vectorize that, so AoS is pretty much stuck. Versus just using SoA and calculating 4 (or more!) `tmp_x2`s, `tmp_y2`s, and `tmp_xy`s at a time (`[i:i+3]`).
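For concreteness, here's a minimal sketch of the kind of code I have in mind; the actual `normAoS`/`normSoA` and `arrSoA` come from earlier in the thread, so the definitions below are my reconstruction, not the original code:

```julia
# Hypothetical reconstruction of the two layouts and loops (not the thread's exact code).
struct PointAoS            # array-of-structs element: x and y interleaved in memory
    x::Float64
    y::Float64
end

struct PointsSoA           # struct-of-arrays: each coordinate is contiguous
    x::Vector{Float64}
    y::Vector{Float64}
end

function normAoS(arr::Vector{PointAoS})
    out = 0.0
    @inbounds @simd for i in eachindex(arr)
        tmp_x2 = arr[i].x * arr[i].x   # x's and y's are interleaved, so loads aren't contiguous
        tmp_y2 = arr[i].y * arr[i].y
        out += sqrt(tmp_x2 + tmp_y2)
    end
    out
end

function normSoA(arr::PointsSoA)
    out = 0.0
    @inbounds @simd for i in eachindex(arr.x)
        tmp_x2 = arr.x[i] * arr.x[i]   # contiguous loads from each array
        tmp_y2 = arr.y[i] * arr.y[i]
        out += sqrt(tmp_x2 + tmp_y2)
    end
    out
end
```

Something along the lines of `arrSoA = PointsSoA(rand(10^6), rand(10^6))` then feeds the `@code_llvm` call below.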
If a loop iteration gets fancy enough that vectorization might still be possible, suddenly we have to start digging through everything that goes on, make sure we can actually load everything contiguously, and check that there isn't some hidden vectorization bottleneck. Or, make life easy: use SoA so that we know it's possible to vectorize seamlessly, and then just check the `@code_llvm` (yuyichao strongly recommends against `@code_native`; by actually naming temporaries, the LLVM is definitely easier to read – added bonus is that Discourse recognizes llvm as a language!).
A caveat is that maybe you’ll have to worry about register pressure if you’re getting really fancy.
There’s a great video somewhere of Keno talking about what he did with struct of arrays while working on Celeste.
```llvm
julia> @code_llvm optimize=true debuginfo=:none normSoA(arrSoA)
; Function Attrs: uwtable
define double @julia_normSoA_16509(%jl_value_t addrspace(10)* nonnull align 8 dereferenceable(16)) #0 {
top:
%1 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
%2 = bitcast %jl_value_t addrspace(11)* %1 to %jl_value_t addrspace(10)* addrspace(11)*
%3 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(11)* %2, align 8
%4 = addrspacecast %jl_value_t addrspace(10)* %3 to %jl_value_t addrspace(11)*
%5 = bitcast %jl_value_t addrspace(11)* %4 to %jl_array_t addrspace(11)*
%6 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %5, i64 0, i32 1
%7 = load i64, i64 addrspace(11)* %6, align 8
%8 = icmp sgt i64 %7, 0
%9 = select i1 %8, i64 %7, i64 0
%10 = add nsw i64 %9, -1
%11 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %10, i64 1)
%12 = extractvalue { i64, i1 } %11, 1
br i1 %12, label %L18, label %L23
L18: ; preds = %top
call void @julia_throw_overflowerr_binaryop_12453(%jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 69001352 to %jl_value_t*) to %jl_value_t addrspace(10)*), i64 %10, i64 1)
call void @llvm.trap()
unreachable
L23: ; preds = %top
%13 = extractvalue { i64, i1 } %11, 0
%14 = icmp slt i64 %13, 1
br i1 %14, label %L67, label %L30.lr.ph
L30.lr.ph: ; preds = %L23
%15 = addrspacecast %jl_value_t addrspace(10)* %3 to %jl_value_t addrspace(11)*
%16 = bitcast %jl_value_t addrspace(11)* %15 to double addrspace(13)* addrspace(11)*
%17 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %16, align 8
%18 = bitcast %jl_value_t addrspace(11)* %1 to i8 addrspace(11)*
%19 = getelementptr inbounds i8, i8 addrspace(11)* %18, i64 8
%20 = bitcast i8 addrspace(11)* %19 to %jl_value_t addrspace(10)* addrspace(11)*
%21 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(11)* %20, align 8
%22 = addrspacecast %jl_value_t addrspace(10)* %21 to %jl_value_t addrspace(11)*
%23 = bitcast %jl_value_t addrspace(11)* %22 to double addrspace(13)* addrspace(11)*
%24 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %23, align 8
%min.iters.check = icmp ult i64 %9, 16
br i1 %min.iters.check, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %L30.lr.ph
%n.vec = and i64 %9, 9223372036854775792
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x double> [ zeroinitializer, %vector.ph ], [ %57, %vector.body ]
%vec.phi22 = phi <4 x double> [ zeroinitializer, %vector.ph ], [ %58, %vector.body ]
%vec.phi23 = phi <4 x double> [ zeroinitializer, %vector.ph ], [ %59, %vector.body ]
%vec.phi24 = phi <4 x double> [ zeroinitializer, %vector.ph ], [ %60, %vector.body ]
%25 = getelementptr inbounds double, double addrspace(13)* %17, i64 %index
%26 = bitcast double addrspace(13)* %25 to <4 x double> addrspace(13)*
%wide.load = load <4 x double>, <4 x double> addrspace(13)* %26, align 8
%27 = getelementptr double, double addrspace(13)* %25, i64 4
%28 = bitcast double addrspace(13)* %27 to <4 x double> addrspace(13)*
%wide.load25 = load <4 x double>, <4 x double> addrspace(13)* %28, align 8
%29 = getelementptr double, double addrspace(13)* %25, i64 8
%30 = bitcast double addrspace(13)* %29 to <4 x double> addrspace(13)*
%wide.load26 = load <4 x double>, <4 x double> addrspace(13)* %30, align 8
%31 = getelementptr double, double addrspace(13)* %25, i64 12
%32 = bitcast double addrspace(13)* %31 to <4 x double> addrspace(13)*
%wide.load27 = load <4 x double>, <4 x double> addrspace(13)* %32, align 8
%33 = getelementptr inbounds double, double addrspace(13)* %24, i64 %index
%34 = bitcast double addrspace(13)* %33 to <4 x double> addrspace(13)*
%wide.load28 = load <4 x double>, <4 x double> addrspace(13)* %34, align 8
%35 = getelementptr double, double addrspace(13)* %33, i64 4
%36 = bitcast double addrspace(13)* %35 to <4 x double> addrspace(13)*
%wide.load29 = load <4 x double>, <4 x double> addrspace(13)* %36, align 8
%37 = getelementptr double, double addrspace(13)* %33, i64 8
%38 = bitcast double addrspace(13)* %37 to <4 x double> addrspace(13)*
%wide.load30 = load <4 x double>, <4 x double> addrspace(13)* %38, align 8
%39 = getelementptr double, double addrspace(13)* %33, i64 12
%40 = bitcast double addrspace(13)* %39 to <4 x double> addrspace(13)*
%wide.load31 = load <4 x double>, <4 x double> addrspace(13)* %40, align 8
%41 = fmul <4 x double> %wide.load, %wide.load
%42 = fmul <4 x double> %wide.load25, %wide.load25
%43 = fmul <4 x double> %wide.load26, %wide.load26
%44 = fmul <4 x double> %wide.load27, %wide.load27
%45 = fmul <4 x double> %wide.load28, %wide.load28
%46 = fmul <4 x double> %wide.load29, %wide.load29
%47 = fmul <4 x double> %wide.load30, %wide.load30
%48 = fmul <4 x double> %wide.load31, %wide.load31
%49 = fadd <4 x double> %41, %45
%50 = fadd <4 x double> %42, %46
%51 = fadd <4 x double> %43, %47
%52 = fadd <4 x double> %44, %48
%53 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %49)
%54 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %50)
%55 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %51)
%56 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %52)
%57 = fadd fast <4 x double> %vec.phi, %53
%58 = fadd fast <4 x double> %vec.phi22, %54
%59 = fadd fast <4 x double> %vec.phi23, %55
%60 = fadd fast <4 x double> %vec.phi24, %56
%index.next = add i64 %index, 16
%61 = icmp eq i64 %index.next, %n.vec
br i1 %61, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%bin.rdx = fadd fast <4 x double> %58, %57
%bin.rdx32 = fadd fast <4 x double> %59, %bin.rdx
%bin.rdx33 = fadd fast <4 x double> %60, %bin.rdx32
%rdx.shuf = shufflevector <4 x double> %bin.rdx33, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%bin.rdx34 = fadd fast <4 x double> %bin.rdx33, %rdx.shuf
%rdx.shuf35 = shufflevector <4 x double> %bin.rdx34, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx36 = fadd fast <4 x double> %bin.rdx34, %rdx.shuf35
%62 = extractelement <4 x double> %bin.rdx36, i32 0
%cmp.n = icmp eq i64 %9, %n.vec
br i1 %cmp.n, label %L67, label %scalar.ph
scalar.ph: ; preds = %middle.block, %L30.lr.ph
%bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %L30.lr.ph ]
%bc.merge.rdx = phi double [ %62, %middle.block ], [ 0.000000e+00, %L30.lr.ph ]
br label %L30
L30: ; preds = %scalar.ph, %L30
%value_phi218 = phi i64 [ %bc.resume.val, %scalar.ph ], [ %72, %L30 ]
%value_phi17 = phi double [ %bc.merge.rdx, %scalar.ph ], [ %71, %L30 ]
%63 = getelementptr inbounds double, double addrspace(13)* %17, i64 %value_phi218
%64 = load double, double addrspace(13)* %63, align 8
%65 = getelementptr inbounds double, double addrspace(13)* %24, i64 %value_phi218
%66 = load double, double addrspace(13)* %65, align 8
%67 = fmul double %64, %64
%68 = fmul double %66, %66
%69 = fadd double %67, %68
%70 = call double @llvm.sqrt.f64(double %69)
%71 = fadd fast double %value_phi17, %70
%72 = add nuw nsw i64 %value_phi218, 1
%73 = icmp ult i64 %72, %13
br i1 %73, label %L30, label %L67
L67: ; preds = %L30, %middle.block, %L23
%value_phi6 = phi double [ 0.000000e+00, %L23 ], [ %71, %L30 ], [ %62, %middle.block ]
ret double %value_phi6
}
```
The exciting bit:
```llvm
%41 = fmul <4 x double> %wide.load, %wide.load
%42 = fmul <4 x double> %wide.load25, %wide.load25
%43 = fmul <4 x double> %wide.load26, %wide.load26
%44 = fmul <4 x double> %wide.load27, %wide.load27
%45 = fmul <4 x double> %wide.load28, %wide.load28
%46 = fmul <4 x double> %wide.load29, %wide.load29
%47 = fmul <4 x double> %wide.load30, %wide.load30
%48 = fmul <4 x double> %wide.load31, %wide.load31
%49 = fadd <4 x double> %41, %45
%50 = fadd <4 x double> %42, %46
%51 = fadd <4 x double> %43, %47
%52 = fadd <4 x double> %44, %48
%53 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %49)
%54 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %50)
%55 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %51)
%56 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %52)
%57 = fadd fast <4 x double> %vec.phi, %53
%58 = fadd fast <4 x double> %vec.phi22, %54
%59 = fadd fast <4 x double> %vec.phi23, %55
%60 = fadd fast <4 x double> %vec.phi24, %56
```
With all the `<4 x double>`s (SIMD vectors of 4 `Float64`s), you can see that it's doing what kristoffer.carlsson said, except that it unrolled that loop yet again by another factor of 4 (so it's doing 4 copies of what he wrote) to also take advantage of OOO (out-of-order) execution.
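In source-level terms, that extra unrolling amounts to keeping four independent accumulators so the reduction doesn't turn into one long dependency chain. A rough scalar sketch of the idea (my illustration with hypothetical names; the compiler actually does this with `<4 x double>` lanes rather than scalars):

```julia
# Hypothetical scalar analogue of the 4x-unrolled vector loop above.
# Four independent accumulators let out-of-order hardware overlap the
# latency of the sqrt/add chain across iterations.
function normSoA_unrolled(x::Vector{Float64}, y::Vector{Float64})
    acc1 = acc2 = acc3 = acc4 = 0.0
    n = length(x)
    i = 1
    @inbounds while i + 3 <= n
        acc1 += sqrt(x[i]^2   + y[i]^2)
        acc2 += sqrt(x[i+1]^2 + y[i+1]^2)
        acc3 += sqrt(x[i+2]^2 + y[i+2]^2)
        acc4 += sqrt(x[i+3]^2 + y[i+3]^2)
        i += 4
    end
    out = (acc1 + acc2) + (acc3 + acc4)   # combine the partial sums
    @inbounds while i <= n                # handle the leftover tail
        out += sqrt(x[i]^2 + y[i]^2)
        i += 1
    end
    return out
end
```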