LLVM determined using 2x doubles was more efficient than using 4x doubles
- Around the 8-minute mark.
LLVM is only allowed to reassociate floating-point additions when fast-math flags are set, and reassociation is what it would need in order to accumulate into 4x doubles. Those 2x doubles are just the real-imaginary pairs.
Try defining
# Component-wise addition of two MyComplex values using fast-math adds (Base.FastMath.add_fast).
Base.:+(x::MyComplex, y::MyComplex) = MyComplex(Base.FastMath.add_fast(x.real, y.real), Base.FastMath.add_fast(x.imag, y.imag))
That may give you <4 x double>.
Test:
julia> struct MyComplex  # minimal complex-number type with two Float64 fields
real::Float64  # real part
imag::Float64  # imaginary part
end
# Component-wise addition using fast-math adds, so LLVM is allowed to reassociate the sums.
julia> Base.:+(x::MyComplex, y::MyComplex) = MyComplex(Base.FastMath.add_fast(x.real, y.real), Base.FastMath.add_fast(x.imag, y.imag))
# Component-wise division of a MyComplex by an integer count using fast-math division.
julia> Base.:/(x::MyComplex, N::Int) = MyComplex(Base.FastMath.div_fast(x.real, N), Base.FastMath.div_fast(x.imag, N))
# Mean of an array of MyComplex: sum the elements, then divide by the element count.
julia> average(x::AbstractArray{MyComplex}) = sum(x) / length(x)
average (generic function with 1 method)
julia> x = [MyComplex(rand(),rand()) for _ in 1:100];  # 100 elements with random components; the trailing `;` suppresses the echo
julia> average(x)
MyComplex(0.5322661296777239, 0.49026719877886044)
julia> @code_llvm debuginfo=:none average(x)  # print the LLVM IR generated for this call, without debug-info comments
; LLVM IR printed for average(x). %0 is the sret slot that receives the [2 x double] result;
; %1 is the array argument.
define void @julia_average_884([2 x double]* noalias nocapture sret, %jl_value_t* nonnull align 16 dereferenceable(40)) {
top:
; Load an i64 from word 3 of the array object (presumably an element count - confirm against the
; jl_array_t layout) and branch on whether it is positive. %9 is that value clamped to >= 0.
%2 = alloca %jl_value_t*, i32 4
%3 = alloca [2 x double], align 8
%4 = bitcast %jl_value_t* %1 to %jl_value_t**
%5 = getelementptr inbounds %jl_value_t*, %jl_value_t** %4, i64 3
%6 = bitcast %jl_value_t** %5 to i64*
%7 = load i64, i64* %6, align 8
%8 = icmp sgt i64 %7, 0
%9 = select i1 %8, i64 %7, i64 0
br i1 %8, label %L11, label %L9
L9: ; preds = %top
; Non-positive count: pack four values (three constants and the array %1) into the stack buffer %2
; and call jl_invoke. The call is followed by trap/unreachable, so it is not expected to return
; (presumably it throws the empty-reduction error - confirm).
%10 = getelementptr %jl_value_t*, %jl_value_t** %2, i32 0
store %jl_value_t* inttoptr (i64 139655503420368 to %jl_value_t*), %jl_value_t** %10
%11 = getelementptr %jl_value_t*, %jl_value_t** %2, i32 1
store %jl_value_t* inttoptr (i64 139655522948096 to %jl_value_t*), %jl_value_t** %11
%12 = getelementptr %jl_value_t*, %jl_value_t** %2, i32 2
store %jl_value_t* %1, %jl_value_t** %12
%13 = getelementptr %jl_value_t*, %jl_value_t** %2, i32 3
store %jl_value_t* inttoptr (i64 139655492231712 to %jl_value_t*), %jl_value_t** %13
%14 = call nonnull %jl_value_t* @jl_invoke(%jl_value_t* inttoptr (i64 139655502796576 to %jl_value_t*), %jl_value_t** %2, i32 4, %jl_value_t* inttoptr (i64 139655417829904 to %jl_value_t*))
call void @llvm.trap()
unreachable
L11: ; preds = %top
; Exactly one element?
%15 = icmp eq i64 %9, 1
br i1 %15, label %L13, label %L15
L13: ; preds = %L11
; One element: load it as a <2 x double> and go straight to the division in L47.
%16 = bitcast %jl_value_t* %1 to [2 x double]**
%17 = load [2 x double]*, [2 x double]** %16, align 8
%.elt = getelementptr inbounds [2 x double], [2 x double]* %17, i64 0, i64 0
%18 = bitcast double* %.elt to <2 x double>*
%19 = load <2 x double>, <2 x double>* %18, align 8
br label %L47
L15: ; preds = %L11
; A count above 15 is handled by the out-of-line call in L43; smaller counts are summed inline.
%20 = icmp sgt i64 %9, 15
br i1 %20, label %L43, label %L17
L17: ; preds = %L15
; 2 to 15 elements: add elements 0 and 1 with a single <2 x double> fadd fast; continue with the
; loop only if more than two elements exist.
%21 = bitcast %jl_value_t* %1 to [2 x double]**
%22 = load [2 x double]*, [2 x double]** %21, align 8
%.elt60 = getelementptr inbounds [2 x double], [2 x double]* %22, i64 0, i64 0
%23 = bitcast double* %.elt60 to <2 x double>*
%24 = load <2 x double>, <2 x double>* %23, align 8
%.elt64 = getelementptr inbounds [2 x double], [2 x double]* %22, i64 1, i64 0
%25 = bitcast double* %.elt64 to <2 x double>*
%26 = load <2 x double>, <2 x double>* %25, align 8
%27 = fadd fast <2 x double> %26, %24
%28 = icmp sgt i64 %9, 2
br i1 %28, label %L34.preheader, label %L47
L34.preheader: ; preds = %L17
; %30 = max(count, 3) - 2 is the number of elements still to add; the vector loop is used only
; when %30 >= 32.
; NOTE(review): this block is only reached with a count of at most 15 (see L15), so %30 < 32
; appears to always hold here and the vector loop below looks unreachable at run time.
%29 = icmp ugt i64 %9, 3
%umax = select i1 %29, i64 %9, i64 3
%30 = add i64 %umax, -2
%min.iters.check = icmp ult i64 %30, 32
br i1 %min.iters.check, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %L34.preheader
; Round the trip count down to a multiple of 32 and seed lane 0 of one accumulator per field
; with the partial sum %27: %32 takes element 1 of %27, %34 takes element 0.
%n.mod.vf = urem i64 %30, 32
%n.vec = sub i64 %30, %n.mod.vf
%ind.end = add i64 2, %n.vec
%31 = extractelement <2 x double> %27, i32 1
%32 = insertelement <8 x double> zeroinitializer, double %31, i32 0
%33 = extractelement <2 x double> %27, i32 0
%34 = insertelement <8 x double> zeroinitializer, double %33, i32 0
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
; Vectorized loop, 32 elements per iteration: four <16 x double> loads, each de-interleaved by
; shufflevector into its even lanes (first field of every pair) and odd lanes (second field),
; then accumulated into eight <8 x double> running sums (four per field).
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <8 x double> [ %32, %vector.ph ], [ %55, %vector.body ]
%vec.phi82 = phi <8 x double> [ zeroinitializer, %vector.ph ], [ %56, %vector.body ]
%vec.phi83 = phi <8 x double> [ zeroinitializer, %vector.ph ], [ %57, %vector.body ]
%vec.phi84 = phi <8 x double> [ zeroinitializer, %vector.ph ], [ %58, %vector.body ]
%vec.phi85 = phi <8 x double> [ %34, %vector.ph ], [ %51, %vector.body ]
%vec.phi86 = phi <8 x double> [ zeroinitializer, %vector.ph ], [ %52, %vector.body ]
%vec.phi87 = phi <8 x double> [ zeroinitializer, %vector.ph ], [ %53, %vector.body ]
%vec.phi88 = phi <8 x double> [ zeroinitializer, %vector.ph ], [ %54, %vector.body ]
%offset.idx = add i64 2, %index
%35 = add i64 %offset.idx, 0
%36 = add i64 %offset.idx, 8
%37 = add i64 %offset.idx, 16
%38 = add i64 %offset.idx, 24
%39 = getelementptr inbounds [2 x double], [2 x double]* %22, i64 %35, i64 0
%40 = getelementptr inbounds [2 x double], [2 x double]* %22, i64 %36, i64 0
%41 = getelementptr inbounds [2 x double], [2 x double]* %22, i64 %37, i64 0
%42 = getelementptr inbounds [2 x double], [2 x double]* %22, i64 %38, i64 0
%43 = getelementptr inbounds double, double* %39, i32 0
%44 = bitcast double* %43 to <16 x double>*
%45 = getelementptr inbounds double, double* %40, i32 0
%46 = bitcast double* %45 to <16 x double>*
%47 = getelementptr inbounds double, double* %41, i32 0
%48 = bitcast double* %47 to <16 x double>*
%49 = getelementptr inbounds double, double* %42, i32 0
%50 = bitcast double* %49 to <16 x double>*
%wide.vec = load <16 x double>, <16 x double>* %44, align 8
%wide.vec89 = load <16 x double>, <16 x double>* %46, align 8
%wide.vec90 = load <16 x double>, <16 x double>* %48, align 8
%wide.vec91 = load <16 x double>, <16 x double>* %50, align 8
%strided.vec = shufflevector <16 x double> %wide.vec, <16 x double> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%strided.vec92 = shufflevector <16 x double> %wide.vec89, <16 x double> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%strided.vec93 = shufflevector <16 x double> %wide.vec90, <16 x double> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%strided.vec94 = shufflevector <16 x double> %wide.vec91, <16 x double> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%strided.vec95 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%strided.vec96 = shufflevector <16 x double> %wide.vec89, <16 x double> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%strided.vec97 = shufflevector <16 x double> %wide.vec90, <16 x double> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%strided.vec98 = shufflevector <16 x double> %wide.vec91, <16 x double> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%51 = fadd fast <8 x double> %strided.vec, %vec.phi85
%52 = fadd fast <8 x double> %strided.vec92, %vec.phi86
%53 = fadd fast <8 x double> %strided.vec93, %vec.phi87
%54 = fadd fast <8 x double> %strided.vec94, %vec.phi88
%55 = fadd fast <8 x double> %strided.vec95, %vec.phi
%56 = fadd fast <8 x double> %strided.vec96, %vec.phi82
%57 = fadd fast <8 x double> %strided.vec97, %vec.phi83
%58 = fadd fast <8 x double> %strided.vec98, %vec.phi84
%index.next = add i64 %index, 32
%59 = icmp eq i64 %index.next, %n.vec
br i1 %59, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; Combine the four accumulators of each field, reduce each horizontally to a single scalar
; (%60 for the even-lane sums, %61 for the odd-lane sums) and repack them as a <2 x double>.
%bin.rdx106 = fadd fast <8 x double> %52, %51
%bin.rdx107 = fadd fast <8 x double> %53, %bin.rdx106
%bin.rdx108 = fadd fast <8 x double> %54, %bin.rdx107
%rdx.shuf109 = shufflevector <8 x double> %bin.rdx108, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx110 = fadd fast <8 x double> %bin.rdx108, %rdx.shuf109
%rdx.shuf111 = shufflevector <8 x double> %bin.rdx110, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx112 = fadd fast <8 x double> %bin.rdx110, %rdx.shuf111
%rdx.shuf113 = shufflevector <8 x double> %bin.rdx112, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx114 = fadd fast <8 x double> %bin.rdx112, %rdx.shuf113
%60 = extractelement <8 x double> %bin.rdx114, i32 0
%bin.rdx = fadd fast <8 x double> %56, %55
%bin.rdx99 = fadd fast <8 x double> %57, %bin.rdx
%bin.rdx100 = fadd fast <8 x double> %58, %bin.rdx99
%rdx.shuf = shufflevector <8 x double> %bin.rdx100, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx101 = fadd fast <8 x double> %bin.rdx100, %rdx.shuf
%rdx.shuf102 = shufflevector <8 x double> %bin.rdx101, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx103 = fadd fast <8 x double> %bin.rdx101, %rdx.shuf102
%rdx.shuf104 = shufflevector <8 x double> %bin.rdx103, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx105 = fadd fast <8 x double> %bin.rdx103, %rdx.shuf104
%61 = extractelement <8 x double> %bin.rdx105, i32 0
%cmp.n = icmp eq i64 %30, %n.vec
%62 = insertelement <2 x double> undef, double %60, i32 0
%63 = insertelement <2 x double> %62, double %61, i32 1
br i1 %cmp.n, label %L47, label %scalar.ph
scalar.ph: ; preds = %middle.block, %L34.preheader
; Entry to the scalar loop: start index and running sum come either from the vector loop
; (remainder case) or directly from the preheader.
%bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 2, %L34.preheader ]
%64 = phi <2 x double> [ %27, %L34.preheader ], [ %63, %middle.block ]
br label %L34
L34: ; preds = %L34, %scalar.ph
; Scalar loop: one element (a <2 x double> pair) is loaded and added per iteration.
%value_phi74 = phi i64 [ %66, %L34 ], [ %bc.resume.val, %scalar.ph ]
%65 = phi <2 x double> [ %69, %L34 ], [ %64, %scalar.ph ]
%66 = add nuw nsw i64 %value_phi74, 1
%.elt68 = getelementptr inbounds [2 x double], [2 x double]* %22, i64 %value_phi74, i64 0
%67 = bitcast double* %.elt68 to <2 x double>*
%68 = load <2 x double>, <2 x double>* %67, align 8
%69 = fadd fast <2 x double> %68, %65
%70 = icmp ult i64 %66, %9
br i1 %70, label %L34, label %L47
L43: ; preds = %L15
; Count above 15: call j_mapreduce_impl_885 with the array and the arguments (1, %9, 1024); it
; writes its result into the stack slot %3, which is then loaded. The summation loop for this
; case lives in that callee and is not part of this listing.
call void @j_mapreduce_impl_885([2 x double]* noalias nocapture nonnull sret %3, %jl_value_t* nonnull %1, i64 1, i64 %9, i64 1024)
%.sroa.035.0..sroa_idx39 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 0
%71 = bitcast double* %.sroa.035.0..sroa_idx39 to <2 x double>*
%72 = load <2 x double>, <2 x double>* %71, align 8
br label %L47
L47: ; preds = %L43, %L34, %middle.block, %L17, %L13
; Merge the sum from whichever path ran, load an i64 from field 1 of the jl_array_t (presumably
; the length - confirm), convert it to double, splat it into both lanes, divide with fdiv fast
; and store the result through the sret pointer %0.
%73 = phi <2 x double> [ %72, %L43 ], [ %19, %L13 ], [ %27, %L17 ], [ %63, %middle.block ], [ %69, %L34 ]
%74 = bitcast %jl_value_t* %1 to %jl_array_t*
%75 = getelementptr inbounds %jl_array_t, %jl_array_t* %74, i64 0, i32 1
%76 = load i64, i64* %75, align 8
%77 = sitofp i64 %76 to double
%78 = insertelement <2 x double> undef, double %77, i32 0
%79 = insertelement <2 x double> %78, double %77, i32 1
%80 = fdiv fast <2 x double> %73, %79
%.sroa.029.0..sroa_idx = getelementptr inbounds [2 x double], [2 x double]* %0, i64 0, i64 0
%81 = bitcast double* %.sroa.029.0..sroa_idx to <2 x double>*
store <2 x double> %80, <2 x double>* %81, align 8
ret void
}