Your code works now (and is tested as of LoopVectorization v0.3.6), but on my computer (with AVX-512) it is slower than just adding `@inbounds`:
julia> using BenchmarkTools, LoopVectorization
julia> maxdeg = 20; dim = 10; nbasis = 1_000;
julia> P = rand(dim, maxdeg + 1);
julia> function mvp(P, basis, coeffs::Vector{T}) where {T}
# Baseline kernel: plain nested loops with Julia's default bounds checking.
# Returns the sum over n of coeffs[n] * (product over a of P[a, basis[a, n]]).
# `basis[a, n]` is used as the second index into `P`, so `basis` must hold
# integers that are valid indices for P's second dimension.
len_c = length(coeffs)
len_P = size(P, 1)
p = zero(T)  # accumulator typed like the elements of `coeffs`; returned as-is when `coeffs` is empty
for n = 1:len_c
pn = coeffs[n]  # the running product for term n starts at its coefficient
for a = 1:len_P
pn *= P[a, basis[a, n]]  # indirect lookup: row a, column chosen by basis[a, n]
end
p += pn
end
return p
end
mvp (generic function with 1 method)
julia> function mvpinbounds(P, basis, coeffs::Vector{T}) where {T}
# Same computation as the plain nested-loop version, but the whole loop nest
# is wrapped in `@inbounds`, so array accesses inside it are not bounds-checked.
# Returns the sum over n of coeffs[n] * (product over a of P[a, basis[a, n]]).
# NOTE(review): nothing here validates the sizes of `basis` or the values it
# holds; with `@inbounds` an out-of-range index is not reported as a BoundsError.
len_c = length(coeffs)
len_P = size(P, 1)
p = zero(T)  # accumulator typed like the elements of `coeffs`
@inbounds for n = 1:len_c
pn = coeffs[n]  # the running product for term n starts at its coefficient
for a = 1:len_P
pn *= P[a, basis[a, n]]  # indirect lookup: row a, column chosen by basis[a, n]
end
p += pn
end
return p
end
mvpinbounds (generic function with 1 method)
julia> function mvpavx(P, basis, coeffs::Vector{T}) where {T}
# Same loop nest as the other two variants, but handed to the `@avx` macro
# (LoopVectorization is loaded at the top of this session) instead of
# `@inbounds`, leaving it to the macro to generate the loop code.
# Returns the sum over n of coeffs[n] * (product over a of P[a, basis[a, n]]).
# NOTE(review): `P[a, basis[a, n]]` is an indirect (data-dependent) access;
# the sizes and contents of `basis` are not validated here.
len_c = length(coeffs)
len_P = size(P, 1)
p = zero(T)  # accumulator typed like the elements of `coeffs`
@avx for n = 1:len_c
pn = coeffs[n]  # the running product for term n starts at its coefficient
for a = 1:len_P
pn *= P[a, basis[a, n]]  # indirect lookup: row a, column chosen by basis[a, n]
end
p += pn
end
return p
end
mvpavx (generic function with 1 method)
julia> basis = rand(1:(maxdeg+1), (dim, nbasis));
julia> coeffs = rand(nbasis);
julia> @btime mvp($P, $basis, $coeffs)
9.464 μs (0 allocations: 0 bytes)
0.8416355183275668
julia> @btime mvpinbounds($P, $basis, $coeffs)
5.322 μs (0 allocations: 0 bytes)
0.8416355183275668
julia> @btime mvpavx($P, $basis, $coeffs)
7.692 μs (0 allocations: 0 bytes)
0.8416355183275668
julia> nbasis = 10_000;
julia> basis = rand(1:(maxdeg+1), (dim, nbasis));
julia> coeffs = rand(nbasis);
julia> @btime mvp($P, $basis, $coeffs)
97.394 μs (0 allocations: 0 bytes)
9.049892658664312
julia> @btime mvpinbounds($P, $basis, $coeffs)
55.024 μs (0 allocations: 0 bytes)
9.049892658664312
julia> @btime mvpavx($P, $basis, $coeffs)
71.441 μs (0 allocations: 0 bytes)
9.049892658664312
You can also confirm with `@code_native` or `@code_llvm` that the `@inbounds` version is not vectorized, while the `@avx` version is (but is slower anyway):
julia> @code_llvm debuginfo=:none mvpinbounds(P, basis, coeffs)
define double @julia_mvpinbounds_18548(%jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40), %jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40), %jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40)) {
top:
%3 = addrspacecast %jl_value_t addrspace(10)* %2 to %jl_value_t addrspace(11)*
%4 = bitcast %jl_value_t addrspace(11)* %3 to %jl_array_t addrspace(11)*
%5 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %4, i64 0, i32 1
%6 = load i64, i64 addrspace(11)* %5, align 8
%7 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
%8 = bitcast %jl_value_t addrspace(11)* %7 to %jl_value_t addrspace(10)* addrspace(11)*
%9 = getelementptr inbounds %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(11)* %8, i64 3
%10 = bitcast %jl_value_t addrspace(10)* addrspace(11)* %9 to i64 addrspace(11)*
%11 = load i64, i64 addrspace(11)* %10, align 8
%12 = icmp sgt i64 %6, 0
%13 = select i1 %12, i64 %6, i64 0
br i1 %12, label %L14.preheader, label %L59
L14.preheader: ; preds = %top
%14 = bitcast %jl_value_t addrspace(11)* %3 to double addrspace(13)* addrspace(11)*
%15 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %14, align 8
%16 = icmp sgt i64 %11, 0
%17 = select i1 %16, i64 %11, i64 0
%18 = addrspacecast %jl_value_t addrspace(10)* %1 to %jl_value_t addrspace(11)*
%19 = bitcast %jl_value_t addrspace(11)* %18 to %jl_value_t addrspace(10)* addrspace(11)*
%20 = getelementptr inbounds %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(11)* %19, i64 3
%21 = bitcast %jl_value_t addrspace(10)* addrspace(11)* %20 to i64 addrspace(11)*
%22 = load i64, i64 addrspace(11)* %21, align 8
%23 = bitcast %jl_value_t addrspace(11)* %18 to i64 addrspace(13)* addrspace(11)*
%24 = load i64 addrspace(13)*, i64 addrspace(13)* addrspace(11)* %23, align 8
%25 = bitcast %jl_value_t addrspace(11)* %7 to double addrspace(13)* addrspace(11)*
%26 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %25, align 8
br i1 %16, label %L29.preheader.us, label %L46
L29.preheader.us: ; preds = %L14.preheader, %L46.us
%value_phi3.us = phi double [ %43, %L46.us ], [ 0.000000e+00, %L14.preheader ]
%value_phi4.us = phi i64 [ %45, %L46.us ], [ 1, %L14.preheader ]
%27 = add nsw i64 %value_phi4.us, -1
%28 = getelementptr inbounds double, double addrspace(13)* %15, i64 %27
%29 = load double, double addrspace(13)* %28, align 8
%30 = mul i64 %22, %27
br label %L29.us
L29.us: ; preds = %L29.us, %L29.preheader.us
%value_phi9.us = phi double [ %40, %L29.us ], [ %29, %L29.preheader.us ]
%value_phi10.us = phi i64 [ %42, %L29.us ], [ 1, %L29.preheader.us ]
%31 = add nsw i64 %value_phi10.us, -1
%32 = add i64 %31, %30
%33 = getelementptr inbounds i64, i64 addrspace(13)* %24, i64 %32
%34 = load i64, i64 addrspace(13)* %33, align 8
%35 = add i64 %34, -1
%36 = mul i64 %35, %11
%37 = add i64 %31, %36
%38 = getelementptr inbounds double, double addrspace(13)* %26, i64 %37
%39 = load double, double addrspace(13)* %38, align 8
%40 = fmul double %value_phi9.us, %39
%41 = icmp eq i64 %value_phi10.us, %17
%42 = add nuw i64 %value_phi10.us, 1
br i1 %41, label %L46.us, label %L29.us
L46.us: ; preds = %L29.us
%43 = fadd double %value_phi3.us, %40
%44 = icmp eq i64 %value_phi4.us, %13
%45 = add nuw i64 %value_phi4.us, 1
br i1 %44, label %L59, label %L29.preheader.us
L46: ; preds = %L14.preheader, %L46
%value_phi3 = phi double [ %49, %L46 ], [ 0.000000e+00, %L14.preheader ]
%value_phi4 = phi i64 [ %51, %L46 ], [ 1, %L14.preheader ]
%46 = add nsw i64 %value_phi4, -1
%47 = getelementptr inbounds double, double addrspace(13)* %15, i64 %46
%48 = load double, double addrspace(13)* %47, align 8
%49 = fadd double %value_phi3, %48
%50 = icmp eq i64 %value_phi4, %13
%51 = add nuw i64 %value_phi4, 1
br i1 %50, label %L59, label %L46
L59: ; preds = %L46, %L46.us, %top
%value_phi19 = phi double [ 0.000000e+00, %top ], [ %43, %L46.us ], [ %49, %L46 ]
ret double %value_phi19
}
vs
;; julia> @code_llvm debuginfo=:none mvpavx(P, basis, coeffs)
define double @julia_mvpavx_18517(%jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40), %jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40), %jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40)) {
top:
%3 = addrspacecast %jl_value_t addrspace(10)* %2 to %jl_value_t addrspace(11)*
%4 = bitcast %jl_value_t addrspace(11)* %3 to %jl_array_t addrspace(11)*
%5 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %4, i64 0, i32 1
%6 = load i64, i64 addrspace(11)* %5, align 8
%7 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
%8 = bitcast %jl_value_t addrspace(11)* %7 to %jl_value_t addrspace(10)* addrspace(11)*
%9 = getelementptr inbounds %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(11)* %8, i64 3
%10 = bitcast %jl_value_t addrspace(10)* addrspace(11)* %9 to i64 addrspace(11)*
%11 = load i64, i64 addrspace(11)* %10, align 8
%12 = addrspacecast %jl_value_t addrspace(11)* %3 to %jl_value_t*
%13 = bitcast %jl_value_t* %12 to i64*
%14 = load i64, i64* %13, align 8
%15 = addrspacecast %jl_value_t addrspace(10)* %1 to %jl_value_t addrspace(11)*
%16 = addrspacecast %jl_value_t addrspace(11)* %15 to %jl_value_t*
%17 = bitcast %jl_value_t* %16 to i64*
%18 = load i64, i64* %17, align 8
%19 = bitcast %jl_value_t addrspace(11)* %15 to %jl_value_t addrspace(10)* addrspace(11)*
%20 = getelementptr inbounds %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(11)* %19, i64 3
%21 = bitcast %jl_value_t addrspace(10)* addrspace(11)* %20 to i64 addrspace(11)*
%22 = load i64, i64 addrspace(11)* %21, align 8
%23 = addrspacecast %jl_value_t addrspace(11)* %7 to %jl_value_t*
%24 = bitcast %jl_value_t* %23 to i64*
%25 = load i64, i64* %24, align 8
%26 = trunc i64 %11 to i8
%27 = and i8 %26, 7
%notmask = shl nsw i8 -1, %27
%28 = xor i8 %notmask, -1
%29 = icmp sgt i64 %6, 0
br i1 %29, label %L28.lr.ph, label %L179
L28.lr.ph: ; preds = %top
%30 = add i64 %11, -7
%31 = icmp sgt i64 %30, 0
%ptr.i = inttoptr i64 %18 to i64*
%ie.i241 = insertelement <8 x i64> undef, i64 %11, i32 0
%v.i242 = shufflevector <8 x i64> %ie.i241, <8 x i64> undef, <8 x i32> zeroinitializer
%ptr.i233 = inttoptr i64 %25 to double*
%ptr.i194282 = inttoptr i64 %14 to double*
%mask.i222 = bitcast i8 %28 to <8 x i1>
br i1 %31, label %L44.lr.ph.us, label %L93
L44.lr.ph.us: ; preds = %L28.lr.ph, %L175.us
%value_phi1361.us = phi double [ %40, %L175.us ], [ 0.000000e+00, %L28.lr.ph ]
%value_phi355.us = phi i64 [ %41, %L175.us ], [ 0, %L28.lr.ph ]
%.sroa.0117.0354.us = phi <8 x double> [ %.sroa.0113.0.us, %L175.us ], [ undef, %L28.lr.ph ]
%.sroa.0117.0.vec.insert.us = insertelement <8 x double> %.sroa.0117.0354.us, double 1.000000e+00, i32 0
%32 = mul i64 %value_phi355.us, %22
br label %L44.us
L44.us: ; preds = %L86.us, %L44.lr.ph.us
%tindex_phi348.us = phi i8 [ 1, %L44.lr.ph.us ], [ 2, %L86.us ]
%value_phi9347.us = phi <8 x double> [ undef, %L44.lr.ph.us ], [ %.sroa.0113.0.us, %L86.us ]
%value_phi2346.us = phi i64 [ 0, %L44.lr.ph.us ], [ %34, %L86.us ]
%.sroa.0117.1345.us = phi <8 x double> [ %.sroa.0117.0.vec.insert.us, %L44.lr.ph.us ], [ %.sroa.0113.0.us, %L86.us ]
%33 = add i64 %value_phi2346.us, %32
%offsetptr.i.us = getelementptr inbounds i64, i64* %ptr.i, i64 %33
%ptr.i244.us = bitcast i64* %offsetptr.i.us to <8 x i64>*
%res.i245.us = load <8 x i64>, <8 x i64>* %ptr.i244.us, align 8
%res.i243.us = add nsw <8 x i64> %res.i245.us, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
%res.i240.us = mul nsw <8 x i64> %res.i243.us, %v.i242
%ie.i238.us = insertelement <8 x i64> undef, i64 %value_phi2346.us, i32 0
%v.i239.us = shufflevector <8 x i64> %ie.i238.us, <8 x i64> undef, <8 x i32> zeroinitializer
%res.i237.us = add nsw <8 x i64> %v.i239.us, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%res.i236.us = add nsw <8 x i64> %res.i237.us, %res.i240.us
%offsetptr.i234.us = getelementptr inbounds double, double* %ptr.i233, <8 x i64> %res.i236.us
%res.i232.us = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %offsetptr.i234.us, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x double> undef)
switch i8 %tindex_phi348.us, label %L84 [
i8 2, label %L86.us
i8 1, label %L77.us
]
L77.us: ; preds = %L44.us
%v.i229.us = shufflevector <8 x double> %.sroa.0117.1345.us, <8 x double> undef, <8 x i32> zeroinitializer
br label %L86.us
L86.us: ; preds = %L77.us, %L44.us
%v.i229.pn.us = phi <8 x double> [ %v.i229.us, %L77.us ], [ %value_phi9347.us, %L44.us ]
%.sroa.0113.0.us = fmul reassoc nnan ninf nsz arcp contract <8 x double> %res.i232.us, %v.i229.pn.us
%34 = add i64 %value_phi2346.us, 8
%35 = icmp slt i64 %34, %30
br i1 %35, label %L44.us, label %L93.us
L93.us: ; preds = %L86.us
%36 = icmp slt i64 %34, %11
br i1 %36, label %L139.us, label %L175.us
L139.us: ; preds = %L93.us
%ie.i215.us = insertelement <8 x i64> undef, i64 %34, i32 0
%v.i216.us = shufflevector <8 x i64> %ie.i215.us, <8 x i64> undef, <8 x i32> zeroinitializer
%res.i214.us = add nsw <8 x i64> %v.i216.us, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%37 = mul i64 %value_phi355.us, %22
%38 = add i64 %34, %37
%offsetptr.i225.us = getelementptr inbounds i64, i64* %ptr.i, i64 %38
%ptr.i221.us = bitcast i64* %offsetptr.i225.us to <8 x i64>*
%res.i223.us = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* nonnull %ptr.i221.us, i32 8, <8 x i1> %mask.i222, <8 x i64> zeroinitializer)
%res.i220.us = add nsw <8 x i64> %res.i223.us, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
%res.i217.us = mul nsw <8 x i64> %res.i220.us, %v.i242
%res.i213.us = add nsw <8 x i64> %res.i214.us, %res.i217.us
%offsetptr.i211.us = getelementptr inbounds double, double* %ptr.i233, <8 x i64> %res.i213.us
%res.i209.us = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %offsetptr.i211.us, i32 8, <8 x i1> %mask.i222, <8 x double> zeroinitializer)
%value_phi17266.us = fmul reassoc nnan ninf nsz arcp contract <8 x double> %.sroa.0113.0.us, %res.i209.us
%res.i202.us = select <8 x i1> %mask.i222, <8 x double> %value_phi17266.us, <8 x double> %.sroa.0113.0.us
br label %L175.us
L175.us: ; preds = %L93.us, %L139.us
%value_phi20.ph.us = phi <8 x double> [ %res.i202.us, %L139.us ], [ %.sroa.0113.0.us, %L93.us ]
%offsetptr.i195272.us = getelementptr inbounds double, double* %ptr.i194282, i64 %value_phi355.us
%res.i193273.us = load double, double* %offsetptr.i195272.us, align 1
%vec_4_1.i182.us = shufflevector <8 x double> %value_phi20.ph.us, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%vec_4_2.i183.us = shufflevector <8 x double> %value_phi20.ph.us, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vec_4.i184.us = fmul <4 x double> %vec_4_1.i182.us, %vec_4_2.i183.us
%vec_2_1.i185.us = shufflevector <4 x double> %vec_4.i184.us, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%vec_2_2.i186.us = shufflevector <4 x double> %vec_4.i184.us, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%vec_2.i187.us = fmul <2 x double> %vec_2_1.i185.us, %vec_2_2.i186.us
%vec_1_1.i188.us = shufflevector <2 x double> %vec_2.i187.us, <2 x double> undef, <1 x i32> zeroinitializer
%vec_1_2.i189.us = shufflevector <2 x double> %vec_2.i187.us, <2 x double> undef, <1 x i32> <i32 1>
%vec_1.i190.us = fmul <1 x double> %vec_1_1.i188.us, %vec_1_2.i189.us
%res.i191.us = extractelement <1 x double> %vec_1.i190.us, i32 0
%39 = fmul fast double %res.i191.us, %res.i193273.us
%40 = fadd double %value_phi1361.us, %39
%41 = add nuw nsw i64 %value_phi355.us, 1
%exitcond369 = icmp eq i64 %41, %6
br i1 %exitcond369, label %L331, label %L44.lr.ph.us
L84: ; preds = %L44.us
call void @jl_throw(%jl_value_t addrspace(12)* addrspacecast (%jl_value_t* inttoptr (i64 140677114703680 to %jl_value_t*) to %jl_value_t addrspace(12)*))
unreachable
L93: ; preds = %L28.lr.ph, %L175
%value_phi1361 = phi double [ %45, %L175 ], [ 0.000000e+00, %L28.lr.ph ]
%value_phi355 = phi i64 [ %46, %L175 ], [ 0, %L28.lr.ph ]
%.sroa.0117.0354 = phi <8 x double> [ %.sroa.0117.0.vec.insert, %L175 ], [ undef, %L28.lr.ph ]
%.sroa.097.0353 = phi <8 x double> [ %.sroa.097.1274, %L175 ], [ undef, %L28.lr.ph ]
%.sroa.0117.0.vec.insert = insertelement <8 x double> %.sroa.0117.0354, double 1.000000e+00, i32 0
%42 = icmp sgt i64 %11, 0
br i1 %42, label %L164, label %L170
L164: ; preds = %L93
%43 = mul i64 %value_phi355, %22
%offsetptr.i225 = getelementptr inbounds i64, i64* %ptr.i, i64 %43
%ptr.i221 = bitcast i64* %offsetptr.i225 to <8 x i64>*
%res.i223 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* nonnull %ptr.i221, i32 8, <8 x i1> %mask.i222, <8 x i64> zeroinitializer)
%res.i220 = add nsw <8 x i64> %res.i223, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
%res.i217 = mul nsw <8 x i64> %res.i220, %v.i242
%res.i213 = add nsw <8 x i64> %res.i217, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%offsetptr.i211 = getelementptr inbounds double, double* %ptr.i233, <8 x i64> %res.i213
%res.i209 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %offsetptr.i211, i32 8, <8 x i1> %mask.i222, <8 x double> zeroinitializer)
%v.i205 = shufflevector <8 x double> %.sroa.0117.0.vec.insert, <8 x double> undef, <8 x i32> zeroinitializer
%value_phi17 = fmul reassoc nnan ninf nsz arcp contract <8 x double> %v.i205, %res.i209
%res.i198 = select <8 x i1> %mask.i222, <8 x double> %value_phi17, <8 x double> %v.i205
%offsetptr.i195272 = getelementptr inbounds double, double* %ptr.i194282, i64 %value_phi355
%res.i193273 = load double, double* %offsetptr.i195272, align 1
%vec_4_1.i182 = shufflevector <8 x double> %res.i198, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%vec_4_2.i183 = shufflevector <8 x double> %res.i198, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vec_4.i184 = fmul <4 x double> %vec_4_1.i182, %vec_4_2.i183
%vec_2_1.i185 = shufflevector <4 x double> %vec_4.i184, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%vec_2_2.i186 = shufflevector <4 x double> %vec_4.i184, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%vec_2.i187 = fmul <2 x double> %vec_2_1.i185, %vec_2_2.i186
%vec_1_1.i188 = shufflevector <2 x double> %vec_2.i187, <2 x double> undef, <1 x i32> zeroinitializer
%vec_1_2.i189 = shufflevector <2 x double> %vec_2.i187, <2 x double> undef, <1 x i32> <i32 1>
%vec_1.i190 = fmul <1 x double> %vec_1_1.i188, %vec_1_2.i189
%res.i191 = extractelement <1 x double> %vec_1.i190, i32 0
%44 = fmul fast double %res.i191, %res.i193273
br label %L175
L170: ; preds = %L93
%.sroa.097.0.vec.insert103 = insertelement <8 x double> %.sroa.097.0353, double 1.000000e+00, i32 0
%offsetptr.i195283 = getelementptr inbounds double, double* %ptr.i194282, i64 %value_phi355
%res.i193284 = load double, double* %offsetptr.i195283, align 1
br label %L175
L175: ; preds = %L170, %L164
%.sroa.097.1274 = phi <8 x double> [ %res.i198, %L164 ], [ %.sroa.097.0.vec.insert103, %L170 ]
%value_phi23 = phi double [ %44, %L164 ], [ %res.i193284, %L170 ]
%45 = fadd double %value_phi1361, %value_phi23
%46 = add nuw nsw i64 %value_phi355, 1
%exitcond = icmp eq i64 %46, %6
br i1 %exitcond, label %L331, label %L93
L179: ; preds = %top
%47 = icmp eq i64 %6, 0
br i1 %47, label %L331, label %L183.preheader
L183.preheader: ; preds = %L179
%48 = add i64 %11, -7
%49 = icmp sgt i64 %48, 0
br i1 %49, label %L198.lr.ph, label %L247
L198.lr.ph: ; preds = %L183.preheader
%ptr.i179 = inttoptr i64 %18 to i64*
%ie.i174 = insertelement <8 x i64> undef, i64 %11, i32 0
%v.i175 = shufflevector <8 x i64> %ie.i174, <8 x i64> undef, <8 x i32> zeroinitializer
%ptr.i166 = inttoptr i64 %25 to double*
br label %L198
L198: ; preds = %L198.lr.ph, %L240
%tindex_phi41341 = phi i8 [ 1, %L198.lr.ph ], [ 2, %L240 ]
%value_phi37340 = phi <8 x double> [ undef, %L198.lr.ph ], [ %.sroa.085.0, %L240 ]
%value_phi30339 = phi i64 [ 0, %L198.lr.ph ], [ %50, %L240 ]
%.sroa.089.0338 = phi <8 x double> [ <double 1.000000e+00, double undef, double undef, double undef, double undef, double undef, double undef, double undef>, %L198.lr.ph ], [ %.sroa.085.0, %L240 ]
%offsetptr.i180 = getelementptr inbounds i64, i64* %ptr.i179, i64 %value_phi30339
%ptr.i177 = bitcast i64* %offsetptr.i180 to <8 x i64>*
%res.i178 = load <8 x i64>, <8 x i64>* %ptr.i177, align 8
%res.i176 = add nsw <8 x i64> %res.i178, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
%res.i173 = mul nsw <8 x i64> %res.i176, %v.i175
%ie.i171 = insertelement <8 x i64> undef, i64 %value_phi30339, i32 0
%v.i172 = shufflevector <8 x i64> %ie.i171, <8 x i64> undef, <8 x i32> zeroinitializer
%res.i170 = add nsw <8 x i64> %v.i172, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%res.i169 = add nsw <8 x i64> %res.i170, %res.i173
%offsetptr.i167 = getelementptr inbounds double, double* %ptr.i166, <8 x i64> %res.i169
%res.i165 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %offsetptr.i167, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x double> undef)
switch i8 %tindex_phi41341, label %L238 [
i8 2, label %L240
i8 1, label %L231
]
L231: ; preds = %L198
%v.i162 = shufflevector <8 x double> %.sroa.089.0338, <8 x double> undef, <8 x i32> zeroinitializer
br label %L240
L238: ; preds = %L198
call void @jl_throw(%jl_value_t addrspace(12)* addrspacecast (%jl_value_t* inttoptr (i64 140677114703680 to %jl_value_t*) to %jl_value_t addrspace(12)*))
unreachable
L240: ; preds = %L198, %L231
%v.i162.pn = phi <8 x double> [ %v.i162, %L231 ], [ %value_phi37340, %L198 ]
%.sroa.085.0 = fmul reassoc nnan ninf nsz arcp contract <8 x double> %res.i165, %v.i162.pn
%50 = add i64 %value_phi30339, 8
%51 = icmp slt i64 %50, %48
br i1 %51, label %L198, label %L247
L247: ; preds = %L240, %L183.preheader
%.sroa.089.0.lcssa = phi <8 x double> [ <double 1.000000e+00, double undef, double undef, double undef, double undef, double undef, double undef, double undef>, %L183.preheader ], [ %.sroa.085.0, %L240 ]
%value_phi30.lcssa = phi i64 [ 0, %L183.preheader ], [ %50, %L240 ]
%value_phi37.lcssa = phi <8 x double> [ undef, %L183.preheader ], [ %.sroa.085.0, %L240 ]
%tindex_phi41.lcssa = phi i8 [ 1, %L183.preheader ], [ 2, %L240 ]
%52 = icmp slt i64 %value_phi30.lcssa, %11
br i1 %52, label %L249, label %L247.L310_crit_edge
L247.L310_crit_edge: ; preds = %L247
switch i8 %tindex_phi41.lcssa, label %L310 [
i8 1, label %L322.thread
i8 2, label %L310.thread
]
L249: ; preds = %L247
%ptr.i157 = inttoptr i64 %18 to i64*
%offsetptr.i158 = getelementptr inbounds i64, i64* %ptr.i157, i64 %value_phi30.lcssa
%ptr.i154 = bitcast i64* %offsetptr.i158 to <8 x i64>*
%mask.i155 = bitcast i8 %28 to <8 x i1>
%res.i156 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* nonnull %ptr.i154, i32 8, <8 x i1> %mask.i155, <8 x i64> zeroinitializer)
%res.i153 = add nsw <8 x i64> %res.i156, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
%ie.i151 = insertelement <8 x i64> undef, i64 %11, i32 0
%v.i152 = shufflevector <8 x i64> %ie.i151, <8 x i64> undef, <8 x i32> zeroinitializer
%res.i150 = mul nsw <8 x i64> %res.i153, %v.i152
%ie.i148 = insertelement <8 x i64> undef, i64 %value_phi30.lcssa, i32 0
%v.i149 = shufflevector <8 x i64> %ie.i148, <8 x i64> undef, <8 x i32> zeroinitializer
%res.i147 = add nsw <8 x i64> %v.i149, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%res.i146 = add nsw <8 x i64> %res.i147, %res.i150
%ptr.i143 = inttoptr i64 %25 to double*
%offsetptr.i144 = getelementptr inbounds double, double* %ptr.i143, <8 x i64> %res.i146
%res.i142 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %offsetptr.i144, i32 8, <8 x i1> %mask.i155, <8 x double> zeroinitializer)
switch i8 %tindex_phi41.lcssa, label %L287 [
i8 2, label %L293
i8 1, label %L299
]
L287: ; preds = %L249
call void @jl_throw(%jl_value_t addrspace(12)* addrspacecast (%jl_value_t* inttoptr (i64 140677114703680 to %jl_value_t*) to %jl_value_t addrspace(12)*))
unreachable
L293: ; preds = %L249
%value_phi49294 = fmul reassoc nnan ninf nsz arcp contract <8 x double> %value_phi37.lcssa, %res.i142
%res.i136 = select <8 x i1> %mask.i155, <8 x double> %value_phi49294, <8 x double> %value_phi37.lcssa
br label %L310.thread
L299: ; preds = %L249
%v.i139 = shufflevector <8 x double> %.sroa.089.0.lcssa, <8 x double> undef, <8 x i32> zeroinitializer
%value_phi49 = fmul reassoc nnan ninf nsz arcp contract <8 x double> %v.i139, %res.i142
%res.i134 = select <8 x i1> %mask.i155, <8 x double> %value_phi49, <8 x double> %v.i139
br label %L310.thread
L310.thread: ; preds = %L299, %L293, %L247.L310_crit_edge
%value_phi52.ph = phi <8 x double> [ %res.i136, %L293 ], [ %res.i134, %L299 ], [ %value_phi37.lcssa, %L247.L310_crit_edge ]
%ptr.i131299 = inttoptr i64 %14 to double*
%res.i130301 = load double, double* %ptr.i131299, align 1
%vec_4_1.i = shufflevector <8 x double> %value_phi52.ph, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%vec_4_2.i = shufflevector <8 x double> %value_phi52.ph, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vec_4.i = fmul <4 x double> %vec_4_1.i, %vec_4_2.i
%vec_2_1.i = shufflevector <4 x double> %vec_4.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%vec_2_2.i = shufflevector <4 x double> %vec_4.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%vec_2.i = fmul <2 x double> %vec_2_1.i, %vec_2_2.i
%vec_1_1.i = shufflevector <2 x double> %vec_2.i, <2 x double> undef, <1 x i32> zeroinitializer
%vec_1_2.i = shufflevector <2 x double> %vec_2.i, <2 x double> undef, <1 x i32> <i32 1>
%vec_1.i = fmul <1 x double> %vec_1_1.i, %vec_1_2.i
%res.i = extractelement <1 x double> %vec_1.i, i32 0
%53 = fmul fast double %res.i, %res.i130301
br label %L329
L310: ; preds = %L247.L310_crit_edge
call void @jl_throw(%jl_value_t addrspace(12)* addrspacecast (%jl_value_t* inttoptr (i64 140677114703680 to %jl_value_t*) to %jl_value_t addrspace(12)*))
unreachable
L329: ; preds = %L322.thread, %L310.thread
%value_phi55 = phi double [ %53, %L310.thread ], [ %56, %L322.thread ]
%54 = fadd double %value_phi55, 0.000000e+00
br label %L331
L331: ; preds = %L175, %L175.us, %L179, %L329
%value_phi56 = phi double [ %54, %L329 ], [ 0.000000e+00, %L179 ], [ %40, %L175.us ], [ %45, %L175 ]
ret double %value_phi56
L322.thread: ; preds = %L247.L310_crit_edge
%ptr.i131308 = inttoptr i64 %14 to double*
%res.i130310 = load double, double* %ptr.i131308, align 1
%55 = extractelement <8 x double> %.sroa.089.0.lcssa, i32 0
%56 = fmul fast double %res.i130310, %55
br label %L329
}
Specifically, look for the vectorized loop body (the `<8 x double>` gather and multiply) in the `@avx` output:
L44.us: ; preds = %L86.us, %L44.lr.ph.us
%tindex_phi348.us = phi i8 [ 1, %L44.lr.ph.us ], [ 2, %L86.us ]
%value_phi9347.us = phi <8 x double> [ undef, %L44.lr.ph.us ], [ %.sroa.0113.0.us, %L86.us ]
%value_phi2346.us = phi i64 [ 0, %L44.lr.ph.us ], [ %34, %L86.us ]
%.sroa.0117.1345.us = phi <8 x double> [ %.sroa.0117.0.vec.insert.us, %L44.lr.ph.us ], [ %.sroa.0113.0.us, %L86.us ]
%33 = add i64 %value_phi2346.us, %32
%offsetptr.i.us = getelementptr inbounds i64, i64* %ptr.i, i64 %33
%ptr.i244.us = bitcast i64* %offsetptr.i.us to <8 x i64>*
%res.i245.us = load <8 x i64>, <8 x i64>* %ptr.i244.us, align 8
%res.i243.us = add nsw <8 x i64> %res.i245.us, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
%res.i240.us = mul nsw <8 x i64> %res.i243.us, %v.i242
%ie.i238.us = insertelement <8 x i64> undef, i64 %value_phi2346.us, i32 0
%v.i239.us = shufflevector <8 x i64> %ie.i238.us, <8 x i64> undef, <8 x i32> zeroinitializer
%res.i237.us = add nsw <8 x i64> %v.i239.us, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%res.i236.us = add nsw <8 x i64> %res.i237.us, %res.i240.us
%offsetptr.i234.us = getelementptr inbounds double, double* %ptr.i233, <8 x i64> %res.i236.us
%res.i232.us = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %offsetptr.i234.us, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x double> undef)
switch i8 %tindex_phi348.us, label %L84 [
i8 2, label %L86.us
i8 1, label %L77.us
]
L77.us: ; preds = %L44.us
%v.i229.us = shufflevector <8 x double> %.sroa.0117.1345.us, <8 x double> undef, <8 x i32> zeroinitializer
br label %L86.us
L86.us: ; preds = %L77.us, %L44.us
%v.i229.pn.us = phi <8 x double> [ %v.i229.us, %L77.us ], [ %value_phi9347.us, %L44.us ]
%.sroa.0113.0.us = fmul reassoc nnan ninf nsz arcp contract <8 x double> %res.i232.us, %v.i229.pn.us
%34 = add i64 %value_phi2346.us, 8
%35 = icmp slt i64 %34, %30
br i1 %35, label %L44.us, label %L93.us