Here is the full @code_llvm
for x=rand(Float32, 800)
on Julia 1.9.0-rc1
function test(x)
N = 8
c = Vec{N, Float32}(0)
lane = VecRange{N}(0)
@inbounds @fastmath for i in 1:N:length(x)
c += x[lane + i]
end
return sum(c)
end
; @ h:\Code\Renderer\simdVload.jl:3 within `test`
; Function Attrs: uwtable
define float @julia_test_3492({}* noundef nonnull align 16 dereferenceable(40) %0) #0 {
top:
; @ h:\Code\Renderer\simdVload.jl:9 within `test`
; β @ essentials.jl:10 within `length`
%1 = bitcast {}* %0 to { i8*, i64, i16, i16, i32 }*
%2 = getelementptr inbounds { i8*, i64, i16, i16, i32 }, { i8*, i64, i16, i16, i32 }* %1, i64 0, i32 1
%3 = load i64, i64* %2, align 8
; β
; β @ range.jl:22 within `Colon`
; ββ @ range.jl:24 within `_colon`
; βββ @ range.jl:373 within `StepRange` @ range.jl:320
%4 = call i64 @j_steprange_last_3495(i64 signext 1, i64 signext 8, i64 signext %3) #0
; βββ
; β @ range.jl:887 within `iterate`
; ββ @ range.jl:659 within `isempty`
; βββ @ bool.jl:38 within `&`
%5 = icmp slt i64 %4, 1
; βββ
br i1 %5, label %L84, label %L18.preheader
L18.preheader: ; preds = %top
%6 = bitcast {}* %0 to i8**
%7 = load i8*, i8** %6, align 8
; @ h:\Code\Renderer\simdVload.jl:11 within `test`
br label %L18
L18: ; preds = %L18, %L18.preheader
%value_phi3 = phi i64 [ %12, %L18 ], [ 1, %L18.preheader ]
%value_phi5 = phi <8 x float> [ %11, %L18 ], [ zeroinitializer, %L18.preheader ]
; @ h:\Code\Renderer\simdVload.jl:10 within `test`
; β @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\arrayops.jl:302 within `getindex`
; ββ @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\arrayops.jl:286 within `_pointer`
; βββ @ abstractarray.jl:1243 within `pointer`
; ββββ @ abstractarray.jl:1247 within `_memory_offset`
; βββββ @ int.jl:88 within `*`
%8 = shl i64 %value_phi3, 2
%9 = add nsw i64 %8, -4
; βββββ
; ββββ @ pointer.jl:167 within `+`
%10 = getelementptr i8, i8* %7, i64 %9
; ββββ
; ββ @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\arrayops.jl:49 within `vload` @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\arrayops.jl:49 @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\arrayops.jl:50
; βββ @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\LLVM_intrinsics.jl:462 within `load`
; ββββ @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\LLVM_intrinsics.jl:471 within `macro expansion`
%ptr.i = bitcast i8* %10 to <8 x float>*
%res.i16 = load <8 x float>, <8 x float>* %ptr.i, align 4
; ββββ
; β @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\simdvec.jl:259 within `add_fast`
; ββ @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\LLVM_intrinsics.jl:212 within `fadd`
; βββ @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\LLVM_intrinsics.jl:221 within `macro expansion`
%11 = fadd fast <8 x float> %res.i16, %value_phi5
; βββ
; @ h:\Code\Renderer\simdVload.jl:11 within `test`
; β @ range.jl:891 within `iterate`
; ββ @ promotion.jl:499 within `==`
%.not = icmp eq i64 %value_phi3, %4
; ββ
%12 = add i64 %value_phi3, 8
; β
br i1 %.not, label %L84, label %L18
L84: ; preds = %L18, %top
%value_phi10 = phi <8 x float> [ zeroinitializer, %top ], [ %11, %L18 ]
; @ h:\Code\Renderer\simdVload.jl:12 within `test`
; β @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\simdvec.jl:475 within `sum`
; ββ @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\LLVM_intrinsics.jl:826 within `reduce_fadd`
; βββ @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\LLVM_intrinsics.jl:842 within `macro expansion`
%res.i = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> %value_phi10)
; βββ
ret float %res.i
}
function test4(x)
N = 8
c = Vec{N, Float32}(0)
lane = VecRange{N}(0)
i = 1
@inbounds @fastmath while i β€ length(x)
c += x[lane + i]
i += N
end
return sum(c)
end
; @ h:\Code\Renderer\simdVload.jl:26 within `test4`
; Function Attrs: uwtable
define float @julia_test4_3502({}* noundef nonnull align 16 dereferenceable(40) %0) #0 {
top:
; @ h:\Code\Renderer\simdVload.jl:34 within `test4`
; β @ essentials.jl:10 within `length`
%1 = bitcast {}* %0 to { i8*, i64, i16, i16, i32 }*
%2 = getelementptr inbounds { i8*, i64, i16, i16, i32 }, { i8*, i64, i16, i16, i32 }* %1, i64 0, i32 1
%3 = load i64, i64* %2, align 8
; β
; β @ int.jl:488 within `<=`
%.not9 = icmp eq i64 %3, 0
; β
br i1 %.not9, label %L62, label %L8.lr.ph
L8.lr.ph: ; preds = %top
%4 = bitcast {}* %0 to i8**
%5 = load i8*, i8** %4, align 8
br label %L8
L8: ; preds = %L8, %L8.lr.ph
%value_phi111 = phi <8 x float> [ zeroinitializer, %L8.lr.ph ], [ %9, %L8 ]
%value_phi10 = phi i64 [ 1, %L8.lr.ph ], [ %10, %L8 ]
; @ h:\Code\Renderer\simdVload.jl:35 within `test4`
; β @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\arrayops.jl:302 within `getindex`
; ββ @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\arrayops.jl:286 within `_pointer`
; βββ @ abstractarray.jl:1243 within `pointer`
; ββββ @ abstractarray.jl:1247 within `_memory_offset`
; βββββ @ int.jl:88 within `*`
%6 = shl i64 %value_phi10, 2
%7 = add i64 %6, -4
; βββββ
; ββββ @ pointer.jl:167 within `+`
%8 = getelementptr i8, i8* %5, i64 %7
; ββββ
; ββ @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\arrayops.jl:49 within `vload` @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\arrayops.jl:49 @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\arrayops.jl:50
; βββ @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\LLVM_intrinsics.jl:462 within `load`
; ββββ @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\LLVM_intrinsics.jl:471 within `macro expansion`
%ptr.i = bitcast i8* %8 to <8 x float>*
%res.i8 = load <8 x float>, <8 x float>* %ptr.i, align 4
; ββββ
; β @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\simdvec.jl:259 within `add_fast`
; ββ @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\LLVM_intrinsics.jl:212 within `fadd`
; βββ @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\LLVM_intrinsics.jl:221 within `macro expansion`
%9 = fadd fast <8 x float> %res.i8, %value_phi111
; βββ
; @ h:\Code\Renderer\simdVload.jl:36 within `test4`
; β @ fastmath.jl:270 within `add_fast`
; ββ @ int.jl:87 within `+`
%10 = add nuw nsw i64 %value_phi10, 1
; ββ
; @ h:\Code\Renderer\simdVload.jl:34 within `test4`
; β @ int.jl:488 within `<=`
%exitcond = icmp eq i64 %value_phi10, %3
; β
br i1 %exitcond, label %L62, label %L8
L62: ; preds = %L8, %top
%value_phi1.lcssa = phi <8 x float> [ zeroinitializer, %top ], [ %9, %L8 ]
; @ h:\Code\Renderer\simdVload.jl:39 within `test4`
; β @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\simdvec.jl:475 within `sum`
; ββ @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\LLVM_intrinsics.jl:826 within `reduce_fadd`
; βββ @ C:\Users\rag\.julia\packages\SIMD\7eukp\src\LLVM_intrinsics.jl:842 within `macro expansion`
%res.i = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> %value_phi1.lcssa)
; βββ
ret float %res.i
}
function test7(x)
c = 0
i = 1
while i β€ length(x)
c += x[i]
i += 1
end
return c
end
; @ h:\Code\Renderer\simdVload.jl:122 within `test7`
; Function Attrs: uwtable
define { {}*, i8 } @julia_test7_3511([8 x i8]* noalias nocapture noundef nonnull align 8 dereferenceable(8) %0, {}* noundef nonnull align 16 dereferenceable(40) %1) #0 {
top:
; @ h:\Code\Renderer\simdVload.jl:127 within `test7`
; β @ essentials.jl:10 within `length`
%2 = bitcast {}* %1 to { i8*, i64, i16, i16, i32 }*
%3 = getelementptr inbounds { i8*, i64, i16, i16, i32 }, { i8*, i64, i16, i16, i32 }* %2, i64 0, i32 1
%4 = load i64, i64* %3, align 8
; β
; β @ int.jl:488 within `<=`
%.not23 = icmp eq i64 %4, 0
; β
br i1 %.not23, label %union_move6, label %idxend.peel
idxend.peel: ; preds = %top
%5 = bitcast {}* %1 to float**
%6 = load float*, float** %5, align 8
; @ h:\Code\Renderer\simdVload.jl:128 within `test7`
; β @ essentials.jl:13 within `getindex`
%7 = load float, float* %6, align 4
; β
; @ h:\Code\Renderer\simdVload.jl within `test7`
%value_phi5.peel = fadd float %7, 0.000000e+00
; @ h:\Code\Renderer\simdVload.jl:127 within `test7`
; β @ int.jl:488 within `<=`
%.not.peel = icmp eq i64 %4, 1
; β
br i1 %.not.peel, label %union_move, label %L9
L9: ; preds = %idxend, %idxend.peel
%value_phi25 = phi i64 [ %14, %idxend ], [ 2, %idxend.peel ]
%8 = phi float [ %value_phi5, %idxend ], [ %value_phi5.peel, %idxend.peel ]
; @ h:\Code\Renderer\simdVload.jl:128 within `test7`
; β @ essentials.jl:13 within `getindex`
%9 = add nsw i64 %value_phi25, -1
%10 = icmp ult i64 %9, %4
br i1 %10, label %idxend, label %oob
oob: ; preds = %L9
%11 = alloca i64, align 8
store i64 %value_phi25, i64* %11, align 8
call void @ijl_bounds_error_ints({}* %1, i64* nonnull %11, i64 1)
unreachable
idxend: ; preds = %L9
%12 = getelementptr inbounds float, float* %6, i64 %9
%13 = load float, float* %12, align 4
; β
; @ h:\Code\Renderer\simdVload.jl within `test7`
%value_phi5 = fadd float %8, %13
; @ h:\Code\Renderer\simdVload.jl:130 within `test7`
; β @ int.jl:87 within `+`
%14 = add nuw nsw i64 %value_phi25, 1
; β
; @ h:\Code\Renderer\simdVload.jl:127 within `test7`
; β @ int.jl:488 within `<=`
%.not.not = icmp ult i64 %value_phi25, %4
; β
br i1 %.not.not, label %L9, label %union_move
post_union_move: ; preds = %union_move6, %union_move
%tindex_phi.lcssa37 = phi i8 [ 2, %union_move6 ], [ 1, %union_move ]
; @ h:\Code\Renderer\simdVload.jl:133 within `test7`
%15 = insertvalue { {}*, i8 } { {}* null, i8 undef }, i8 %tindex_phi.lcssa37, 1
ret { {}*, i8 } %15
union_move: ; preds = %idxend, %idxend.peel
%.lcssa.in = phi float [ %value_phi5.peel, %idxend.peel ], [ %value_phi5, %idxend ]
%16 = bitcast [8 x i8]* %0 to float*
store float %.lcssa.in, float* %16, align 8
br label %post_union_move
union_move6: ; preds = %top
%.sroa_cast15 = bitcast [8 x i8]* %0 to i32*
store i32 0, i32* %.sroa_cast15, align 8
%.sroa_idx = getelementptr inbounds [8 x i8], [8 x i8]* %0, i64 0, i64 4
%.sroa_cast16 = bitcast i8* %.sroa_idx to i32*
store i32 0, i32* %.sroa_cast16, align 4
br label %post_union_move
}
function test7(x)
c = 0
i = 1
while i β€ length(x)
@inbounds c += x[i]
i += 1
end
return c
end
; @ h:\Code\Renderer\simdVload.jl:122 within `test7`
; Function Attrs: uwtable
define { {}*, i8 } @julia_test7_3517([8 x i8]* noalias nocapture noundef nonnull align 8 dereferenceable(8) %0, {}* noundef nonnull align 16 dereferenceable(40) %1) #0 {
top:
; @ h:\Code\Renderer\simdVload.jl:127 within `test7`
; β @ essentials.jl:10 within `length`
%2 = bitcast {}* %1 to { i8*, i64, i16, i16, i32 }*
%3 = getelementptr inbounds { i8*, i64, i16, i16, i32 }, { i8*, i64, i16, i16, i32 }* %2, i64 0, i32 1
%4 = load i64, i64* %3, align 8
; β
; β @ int.jl:488 within `<=`
%.not20 = icmp eq i64 %4, 0
; β
br i1 %.not20, label %union_move6, label %L9.lr.ph
L9.lr.ph: ; preds = %top
%5 = bitcast {}* %1 to float**
%6 = load float*, float** %5, align 8
; @ h:\Code\Renderer\simdVload.jl:128 within `test7`
; β @ essentials.jl:13 within `getindex`
%7 = load float, float* %6, align 4
; β
; @ h:\Code\Renderer\simdVload.jl within `test7`
%value_phi5.peel = fadd float %7, 0.000000e+00
; @ h:\Code\Renderer\simdVload.jl:127 within `test7`
; β @ int.jl:488 within `<=`
%exitcond.peel = icmp eq i64 %4, 1
; β
br i1 %exitcond.peel, label %union_move, label %L9
L9: ; preds = %L9, %L9.lr.ph
%value_phi22 = phi i64 [ %12, %L9 ], [ 2, %L9.lr.ph ]
%8 = phi float [ %value_phi5, %L9 ], [ %value_phi5.peel, %L9.lr.ph ]
; @ h:\Code\Renderer\simdVload.jl:128 within `test7`
; β @ essentials.jl:13 within `getindex`
%9 = add nsw i64 %value_phi22, -1
%10 = getelementptr inbounds float, float* %6, i64 %9
%11 = load float, float* %10, align 4
; β
; @ h:\Code\Renderer\simdVload.jl within `test7`
%value_phi5 = fadd float %8, %11
; @ h:\Code\Renderer\simdVload.jl:130 within `test7`
; β @ int.jl:87 within `+`
%12 = add nuw nsw i64 %value_phi22, 1
; β
; @ h:\Code\Renderer\simdVload.jl:127 within `test7`
; β @ int.jl:488 within `<=`
%exitcond = icmp eq i64 %value_phi22, %4
; β
br i1 %exitcond, label %union_move, label %L9
post_union_move: ; preds = %union_move6, %union_move
%tindex_phi.lcssa31 = phi i8 [ 2, %union_move6 ], [ 1, %union_move ]
; @ h:\Code\Renderer\simdVload.jl:133 within `test7`
%13 = insertvalue { {}*, i8 } { {}* null, i8 undef }, i8 %tindex_phi.lcssa31, 1
ret { {}*, i8 } %13
union_move: ; preds = %L9, %L9.lr.ph
%.lcssa.in = phi float [ %value_phi5.peel, %L9.lr.ph ], [ %value_phi5, %L9 ]
%14 = bitcast [8 x i8]* %0 to float*
store float %.lcssa.in, float* %14, align 8
br label %post_union_move
union_move6: ; preds = %top
%.sroa_cast15 = bitcast [8 x i8]* %0 to i32*
store i32 0, i32* %.sroa_cast15, align 8
%.sroa_idx = getelementptr inbounds [8 x i8], [8 x i8]* %0, i64 0, i64 4
%.sroa_cast16 = bitcast i8* %.sroa_idx to i32*
store i32 0, i32* %.sroa_cast16, align 4
br label %post_union_move
}
function test8(x)
c = 0
for i in eachindex(x)
c += x[i]
end
return c
end
; @ h:\Code\Renderer\simdVload.jl:132 within `test8`
; Function Attrs: uwtable
define { {}*, i8 } @julia_test8_3514([8 x i8]* noalias nocapture noundef nonnull align 8 dereferenceable(8) %0, {}* noundef nonnull align 16 dereferenceable(40) %1) #0 {
top:
; @ h:\Code\Renderer\simdVload.jl:135 within `test8`
; β @ abstractarray.jl:314 within `eachindex`
; ββ @ abstractarray.jl:133 within `axes1`
; βββ @ abstractarray.jl:98 within `axes`
; ββββ @ array.jl:149 within `size`
%2 = bitcast {}* %1 to { i8*, i64, i16, i16, i32 }*
%3 = getelementptr inbounds { i8*, i64, i16, i16, i32 }, { i8*, i64, i16, i16, i32 }* %2, i64 0, i32 1
%4 = load i64, i64* %3, align 8
; ββββ
; β @ range.jl:887 within `iterate`
; ββ @ range.jl:662 within `isempty`
; βββ @ operators.jl:369 within `>`
; ββββ @ int.jl:83 within `<`
%.not.not = icmp eq i64 %4, 0
; ββββ
br i1 %.not.not, label %union_move15, label %idxend.lr.ph
idxend.lr.ph: ; preds = %top
%5 = bitcast {}* %1 to float**
%6 = load float*, float** %5, align 8
; @ h:\Code\Renderer\simdVload.jl:136 within `test8`
; β @ essentials.jl:13 within `getindex`
br label %idxend
L43: ; preds = %idxend
; β
; @ h:\Code\Renderer\simdVload.jl:137 within `test8`
; β @ range.jl:891 within `iterate`
%7 = add nuw nsw i64 %value_phi347, 1
; β
; @ h:\Code\Renderer\simdVload.jl:135 within `test8`
; β @ range.jl:887 within `iterate`
%8 = bitcast float %value_phi9 to i32
; β
; @ h:\Code\Renderer\simdVload.jl:136 within `test8`
; β @ essentials.jl:13 within `getindex`
br label %idxend
idxend: ; preds = %L43, %idxend.lr.ph
%9 = phi i64 [ 0, %idxend.lr.ph ], [ %value_phi347, %L43 ]
%value_phi548 = phi i1 [ true, %idxend.lr.ph ], [ false, %L43 ]
%value_phi347 = phi i64 [ 1, %idxend.lr.ph ], [ %7, %L43 ]
%.sroa.030.046 = phi i32 [ 0, %idxend.lr.ph ], [ %8, %L43 ]
%10 = getelementptr inbounds float, float* %6, i64 %9
%11 = load float, float* %10, align 4
; β
%12 = bitcast i32 %.sroa.030.046 to float
%13 = uitofp i32 %.sroa.030.046 to float
%.pn = select i1 %value_phi548, float %13, float %12
; @ h:\Code\Renderer\simdVload.jl within `test8`
%value_phi9 = fadd float %.pn, %11
; @ h:\Code\Renderer\simdVload.jl:137 within `test8`
; β @ range.jl:891 within `iterate`
; ββ @ promotion.jl:499 within `==`
%.not = icmp eq i64 %value_phi347, %4
; ββ
br i1 %.not, label %union_move, label %L43
post_union_move: ; preds = %union_move15, %union_move
%tindex_phi1344 = phi i8 [ 2, %union_move15 ], [ 1, %union_move ]
; @ h:\Code\Renderer\simdVload.jl:139 within `test8`
%14 = insertvalue { {}*, i8 } { {}* null, i8 undef }, i8 %tindex_phi1344, 1
ret { {}*, i8 } %14
union_move: ; preds = %idxend
%15 = bitcast [8 x i8]* %0 to float*
store float %value_phi9, float* %15, align 8
br label %post_union_move
union_move15: ; preds = %top
%16 = bitcast [8 x i8]* %0 to i64*
store i64 0, i64* %16, align 8
br label %post_union_move
}