I see vectorized instructions for `vsum(Array)` but no vectorized instructions for `vsum(Vector)`.
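For reference, here is the `@code_llvm` output for both calls: first the `Vector` case (`rand(10)`), which contains only a scalar `fadd fast double` loop, and then the two-dimensional `Array` case (`rand(10,1)`), which contains a `vector.body` block operating on `<4 x double>`: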
julia> @code_llvm vsum(rand(10))
; Function vsum
; Location: REPL[32]:2
define double @julia_vsum_36461(%jl_value_t addrspace(10)* nonnull dereferenceable(40)) {
top:
; Location: REPL[32]:3
; Function macro expansion; {
; Location: simdloop.jl:65
; Function length; {
; Location: array.jl:174
%1 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
%2 = bitcast %jl_value_t addrspace(11)* %1 to %jl_array_t addrspace(11)*
%3 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %2, i64 0, i32 1
%4 = load i64, i64 addrspace(11)* %3, align 8
;}
; Function Colon; {
; Location: range.jl:5
; Function Type; {
; Location: range.jl:185
; Function unitrange_last; {
; Location: range.jl:190
; Function >=; {
; Location: operators.jl:330
; Function <=; {
; Location: int.jl:419
%5 = icmp sgt i64 %4, 0
;}}
%6 = select i1 %5, i64 %4, i64 0
;}}}
; Location: simdloop.jl:67
; Function simd_inner_length; {
; Location: simdloop.jl:47
; Function _length; {
; Location: abstractarray.jl:157
; Function axes; {
; Location: abstractarray.jl:83
; Function size; {
; Location: range.jl:350
; Function length; {
; Location: range.jl:411
; Function checked_sub; {
; Location: checked.jl:226
; Function sub_with_overflow; {
; Location: checked.jl:198
%7 = add nsw i64 %6, -1
;}}
; Function checked_add; {
; Location: checked.jl:169
; Function add_with_overflow; {
; Location: checked.jl:136
%8 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %7, i64 1)
%9 = extractvalue { i64, i1 } %8, 1
;}
; Location: checked.jl:170
br i1 %9, label %L28, label %L33.lr.ph
L33.lr.ph: ; preds = %top
; Location: checked.jl:169
; Function add_with_overflow; {
; Location: checked.jl:136
%10 = extractvalue { i64, i1 } %8, 0
%11 = icmp sgt i64 %10, 0
;}
; Location: checked.jl:170
br i1 %11, label %L45.lr.ph.us.us, label %L100
L45.lr.ph.us.us: ; preds = %L33.lr.ph
%12 = bitcast %jl_value_t addrspace(11)* %1 to double* addrspace(11)*
;}}}}}}
; Location: simdloop.jl:71
br label %L61.us.us
L61.us.us: ; preds = %L45.lr.ph.us.us, %L61.us.us
%value_phi664.us.us = phi i64 [ 0, %L45.lr.ph.us.us ], [ %17, %L61.us.us ]
%value_phi563.us.us = phi double [ 0.000000e+00, %L45.lr.ph.us.us ], [ %16, %L61.us.us ]
; Location: simdloop.jl:73
; Function macro expansion; {
; Location: REPL[32]:4
; Function getindex; {
; Location: array.jl:707
%13 = load double*, double* addrspace(11)* %12, align 8
%14 = getelementptr double, double* %13, i64 %value_phi664.us.us
%15 = load double, double* %14, align 8
;}
; Function +; {
; Location: float.jl:394
%16 = fadd fast double %value_phi563.us.us, %15
;}}
; Location: simdloop.jl:74
; Function +; {
; Location: int.jl:53
%17 = add nuw nsw i64 %value_phi664.us.us, 1
;}
; Location: simdloop.jl:71
; Function <; {
; Location: int.jl:49
%18 = icmp ult i64 %17, %10
;}
br i1 %18, label %L61.us.us, label %L100
L28: ; preds = %top
; Location: simdloop.jl:67
; Function simd_inner_length; {
; Location: simdloop.jl:47
; Function _length; {
; Location: abstractarray.jl:157
; Function axes; {
; Location: abstractarray.jl:83
; Function size; {
; Location: range.jl:350
; Function length; {
; Location: range.jl:411
; Function checked_add; {
; Location: checked.jl:170
call void @julia_throw_overflowerr_binaryop_24633(%jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 47979298915040 to %jl_value_t*) to %jl_value_t addrspace(10)*), i64 %7, i64 1)
call void @llvm.trap()
unreachable
L100: ; preds = %L61.us.us, %L33.lr.ph
%value_phi10.lcssa = phi double [ 0.000000e+00, %L33.lr.ph ], [ %16, %L61.us.us ]
;}}}}}}}
; Location: REPL[32]:6
ret double %value_phi10.lcssa
julia> @code_llvm vsum(rand(10,1))
; Function vsum
; Location: REPL[32]:2
define double @julia_vsum_36468(%jl_value_t addrspace(10)* nonnull dereferenceable(40)) {
top:
%1 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
%2 = bitcast %jl_value_t addrspace(11)* %1 to %jl_array_t addrspace(11)*
%3 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %2, i64 0, i32 1
%4 = load i64, i64 addrspace(11)* %3, align 8
; Location: REPL[32]:3
; Function macro expansion; {
; Location: simdloop.jl:65
; Function Colon; {
; Location: range.jl:5
; Function Type; {
; Location: range.jl:185
; Function unitrange_last; {
; Location: range.jl:190
; Function >=; {
; Location: operators.jl:330
; Function <=; {
; Location: int.jl:419
%5 = icmp sgt i64 %4, 0
;}}
%6 = select i1 %5, i64 %4, i64 0
;}}}
; Location: simdloop.jl:67
; Function simd_inner_length; {
; Location: simdloop.jl:47
; Function _length; {
; Location: abstractarray.jl:157
; Function axes; {
; Location: abstractarray.jl:83
; Function size; {
; Location: range.jl:350
; Function length; {
; Location: range.jl:411
; Function checked_sub; {
; Location: checked.jl:226
; Function sub_with_overflow; {
; Location: checked.jl:198
%7 = add nsw i64 %6, -1
;}}
; Function checked_add; {
; Location: checked.jl:169
; Function add_with_overflow; {
; Location: checked.jl:136
%8 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %7, i64 1)
%9 = extractvalue { i64, i1 } %8, 1
;}
; Location: checked.jl:170
br i1 %9, label %L28, label %L33.lr.ph
L33.lr.ph: ; preds = %top
; Location: checked.jl:169
; Function add_with_overflow; {
; Location: checked.jl:136
%10 = extractvalue { i64, i1 } %8, 0
%11 = icmp sgt i64 %10, 0
;}
; Location: checked.jl:170
br i1 %11, label %L45.lr.ph.us.us, label %L100
L45.lr.ph.us.us: ; preds = %L33.lr.ph
%12 = bitcast %jl_value_t addrspace(11)* %1 to double* addrspace(11)*
%13 = load double*, double* addrspace(11)* %12, align 8
;}}}}}}
; Location: simdloop.jl:71
%min.iters.check = icmp ult i64 %6, 16
br i1 %min.iters.check, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %L45.lr.ph.us.us
%n.vec = and i64 %6, 9223372036854775792
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
; Location: simdloop.jl:74
; Function +; {
; Location: int.jl:53
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x double> [ zeroinitializer, %vector.ph ], [ %22, %vector.body ]
%vec.phi91 = phi <4 x double> [ zeroinitializer, %vector.ph ], [ %23, %vector.body ]
%vec.phi92 = phi <4 x double> [ zeroinitializer, %vector.ph ], [ %24, %vector.body ]
%vec.phi93 = phi <4 x double> [ zeroinitializer, %vector.ph ], [ %25, %vector.body ]
;}
; Location: simdloop.jl:73
; Function macro expansion; {
; Location: REPL[32]:4
; Function getindex; {
; Location: array.jl:707
%14 = getelementptr double, double* %13, i64 %index
%15 = bitcast double* %14 to <4 x double>*
%wide.load = load <4 x double>, <4 x double>* %15, align 8
%16 = getelementptr double, double* %14, i64 4
%17 = bitcast double* %16 to <4 x double>*
%wide.load94 = load <4 x double>, <4 x double>* %17, align 8
%18 = getelementptr double, double* %14, i64 8
%19 = bitcast double* %18 to <4 x double>*
%wide.load95 = load <4 x double>, <4 x double>* %19, align 8
%20 = getelementptr double, double* %14, i64 12
%21 = bitcast double* %20 to <4 x double>*
%wide.load96 = load <4 x double>, <4 x double>* %21, align 8
;}
; Function +; {
; Location: float.jl:394
%22 = fadd fast <4 x double> %vec.phi, %wide.load
%23 = fadd fast <4 x double> %vec.phi91, %wide.load94
%24 = fadd fast <4 x double> %vec.phi92, %wide.load95
%25 = fadd fast <4 x double> %vec.phi93, %wide.load96
;}}
; Location: simdloop.jl:74
; Function +; {
; Location: int.jl:53
%index.next = add i64 %index, 16
%26 = icmp eq i64 %index.next, %n.vec
br i1 %26, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
;}
; Location: simdloop.jl:73
; Function macro expansion; {
; Location: REPL[32]:4
; Function +; {
; Location: float.jl:394
%bin.rdx = fadd fast <4 x double> %23, %22
%bin.rdx97 = fadd fast <4 x double> %24, %bin.rdx
%bin.rdx98 = fadd fast <4 x double> %25, %bin.rdx97
%rdx.shuf = shufflevector <4 x double> %bin.rdx98, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%bin.rdx99 = fadd fast <4 x double> %bin.rdx98, %rdx.shuf
%rdx.shuf100 = shufflevector <4 x double> %bin.rdx99, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx101 = fadd fast <4 x double> %bin.rdx99, %rdx.shuf100
%27 = extractelement <4 x double> %bin.rdx101, i32 0
%cmp.n = icmp eq i64 %6, %n.vec
;}}
; Location: simdloop.jl:71
br i1 %cmp.n, label %L100, label %scalar.ph
scalar.ph: ; preds = %middle.block, %L45.lr.ph.us.us
%bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %L45.lr.ph.us.us ]
%bc.merge.rdx = phi double [ %27, %middle.block ], [ 0.000000e+00, %L45.lr.ph.us.us ]
br label %L61.us.us
L61.us.us: ; preds = %scalar.ph, %L61.us.us
%value_phi665.us.us = phi i64 [ %bc.resume.val, %scalar.ph ], [ %31, %L61.us.us ]
%value_phi564.us.us = phi double [ %bc.merge.rdx, %scalar.ph ], [ %30, %L61.us.us ]
; Location: simdloop.jl:73
; Function macro expansion; {
; Location: REPL[32]:4
; Function getindex; {
; Location: array.jl:707
%28 = getelementptr double, double* %13, i64 %value_phi665.us.us
%29 = load double, double* %28, align 8
;}
; Function +; {
; Location: float.jl:394
%30 = fadd fast double %value_phi564.us.us, %29
;}}
; Location: simdloop.jl:74
; Function +; {
; Location: int.jl:53
%31 = add nuw nsw i64 %value_phi665.us.us, 1
;}
; Location: simdloop.jl:71
; Function <; {
; Location: int.jl:49
%32 = icmp ult i64 %31, %10
;}
br i1 %32, label %L61.us.us, label %L100
L28: ; preds = %top
; Location: simdloop.jl:67
; Function simd_inner_length; {
; Location: simdloop.jl:47
; Function _length; {
; Location: abstractarray.jl:157
; Function axes; {
; Location: abstractarray.jl:83
; Function size; {
; Location: range.jl:350
; Function length; {
; Location: range.jl:411
; Function checked_add; {
; Location: checked.jl:170
call void @julia_throw_overflowerr_binaryop_24633(%jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 47979298915040 to %jl_value_t*) to %jl_value_t addrspace(10)*), i64 %7, i64 1)
call void @llvm.trap()
unreachable
L100: ; preds = %L61.us.us, %middle.block, %L33.lr.ph
%value_phi10.lcssa = phi double [ 0.000000e+00, %L33.lr.ph ], [ %30, %L61.us.us ], [ %27, %middle.block ]
;}}}}}}}
; Location: REPL[32]:6
ret double %value_phi10.lcssa