I think it is already using SIMD; here is the generated LLVM IR:
julia> @code_llvm test1(A,B)
; Function Signature: test1(Array{Int64, 1}, Array{Int64, 1})
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:15 within `test1`
; Function Attrs: uwtable
define void @julia_test1_42981(ptr noundef nonnull align 8 dereferenceable(24) %"x::Array", ptr noundef nonnull align 8 dereferenceable(24) %"y::Array") #0 {
top:
%"new::Tuple95" = alloca [1 x i64], align 8
%"new::Tuple103" = alloca [1 x i64], align 8
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:16 within `test1`
; ┌ @ abstractarray.jl:321 within `eachindex`
; │┌ @ abstractarray.jl:137 within `axes1`
; ││┌ @ abstractarray.jl:98 within `axes`
; │││┌ @ array.jl:194 within `size`
%0 = getelementptr inbounds i8, ptr %"x::Array", i64 16
%.size.sroa.0.0.copyload = load i64, ptr %0, align 8
; └└└└
; ┌ @ range.jl:904 within `iterate`
; │┌ @ range.jl:681 within `isempty`
; ││┌ @ operators.jl:379 within `>`
; │││┌ @ int.jl:83 within `<`
%1 = icmp slt i64 %.size.sroa.0.0.copyload, 1
; └└└└
br i1 %1, label %L144, label %L13.preheader
L13.preheader: ; preds = %top
%2 = load ptr, ptr %"x::Array", align 8
%3 = getelementptr inbounds i8, ptr %"y::Array", i64 16
%.size30.sroa.0.0.copyload = load i64, ptr %3, align 8
%4 = load ptr, ptr %"y::Array", align 8
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:17 within `test1`
; ┌ @ essentials.jl:916 within `getindex`
%smin = call i64 @llvm.smin.i64(i64 %.size30.sroa.0.0.copyload, i64 0)
%5 = sub i64 %.size30.sroa.0.0.copyload, %smin
%smax = call i64 @llvm.smax.i64(i64 %smin, i64 -1)
%6 = add nsw i64 %smax, 1
%7 = mul nuw nsw i64 %5, %6
%umin = call i64 @llvm.umin.i64(i64 %.size.sroa.0.0.copyload, i64 %7)
%.not = icmp eq i64 %umin, 0
br i1 %.not, label %main.pseudo.exit, label %L129.preheader
L129.preheader: ; preds = %L13.preheader
; └
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:20 within `test1`
%min.iters.check = icmp ult i64 %umin, 8
br i1 %min.iters.check, label %scalar.ph, label %vector.memcheck
vector.memcheck: ; preds = %L129.preheader
%8 = shl i64 %umin, 3
%uglygep = getelementptr i8, ptr %2, i64 %8
%uglygep164 = getelementptr i8, ptr %4, i64 %8
%bound0 = icmp ult ptr %2, %uglygep164
%bound1 = icmp ult ptr %4, %uglygep
%found.conflict = and i1 %bound0, %bound1
br i1 %found.conflict, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %vector.memcheck
%n.vec = and i64 %umin, -8
%ind.end = or i64 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:17 within `test1`
; ┌ @ essentials.jl:917 within `getindex`
%9 = getelementptr inbounds i64, ptr %2, i64 %index
%wide.load = load <4 x i64>, ptr %9, align 8
%10 = getelementptr inbounds i64, ptr %9, i64 4
%wide.load165 = load <4 x i64>, ptr %10, align 8
; └
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:18 within `test1`
; ┌ @ essentials.jl:917 within `getindex`
%11 = getelementptr inbounds i64, ptr %4, i64 %index
%wide.load166 = load <4 x i64>, ptr %11, align 8
%12 = getelementptr inbounds i64, ptr %11, i64 4
%wide.load167 = load <4 x i64>, ptr %12, align 8
; └
; ┌ @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:12 within `min__`
%13 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %wide.load, <4 x i64> %wide.load166)
%14 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %wide.load165, <4 x i64> %wide.load167)
; └
; ┌ @ array.jl:987 within `setindex!`
store <4 x i64> %13, ptr %9, align 8
store <4 x i64> %14, ptr %10, align 8
; └
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:19 within `test1`
; ┌ @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:13 within `max__`
%15 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %wide.load166, <4 x i64> %wide.load)
%16 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %wide.load167, <4 x i64> %wide.load165)
; └
; ┌ @ array.jl:987 within `setindex!`
store <4 x i64> %15, ptr %11, align 8
store <4 x i64> %16, ptr %12, align 8
%index.next = add nuw i64 %index, 8
%17 = icmp eq i64 %index.next, %n.vec
br i1 %17, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; └
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:20 within `test1`
%cmp.n = icmp eq i64 %umin, %n.vec
br i1 %cmp.n, label %main.exit.selector, label %scalar.ph
scalar.ph: ; preds = %middle.block, %vector.memcheck, %L129.preheader
%bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L129.preheader ], [ 1, %vector.memcheck ]
br label %L129
L26: ; preds = %L13.postloop
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:17 within `test1`
; ┌ @ essentials.jl:916 within `getindex`
store i64 %value_phi4.postloop, ptr %"new::Tuple103", align 8
call void @j_throw_boundserror_42997(ptr nonnull %"x::Array", ptr nocapture nonnull readonly %"new::Tuple103") #8
unreachable
L62: ; preds = %L47.postloop
; └
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:18 within `test1`
; ┌ @ essentials.jl:916 within `getindex`
store i64 %value_phi4.postloop, ptr %"new::Tuple95", align 8
call void @j_throw_boundserror_42997(ptr nonnull %"y::Array", ptr nocapture nonnull readonly %"new::Tuple95") #8
unreachable
L129: ; preds = %L129, %scalar.ph
%value_phi4 = phi i64 [ %24, %L129 ], [ %bc.resume.val, %scalar.ph ]
; └
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:17 within `test1`
; ┌ @ essentials.jl:916 within `getindex`
%18 = add nsw i64 %value_phi4, -1
; │ @ essentials.jl:917 within `getindex`
%19 = getelementptr inbounds i64, ptr %2, i64 %18
%20 = load i64, ptr %19, align 8
; └
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:18 within `test1`
; ┌ @ essentials.jl:917 within `getindex`
%21 = getelementptr inbounds i64, ptr %4, i64 %18
%22 = load i64, ptr %21, align 8
; └
; ┌ @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:12 within `min__`
%value_phi41 = call i64 @llvm.smin.i64(i64 %20, i64 %22)
; └
; ┌ @ array.jl:987 within `setindex!`
store i64 %value_phi41, ptr %19, align 8
; └
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:19 within `test1`
; ┌ @ essentials.jl:917 within `getindex`
%23 = load i64, ptr %21, align 8
; └
; ┌ @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:13 within `max__`
%value_phi68 = call i64 @llvm.smax.i64(i64 %23, i64 %20)
; └
; ┌ @ array.jl:987 within `setindex!`
store i64 %value_phi68, ptr %21, align 8
; └
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:20 within `test1`
; ┌ @ range.jl:908 within `iterate`
%24 = add nuw i64 %value_phi4, 1
; └
%.not161 = icmp ult i64 %value_phi4, %umin
br i1 %.not161, label %L129, label %main.exit.selector
main.exit.selector: ; preds = %L129, %middle.block
%value_phi4.lcssa = phi i64 [ %umin, %middle.block ], [ %value_phi4, %L129 ]
; ┌ @ range.jl:908 within `iterate`
%.lcssa = phi i64 [ %ind.end, %middle.block ], [ %24, %L129 ]
; └
%25 = icmp ult i64 %value_phi4.lcssa, %.size.sroa.0.0.copyload
br i1 %25, label %main.pseudo.exit, label %L144
main.pseudo.exit: ; preds = %main.exit.selector, %L13.preheader
%value_phi4.copy = phi i64 [ 1, %L13.preheader ], [ %.lcssa, %main.exit.selector ]
br label %L13.postloop
L144: ; preds = %L129.postloop, %main.exit.selector, %top
ret void
L13.postloop: ; preds = %L129.postloop, %main.pseudo.exit
%value_phi4.postloop = phi i64 [ %32, %L129.postloop ], [ %value_phi4.copy, %main.pseudo.exit ]
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:17 within `test1`
; ┌ @ essentials.jl:916 within `getindex`
%26 = add i64 %value_phi4.postloop, -1
%.not.postloop = icmp ult i64 %26, %.size.sroa.0.0.copyload
br i1 %.not.postloop, label %L47.postloop, label %L26
L47.postloop: ; preds = %L13.postloop
; └
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:18 within `test1`
; ┌ @ essentials.jl:916 within `getindex`
%.not126.postloop = icmp ult i64 %26, %.size30.sroa.0.0.copyload
br i1 %.not126.postloop, label %L129.postloop, label %L62
L129.postloop: ; preds = %L47.postloop
; └
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:17 within `test1`
; ┌ @ essentials.jl:917 within `getindex`
%27 = getelementptr inbounds i64, ptr %2, i64 %26
%28 = load i64, ptr %27, align 8
; └
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:18 within `test1`
; ┌ @ essentials.jl:917 within `getindex`
%29 = getelementptr inbounds i64, ptr %4, i64 %26
%30 = load i64, ptr %29, align 8
; └
; ┌ @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:12 within `min__`
%value_phi41.postloop = call i64 @llvm.smin.i64(i64 %28, i64 %30)
; └
; ┌ @ array.jl:987 within `setindex!`
store i64 %value_phi41.postloop, ptr %27, align 8
; └
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:19 within `test1`
; ┌ @ essentials.jl:917 within `getindex`
%31 = load i64, ptr %29, align 8
; └
; ┌ @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:13 within `max__`
%value_phi68.postloop = call i64 @llvm.smax.i64(i64 %31, i64 %28)
; └
; ┌ @ array.jl:987 within `setindex!`
store i64 %value_phi68.postloop, ptr %29, align 8
; └
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:20 within `test1`
; ┌ @ range.jl:908 within `iterate`
; │┌ @ promotion.jl:639 within `==`
%.not132.not.postloop = icmp eq i64 %value_phi4.postloop, %.size.sroa.0.0.copyload
; │└
%32 = add i64 %value_phi4.postloop, 1
; └
br i1 %.not132.not.postloop, label %L144, label %L13.postloop
}
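You can see it in these lines from `vector.body`, where the loads, the `smin`/`smax` calls, and the stores all operate on `<4 x i64>` vectors: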
%9 = getelementptr inbounds i64, ptr %2, i64 %index
%wide.load = load <4 x i64>, ptr %9, align 8
%10 = getelementptr inbounds i64, ptr %9, i64 4
%wide.load165 = load <4 x i64>, ptr %10, align 8
; └
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:18 within `test1`
; ┌ @ essentials.jl:917 within `getindex`
%11 = getelementptr inbounds i64, ptr %4, i64 %index
%wide.load166 = load <4 x i64>, ptr %11, align 8
%12 = getelementptr inbounds i64, ptr %11, i64 4
%wide.load167 = load <4 x i64>, ptr %12, align 8
; └
; ┌ @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:12 within `min__`
%13 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %wide.load, <4 x i64> %wide.load166)
%14 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %wide.load165, <4 x i64> %wide.load167)
; └
; ┌ @ array.jl:987 within `setindex!`
store <4 x i64> %13, ptr %9, align 8
store <4 x i64> %14, ptr %10, align 8
; └
; @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:19 within `test1`
; ┌ @ c:\Users\yolha\Desktop\juju_tests\TEST\main.jl:13 within `max__`
%15 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %wide.load166, <4 x i64> %wide.load)
%16 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %wide.load167, <4 x i64> %wide.load165)
; └
; ┌ @ array.jl:987 within `setindex!`
store <4 x i64> %15, ptr %11, align 8
store <4 x i64> %16, ptr %12, align 8
This explains why LV won't help: the compiler has already vectorized the loop on its own. If you find a language where this is faster than 0.13s, don't hesitate to show it.