@DNF I get the output below with the Julia 0.6 that I compiled from source (for a Skylake CPU). With the generic binaries, however, no vectorization happens, which I guess means that only recent instruction sets can give a performance improvement in this case (and they are not enabled in generic builds). That’s still surprising, since I would have expected that at least SSE/SSE2 instructions could be used, as they are available on all x86_64 CPUs…
julia> function square!(c::Array, x::Array)
# Write the elementwise square of `x` into `c`, in place.
# The loop is the last expression, so the function returns `nothing`.
#
# The size check below is what makes it valid to index `c` with the
# indices of `x` while bounds checking is turned off.
# NOTE(review): `@assert` is meant for internal invariants and may be
# disabled at some optimization levels; confirm that is acceptable here.
@assert size(c) == size(x)
# `@simd` marks the loop iterations as independent so the compiler is
# allowed to vectorize them.
@simd for i in eachindex(x)
# `@inbounds` skips the bounds checks on both the read and the write.
@inbounds c[i] = x[i] * x[i]
end
end
square! (generic function with 1 method)
julia> X = rand(Int, 1000);
julia> Y = rand(Int, 1000);
julia> @code_llvm square!(X, Y)
define void @"julia_square!_68142"(i8**, i8**) #0 !dbg !5 {
...
min.iters.checked: ; preds = %if4.lr.ph.us11
%n.vec = and i64 %36, -16
%cmp.zero = icmp eq i64 %n.vec, 0
br i1 %cmp.zero, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %min.iters.checked
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%42 = getelementptr i64, i64* %40, i64 %index
%43 = bitcast i64* %42 to <4 x i64>*
%wide.load = load <4 x i64>, <4 x i64>* %43, align 8
%44 = getelementptr i64, i64* %42, i64 4
%45 = bitcast i64* %44 to <4 x i64>*
%wide.load20 = load <4 x i64>, <4 x i64>* %45, align 8
%46 = getelementptr i64, i64* %42, i64 8
%47 = bitcast i64* %46 to <4 x i64>*
%wide.load21 = load <4 x i64>, <4 x i64>* %47, align 8
%48 = getelementptr i64, i64* %42, i64 12
%49 = bitcast i64* %48 to <4 x i64>*
%wide.load22 = load <4 x i64>, <4 x i64>* %49, align 8
%50 = mul <4 x i64> %wide.load, %wide.load
%51 = mul <4 x i64> %wide.load20, %wide.load20
%52 = mul <4 x i64> %wide.load21, %wide.load21
%53 = mul <4 x i64> %wide.load22, %wide.load22
%54 = getelementptr i64, i64* %41, i64 %index
%55 = bitcast i64* %54 to <4 x i64>*
store <4 x i64> %50, <4 x i64>* %55, align 8
%56 = getelementptr i64, i64* %54, i64 4
%57 = bitcast i64* %56 to <4 x i64>*
store <4 x i64> %51, <4 x i64>* %57, align 8
%58 = getelementptr i64, i64* %54, i64 8
%59 = bitcast i64* %58 to <4 x i64>*
store <4 x i64> %52, <4 x i64>* %59, align 8
%60 = getelementptr i64, i64* %54, i64 12
%61 = bitcast i64* %60 to <4 x i64>*
store <4 x i64> %53, <4 x i64>* %61, align 8
%index.next = add i64 %index, 16
%62 = icmp eq i64 %index.next, %n.vec
br i1 %62, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i64 %36, %n.vec
br i1 %cmp.n, label %L100.loopexit, label %scalar.ph
...
}
@mcovalt Yes, Julia relies on LLVM to enable SIMD instructions.