Does Julia use SIMD instructions for broadcast operations?

@DNF I get the output below with Julia 0.6 compiled from source (for a Skylake CPU). With the generic binaries, however, no vectorization happens. My guess is that only recent instruction sets give a performance improvement in this case, and those are not enabled in generic builds. That’s still surprising: I would have expected that at least SSE/SSE2 instructions could be used, since they are available on all x86_64 CPUs…

julia> function square!(c::Array, x::Array)
           # Store the element-wise square of `x` into `c`, in place.
           # Both arrays must have the same size, otherwise the assertion fails.
           @assert size(c) == size(x)
           # `@simd` permits the compiler to reorder loop iterations, which
           # lets LLVM's loop vectorizer emit SIMD instructions when the
           # target CPU supports suitable ones.
           @simd for i in eachindex(x)
               # `@inbounds` elides bounds checks; the size assertion above
               # is what makes indexing `c` with the indices of `x` valid.
               @inbounds c[i] = x[i] * x[i]
           end
       end
square! (generic function with 1 method)

julia> X = rand(Int, 1000);

julia> Y = rand(Int, 1000);

julia> @code_llvm square!(X, Y)

define void @"julia_square!_68142"(i8**, i8**) #0 !dbg !5 {
...
min.iters.checked:                                ; preds = %if4.lr.ph.us11
  %n.vec = and i64 %36, -16
  %cmp.zero = icmp eq i64 %n.vec, 0
  br i1 %cmp.zero, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %min.iters.checked
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %42 = getelementptr i64, i64* %40, i64 %index
  %43 = bitcast i64* %42 to <4 x i64>*
  %wide.load = load <4 x i64>, <4 x i64>* %43, align 8
  %44 = getelementptr i64, i64* %42, i64 4
  %45 = bitcast i64* %44 to <4 x i64>*
  %wide.load20 = load <4 x i64>, <4 x i64>* %45, align 8
  %46 = getelementptr i64, i64* %42, i64 8
  %47 = bitcast i64* %46 to <4 x i64>*
  %wide.load21 = load <4 x i64>, <4 x i64>* %47, align 8
  %48 = getelementptr i64, i64* %42, i64 12
  %49 = bitcast i64* %48 to <4 x i64>*
  %wide.load22 = load <4 x i64>, <4 x i64>* %49, align 8
  %50 = mul <4 x i64> %wide.load, %wide.load
  %51 = mul <4 x i64> %wide.load20, %wide.load20
  %52 = mul <4 x i64> %wide.load21, %wide.load21
  %53 = mul <4 x i64> %wide.load22, %wide.load22
  %54 = getelementptr i64, i64* %41, i64 %index
  %55 = bitcast i64* %54 to <4 x i64>*
  store <4 x i64> %50, <4 x i64>* %55, align 8
  %56 = getelementptr i64, i64* %54, i64 4
  %57 = bitcast i64* %56 to <4 x i64>*
  store <4 x i64> %51, <4 x i64>* %57, align 8
  %58 = getelementptr i64, i64* %54, i64 8
  %59 = bitcast i64* %58 to <4 x i64>*
  store <4 x i64> %52, <4 x i64>* %59, align 8
  %60 = getelementptr i64, i64* %54, i64 12
  %61 = bitcast i64* %60 to <4 x i64>*
  store <4 x i64> %53, <4 x i64>* %61, align 8
  %index.next = add i64 %index, 16
  %62 = icmp eq i64 %index.next, %n.vec
  br i1 %62, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %36, %n.vec
  br i1 %cmp.n, label %L100.loopexit, label %scalar.ph
...
}

@mcovalt Yes, Julia relies on LLVM to enable SIMD instructions.