Why is this Julia code considerably slower than Matlab

Using Yeppp and devectorization I got it 3 x faster than my initial implementation and almost 5 x faster when parallelized:

@benchmark Testing.performance_test()
BenchmarkTools.Trial: 
  memory estimate:  534.06 MiB
  allocs estimate:  69
  --------------
  minimum time:     1.241 s (9.83% GC)
  median time:      1.246 s (9.52% GC)
  mean time:        1.289 s (12.10% GC)
  maximum time:     1.421 s (19.32% GC)
  --------------
  samples:          4
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

@benchmark Testing.performance_test_parallel()
BenchmarkTools.Trial: 
  memory estimate:  198.43 MiB
  allocs estimate:  2075
  --------------
  minimum time:     417.643 ms (26.01% GC)
  median time:      893.489 ms (11.51% GC)
  mean time:        826.470 ms (12.60% GC)
  maximum time:     921.135 ms (11.17% GC)
  --------------
  samples:          7
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

Here is the code:

module Testing

  using Yeppp;

  function performance_test()
    N = 2_000_000;
    range = collect(1:N);

    steering_vectors = complex(ones(4,11), ones(4,11));
    carrier_signal = zeros(Complex{Float64}, N);
    cos_sig, sin_sig = zeros(N), zeros(N);
    sum_signal = zeros(Complex{Float64}, 4, length(range));
    for i = 1:11
        arg = (2π * 1.023e6 / 4e6) .* range .+ (40π/180);
        Yeppp.sin!(sin_sig, arg);
        Yeppp.cos!(cos_sig, arg);
        carrier_signal .= complex.(cos_sig, sin_sig);
        @inbounds for j = 1:4, k = 1:N ; sum_signal[j, k] += steering_vectors[j, i] * carrier_signal[k]; end
    end
    return sum_signal;
  end

  function performance_test_parallel()
    N = 2_000_000;
    range = collect(1:N);

    steering_vectors = complex(ones(4,11), ones(4,11));
    carrier_signal = zeros(Complex{Float64}, N);
    cos_sig, sin_sig = zeros(N), zeros(N);
    sum_signal_temp = zeros(Complex{Float64}, 4, length(range));
    sum_signal = convert(SharedArray, sum_signal_temp);
    @parallel for i = 1:11
        arg = (2π * 1.023e6 / 4e6) .* range .+ (40π/180);
        Yeppp.sin!(sin_sig, arg);
        Yeppp.cos!(cos_sig, arg);
        carrier_signal .= complex.(cos_sig, sin_sig);
        @inbounds for j = 1:4, k = 1:N ; sum_signal[j, k] += steering_vectors[j, i] * carrier_signal[k]; end
    end
    return sum_signal;
  end
end

Yeppp seems not that fast as Apple Accelerate. Without Yeppp the unparallelized code takes 2.2s instead of 1.2s.