I have made a few minor changes in your code here:

```
const SEQ_THRESH = 1 << 9
@inline function partition!(A, pivot, left,right)
@inbounds while left <= right
while A[left] < pivot
left += 1
end
while A[right] > pivot
right -= 1
end
if left <= right
A[left], A[right] = A[right], A[left]
left += 1
right -= 1
end
end
return (left,right)
end
function quicksort!(A, i=1, j=length(A))
if j > i
left, right = partition!(A, A[(j+i) >>> 1], i, j)
quicksort!(A,i,right)
quicksort!(A,left,j)
end
return
end
function parallel_quicksort!(A,i=1,j=length(A))
if j-i <= SEQ_THRESH
quicksort!(A, i, j)
return
end
left, right = partition!(A, A[(j+i) >>> 1], i, j)
t = Threads.@spawn parallel_quicksort!(A,i,right)
parallel_quicksort!(A,left,j)
wait(t)
return
end
function test(n)
b = rand(n)
for i = 1:5
print("Parallel : ")
a1 = copy(b); @time parallel_quicksort!(a1);
print("Sequential: ")
a2 = copy(b); @time quicksort!(a2);
println("")
@assert a1 == sort(a1)
@assert a1 == a2
end
end
test(2^20)
```

And I do get about 2.33x speedup on a 2-core 2013 MacBookPro with 16GB RAM.

```
~/p/quicksort.jl> julia -tauto --project=. test_qs.jl
Parallel : 0.057559 seconds (14.31 k allocations: 1.012 MiB, 7.69% compilation time)
Sequential: 0.114027 seconds
Parallel : 0.047487 seconds (13.52 k allocations: 992.812 KiB)
Sequential: 0.106023 seconds
Parallel : 0.047373 seconds (13.52 k allocations: 992.906 KiB)
Sequential: 0.106490 seconds
Parallel : 0.046449 seconds (13.48 k allocations: 991.656 KiB)
Sequential: 0.109551 seconds
Parallel : 0.045878 seconds (13.47 k allocations: 991.375 KiB)
Sequential: 0.107051 seconds
```

Perhaps if you were to undo my changes one by one, it would be very interesting to find out which aspect of your initial code caused the slowdown, (probably some type instability). Somebody wiser to help here? @Elrod @stevengj perhaps.

CHANGES:

- parallel and sequential
`quicksort()`

return nothing instead of `A`

- abstracted out
`partition!()`

and made it inline
- asserted external
`while`

block as inbounds
- timing from a
`test`

function, not the REPL