In another discussion,
bizarre behavior that requires further attention and explanation came up.
A recursive textbook implementation of quicksort makes no memory allocations as it modifies its argument input vector. However, the straightforward parallelization by @spawn
ing the first recursive call results in a large number and volume of memory allocations. As a result, the parallel code is slower than the sequential. The above two functions are referred to as v0_quicksort!()
and v0_p_quicksort!()
in the attached source code.
But if we remove as a separate function the partition!()
code, nothing changes with the sequential code. But now the parallel version makes substantially fewer memory allocations, and it is faster 2.3x times on a two-core i5 than the sequential code. These are the v1_quicksort!()
and v1_p_quicksort!()
, respectively.
Semantically as far as I can tell, the v0
and the v1
codes are identical. So why does the parallel version v0
allocate a few GiB of memory while the parallel v1
version only allocates a few MiB of memory for the same input?
Here is the source:
const SEQ_THRESH = 1 << 9
@inline function partition!(A, pivot, left, right)
@inbounds while left <= right
while A[left] < pivot
left += 1
end
while A[right] > pivot
right -= 1
end
if left <= right
A[left], A[right] = A[right], A[left]
left += 1
right -= 1
end
end
return (left,right)
end
function v0_quicksort!(A, i=1, j=length(A))
if j > i
pivot, left, right = A[(j+i) >>> 1], i, j
@inbounds while left <= right
while A[left] < pivot
left += 1
end
while A[right] > pivot
right -= 1
end
if left <= right
A[left], A[right] = A[right], A[left]
left += 1
right -= 1
end
end
v0_quicksort!(A,i,right)
v0_quicksort!(A,left,j)
end
return
end
function v0_p_quicksort!(A, i=1, j=length(A))
if j-i <= SEQ_THRESH
v0_quicksort!(A, i, j)
else
pivot, left, right = A[(j+i) >>> 1], i, j
@inbounds while left <= right
while A[left] < pivot
left += 1
end
while A[right] > pivot
right -= 1
end
if left <= right
A[left], A[right] = A[right], A[left]
left += 1
right -= 1
end
end
t = Threads.@spawn v0_p_quicksort!(A, i, right)
v0_p_quicksort!(A, left, j)
wait(t)
end
return
end
function v1_quicksort!(A, i=1, j=length(A))
if j > i
left, right = partition!(A, A[(j+i) >>> 1], i, j)
v1_quicksort!(A,i,right)
v1_quicksort!(A,left,j)
end
return
end
function v1_p_quicksort!(A, i=1, j=length(A))
if j-i <= SEQ_THRESH
v1_quicksort!(A, i, j)
else
left, right = partition!(A, A[(j+i) >>> 1], i, j)
t = Threads.@spawn v1_p_quicksort!(A, i, right)
v1_p_quicksort!(A, left, j)
wait(t)
end
return
end
function test(n)
b = rand(n)
g = sort(b)
for i = 1:5
print("v0 Parallel : ")
a1 = copy(b); @time v0_p_quicksort!(a1);
@assert a1 == g
print("v0 Sequential: ")
a2 = copy(b); @time v0_quicksort!(a2);
@assert a2 == g
print("v1 Parallel : ")
a1 = copy(b); @time v1_p_quicksort!(a1);
@assert a1 == g
print("v1 Sequential: ")
a2 = copy(b); @time v1_quicksort!(a2);
@assert a2 == g
println("")
end
end
test(2^22)
And here is how I run it and the output I get:
~/p/quicksort.jl> julia -tauto --project=. test_qs.jl
v0 Parallel : 3.924779 seconds (176.85 M allocations: 2.642 GiB, 19.06% gc time, 0.68% compilation time)
v0 Sequential: 0.500592 seconds
v1 Parallel : 0.217315 seconds (109.48 k allocations: 7.841 MiB, 2.58% compilation time)
v1 Sequential: 0.491915 seconds
v0 Parallel : 3.943327 seconds (176.83 M allocations: 2.641 GiB, 20.12% gc time)
v0 Sequential: 0.489989 seconds
v1 Parallel : 0.216695 seconds (108.72 k allocations: 7.800 MiB)
v1 Sequential: 0.490239 seconds
v0 Parallel : 3.732871 seconds (176.83 M allocations: 2.641 GiB, 15.50% gc time)
v0 Sequential: 0.497291 seconds
v1 Parallel : 0.214354 seconds (108.73 k allocations: 7.800 MiB)
v1 Sequential: 0.490226 seconds
v0 Parallel : 4.121204 seconds (176.83 M allocations: 2.641 GiB, 21.79% gc time)
v0 Sequential: 0.500073 seconds
v1 Parallel : 0.254187 seconds (108.78 k allocations: 7.802 MiB)
v1 Sequential: 0.733544 seconds
v0 Parallel : 4.161715 seconds (176.83 M allocations: 2.641 GiB, 15.43% gc time)
v0 Sequential: 0.507738 seconds
v1 Parallel : 0.277322 seconds (108.72 k allocations: 7.800 MiB)
v1 Sequential: 0.659662 seconds