I need to shuffle entries in selected columns of a large, sparse matrix `X` with dimensions `M x N`. I have written a function that does this in-place, as I need to perform the shuffling repeatedly on the same data.
```julia
function shuffle_sparse_columns!(X::SparseMatrixCSC, columns)
    M, N = size(X)
    for col in columns
        colptr_l = X.colptr[col]
        colptr_u = X.colptr[col+1] - 1
        rowvals = @view X.rowval[colptr_l:colptr_u]
        nzvals = @view X.nzval[colptr_l:colptr_u]
        sample!(1:M, rowvals; replace=false, ordered=true)
        shuffle!(nzvals)
    end
end
```
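For context, a typical call pattern looks roughly like the following (the size and density are placeholders only); the same matrix is shuffled again in-place on every call:

```julia
using Random, SparseArrays, StatsBase

Random.seed!(0)
X = sprand(5_000, 10_000, 0.01)   # placeholder size and density
cols = collect(1:size(X, 2))      # shuffle every column

shuffle_sparse_columns!(X, cols)  # in-place shuffle
shuffle_sparse_columns!(X, cols)  # repeated on the same data
```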
This is significantly faster than slicing the array with numpy/scipy, but it trails behind scikit-learn's implementation, `shuffle`, as the size of `X` increases (despite the fact that `X` is automatically converted to CSR format and manually converted back to CSC).
Below is a (hidden) MWE and some benchmark results.
Load packages, define functions and benchmark
```julia
using BenchmarkTools
using PyCall
using Random
using SparseArrays
using StatsBase

"""
    shuffle_columns!(X, cols)

Entries in each of the provided columns in the array are shuffled in-place.
"""
function shuffle_columns!(X, cols)
    for col in cols
        shuffle!(@view X[:, col])
    end
end

"""
    shuffle_sparse_columns!(X::SparseMatrixCSC, columns)

The nonzero entries in each of the provided columns in the array are shuffled in-place.
"""
function shuffle_sparse_columns!(X::SparseMatrixCSC, columns)
    M, N = size(X)
    for col in columns
        # Range of stored (nonzero) entries for this column
        colptr_l = X.colptr[col]
        colptr_u = X.colptr[col+1] - 1
        rowvals = @view X.rowval[colptr_l:colptr_u]
        nzvals = @view X.nzval[colptr_l:colptr_u]
        # Draw new, sorted row indices and shuffle the stored values
        sample!(1:M, rowvals; replace=false, ordered=true)
        shuffle!(nzvals)
    end
end

py"""
import numpy as np
from sklearn.utils import shuffle
from scipy.sparse import csc_matrix, csr_matrix

def sklearn_shuffle_and_convert_csc(X):
    X_shuffled = shuffle(X)
    return csc_matrix(X_shuffled)

def py_shuffle_col(X, columns):
    (M, N) = X.shape
    for col in columns:
        colptr_l = X.indptr[col]
        colptr_u = X.indptr[col+1]
        X.indices[colptr_l:colptr_u] = np.random.choice(M, colptr_u - colptr_l, replace=False)
        X.indices[colptr_l:colptr_u].sort()
        np.random.shuffle(X.data[colptr_l:colptr_u])
    return X
"""

function bench(M, N, p, cols=nothing)
    # Benchmark julia
    Random.seed!(0);
    X = sprand(M, N, p)
    println("Shuffle each column of a CSC matrix in Julia")
    @btime shuffle_sparse_columns!($X, $cols)
    # Benchmark sklearn (CSC -> CSR -> CSC)
    Random.seed!(0);
    X = sprand(M, N, p)
    py_sklearn_shuffle_and_convert = py"sklearn_shuffle_and_convert_csc"
    X_py_csc = py"csc_matrix"(X)
    println("Shuffle CSC with sklearn (to CSR) and convert to CSC")
    @btime $py_sklearn_shuffle_and_convert($X_py_csc)
    # Benchmark sklearn (CSC -> CSR)
    py_sklearn_shuffle = py"shuffle"
    X_py_csc = py"csc_matrix"(X)  # Restore original matrix
    println("Shuffle CSC with sklearn")
    @btime $py_sklearn_shuffle($X_py_csc)
    # Benchmark sklearn (CSR)
    X_py_csr = py"csr_matrix"(X)
    println("Shuffle CSR with sklearn")
    @btime $py_sklearn_shuffle($X_py_csr)
    # Benchmark numpy slicing (CSC) of rowval and nzval with 0-indexed columns
    py_shuffle_col = py"py_shuffle_col"
    X_py_csc = py"csc_matrix"(X)
    println("Shuffle CSC with numpy slicing")
    @btime $py_shuffle_col($X_py_csc, $(cols .- 1))
    # Dense shuffle for comparison
    Xdense = Array(X)
    println("Dense shuffle")
    @btime shuffle_columns!($Xdense, $cols)
end
```
Small `X`
```julia
M, N, p = 50, 10, 0.01;
Random.seed!(0);
X = sprand(M, N, p);
bench(M, N, p, [1:N...])
```
### Prints out
```
Shuffle each column of a CSC matrix in Julia
  444.879 ns (0 allocations: 0 bytes)
Shuffle CSC with sklearn (to CSR) and convert to CSC
  354.833 μs (3 allocations: 144 bytes)
Shuffle CSC with sklearn
  251.791 μs (3 allocations: 144 bytes)
Shuffle CSR with sklearn
  174.165 μs (3 allocations: 144 bytes)
Shuffle CSC with numpy slicing
  203.838 μs (7 allocations: 448 bytes)
Dense shuffle
  5.475 μs (0 allocations: 0 bytes)
```
Large `X`
```julia
M, N, p = 5000, 10000, 0.01;
Random.seed!(0);
X = sprand(M, N, p);
bench(M, N, p, [1:N...])
```
### Prints out
```
Shuffle each column of a CSC matrix in Julia
  80.976 ms (0 allocations: 0 bytes)
Shuffle CSC with sklearn (to CSR) and convert to CSC
  12.371 ms (3 allocations: 144 bytes)
Shuffle CSC with sklearn
  5.398 ms (3 allocations: 144 bytes)
Shuffle CSR with sklearn
  1.432 ms (3 allocations: 144 bytes)
Shuffle CSC with numpy slicing
  1.020 s (7 allocations: 448 bytes)
Dense shuffle
  611.732 ms (0 allocations: 0 bytes)
```
I have some questions:

- Is there a faster way to `sample!` from the row indices `1:M` than is done in `shuffle_sparse_columns!`?
- Is the CSR format more efficient for `M < N`? If so, are there any suggestions on how to adapt `shuffle_sparse_columns!` for a custom `SparseMatrixCSR`? (A rough sketch of what I have in mind is below this list.)
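To make the second question concrete, here is a rough, untested sketch of the kind of adaptation I picture. The `SparseMatrixCSR` struct and its field names are hypothetical (the `SparseArrays` stdlib only provides `SparseMatrixCSC`), and since the contiguous segments in CSR are rows, this shuffles the nonzeros within each *row* rather than within each column:

```julia
using Random
using StatsBase

# Hypothetical minimal CSR container; the field names are my own.
struct SparseMatrixCSR{Tv,Ti<:Integer}
    m::Int               # number of rows
    n::Int               # number of columns
    rowptr::Vector{Ti}   # row j's nonzeros live at rowptr[j]:(rowptr[j+1]-1)
    colval::Vector{Ti}   # column indices of the stored values
    nzval::Vector{Tv}    # stored values
end

# Same idea as shuffle_sparse_columns!, but per row instead of per column.
function shuffle_sparse_rows!(X::SparseMatrixCSR, rows)
    for row in rows
        rowptr_l = X.rowptr[row]
        rowptr_u = X.rowptr[row+1] - 1
        colvals = @view X.colval[rowptr_l:rowptr_u]
        nzvals = @view X.nzval[rowptr_l:rowptr_u]
        # Draw new, sorted column indices and shuffle the stored values
        sample!(1:X.n, colvals; replace=false, ordered=true)
        shuffle!(nzvals)
    end
    return X
end
```

Whether shuffling within *columns* can be made fast in a CSR layout (where a column's entries are not contiguous) is exactly what I am unsure about.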
Slightly unrelated:

- Is there a suitable way to benchmark memory usage/allocations with PyCall?

Any suggestions on how to get the performance comparable with scikit-learn are appreciated.