Hello,

The problem with the BitMatrix approach is that much of the execution time saved in `closure`

is instead spent on BitMatrix creation, right? You can still run a fast closure using the SparseCSC matrix directly.

If you do this, together with what I mentioned in the other post about quickly computing the sum over the columns, you should get a decent speedup. If there is no other place where BitMatrix is used, you can probably avoid converting from SparseCSC to BitMatrix entirely. I can open a PR with both ideas to the package.

To make it easier to iterate and improve the package, it might benefit from a script containing automatically generated data and `@btime`/`@benchmark` runs of the important function calls.

```
using SparseArrays, BenchmarkTools
function closure(matrix::AbstractMatrix, itemset::Vector{Int})
    # Rows in which every column of `itemset` is true.
    supporting = vec(all(view(matrix, :, itemset), dims=2))
    # The closure: columns that are true in all of those rows.
    closed_cols = all(view(matrix, supporting, :), dims=1)
    return findall(vec(closed_cols))
end
"""
    find_complete_rows(X_csc, col_indices)

Return the indices of the rows of `X_csc` that have a stored entry in every
column listed in `col_indices`. `X_csc` is a `SparseMatrixCSC` (or a view of
one, in which case `col_indices` refer to the parent's columns).
"""
function find_complete_rows(X_csc, col_indices)
    # Views of a SparseMatrixCSC don't expose colptr/rowval directly,
    # so unwrap to the parent matrix.
    if hasproperty(X_csc, :parent)
        X_csc = X_csc.parent
    end
    n_rows = size(X_csc, 1)
    n_cols = length(col_indices)
    # For each row, count in how many of the requested columns it is stored.
    counts = zeros(Int, n_rows)
    # Iterate col_indices directly — the original `col_indices[1:end]`
    # allocated a full copy of the index vector.
    for c in col_indices
        for i in X_csc.colptr[c]:(X_csc.colptr[c + 1] - 1)
            counts[X_csc.rowval[i]] += 1
        end
    end
    # A row is "complete" when it appears in all requested columns.
    # findall returns a type-stable Vector{Int}; the original built an
    # untyped `[]` (Vector{Any}) with a manual push! loop.
    return findall(==(n_cols), counts)
end
function closure2(m, cols, itemset)
    # Stored row indices of column `c` (sorted, per the CSC layout).
    storedrows(c) = @view rowvals(m)[nzrange(m, c)]
    # Rows present in every column of the itemset.
    common = reduce(intersect, (Set(storedrows(c)) for c in cols[itemset]))
    # Columns of `cols` that contain all of those rows.
    return filter(c -> all(insorted(r, storedrows(c)) for r in common), cols)
end
function closure_optimized(matrix::AbstractMatrix, itemset::Vector{Int})
    # Rows supporting the itemset, found via the sparse column counters
    # instead of a dense all() scan.
    supporting = find_complete_rows(matrix, itemset)
    # Columns true in all supporting rows form the closure.
    return findall(vec(all(view(matrix, supporting, :), dims=1)))
end
# Benchmark fixture: a large sparse boolean matrix and a 30-column view of it.
m = sprand(Bool, 100_000, 100, 0.15)
v = view(m, :, 1:30)
col_indices = [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12];

# Build the BitMatrix reference BEFORE the asserts: the original used `b`
# here but only created it later inside `@btime b = BitMatrix(v)`, which
# does not leave `b` bound for these checks.
b = BitMatrix(v)
@assert closure_optimized(SparseMatrixCSC(v), col_indices) == closure(b, col_indices)
@assert closure_optimized(v, col_indices) == closure(b, col_indices)

# Interpolate globals with `$` so @btime measures the call, not global lookup.
println("\nTime closure with view of SparseMatrixCSC")
@btime closure($v, $col_indices);
println("\nTime closure with view BitMatrix")
@btime BitMatrix($v);
@btime closure($b, $col_indices)
println("\nTime closure2")
# Fixed: the original called closure_optimized(m, 1:30, col_indices), a
# 3-argument call to a 2-argument function; this line is meant to time closure2.
@btime closure2($m, 1:30, $col_indices);
println("\nTime closure_optimized with SparseMatrixCSC")
# we want to avoid instanciating SparseMatrixCSC(v)
@btime closure_optimized(SparseMatrixCSC($v), $col_indices);
println("\nTime closure_optimized with view of SparseMatrixCSC")
@btime closure_optimized($v, $col_indices);
```

This will print

```
Time closure with view of SparseMatrixCSC
97.138 ms (17 allocations: 880.62 KiB)
Time closure with view BitMatrix
82.914 ms (3 allocations: 366.34 KiB)
3.007 ms (14 allocations: 13.28 KiB)
Time closure2
2.066 ms (151 allocations: 3.17 MiB)
Time closure_optimized with SparseMatrixCSC
343.916 μs (16 allocations: 4.63 MiB)
Time closure_optimized with view of SparseMatrixCSC
234.833 μs (10 allocations: 782.50 KiB)
```

This makes the closure much faster: the total time drops from ~85 ms (conversion to BitMatrix plus the `closure` call) to 234.833 μs — roughly a 362x speedup!