Try counting the nonzero entries first.
That is, show us the output of
# Count the column pairs (i, j) with i < j of `mat` for which `df(mat[:, i], mat[:, j])`
# is not exactly 1, without storing any of them. The columns are split into one chunk per
# thread and each chunk is processed in its own task.
#
# Returns the tuple `(nnz(mat), number_of_counted_pairs)`.
@views function sparse_distance_count(mat, df=jaccard)
    ncols = size(mat, 2)
    total = Threads.Atomic{Int}(0)
    @sync for cols in chunks(1:ncols; n=Threads.nthreads())
        @spawn begin
            # Task-local tally; merged into the shared total once per task.
            hits = 0
            for i in cols, j in axes(mat, 2)
                # Only the strict upper triangle is visited.
                j > i || continue
                if j % 100_000 == 0
                    @info i, j
                end
                if df(mat[:, i], mat[:, j]) != 1
                    hits += 1
                end
            end
            Threads.atomic_add!(total, hits)
        end
    end
    return nnz(mat), total[]
end
together with information on whether the timing is acceptable to you, and how much RAM you have on your laptop.
You can definitely stream the output to disk. To do that, you simply need to write a file once you have enough data for it. For example, you could do
# Build the output path for part number `part` of the column chunk `cols`.
# The chunk is identified by its first and last column index: interpolating the range
# itself would put a `:` into the file name.
_chunk_filename(outdir, cols, part) =
    joinpath(outdir, "chunk_$(first(cols))-$(last(cols))_$(part).csv")

# Write the (i, j, v) triplets to `filename` via a temporary file that is renamed once the
# write has finished, so an interrupted run does not leave a truncated file under the final
# name (the resume check in `sparse_distance` would otherwise treat it as complete).
function _write_triplets(filename, is, js, vs)
    tmp = filename * ".tmp"
    CSV.write(tmp, (; i=is, j=js, v=vs))
    mv(tmp, filename; force=true)
    return nothing
end

# Compute `1 - dist_func(mat[:, i], mat[:, j])` for all column pairs i < j whose distance is
# not exactly 1 and stream the results to CSV files with columns `i`, `j`, `v`.
#
# Arguments
#   mat        matrix whose columns are compared (column indices are taken as 1:size(mat, 2))
#   dist_func  distance function called on two column views
# Keywords
#   outdir     directory receiving the files; created if missing
#   maxrows    a part file is written as soon as more than this many rows are buffered
#
# The columns are split into chunks of roughly 1000; each chunk runs in its own task and
# writes `chunk_<first>-<last>_<part>.csv`. A chunk whose part 1 already exists is skipped,
# which lets an interrupted run be resumed.
# NOTE: the resume check only looks at part 1. A chunk that was interrupted after writing
# part 1 but before its last part is still skipped; delete its files to recompute it.
#
# Returns `nothing`.
@views function sparse_distance(mat, dist_func=jaccard; outdir="dmats", maxrows=10_000_000)
    ncols = size(mat, 2)
    ncols == 0 && return nothing
    mkpath(outdir)
    # At least one chunk, even for fewer than 500 columns (where rounding would give 0).
    nchunks = max(1, round(Int, ncols / 1000))
    @sync for ic in chunks(1:ncols; n=nchunks)
        isfile(_chunk_filename(outdir, ic, 1)) && continue
        @spawn begin
            # All mutable state lives inside the task, so nothing is shared between tasks.
            part = 1
            is = Int[]; js = Int[]; vs = Float64[]
            for i in ic
                # Start at i + 1: only pairs with i < j are wanted.
                for j in (i + 1):ncols
                    j % 100_000 == 0 && @info i, j
                    d = dist_func(mat[:, i], mat[:, j])
                    d == 1 && continue
                    push!(is, i)
                    push!(js, j)
                    push!(vs, 1 - d)
                    if length(is) > maxrows
                        _write_triplets(_chunk_filename(outdir, ic, part), is, js, vs)
                        empty!(is); empty!(js); empty!(vs)
                        part += 1
                    end
                end
            end
            isempty(is) || _write_triplets(_chunk_filename(outdir, ic, part), is, js, vs)
        end
    end
    return nothing
end
Even better would be to use a streaming interface of CSV.jl, but I don’t think that exists.
Not quite; per Distances.jl Readme.md, it’s supposed to compute 1 - sum(min(x, y)) / sum(max(x, y)); and it actually does compute that if all entries are nonnegative (they are nonnegative, right?).
Jaccard should be exactly one if and only if the two vectors have disjoint support. By my superficial reading of Distances.jl, that should also hold in floating point, and the compiler should not have leeway to break that property by re-associating; so exact comparison to 1 should be correct, unless somebody sneaked in a dirty `@simd` or `@fastmath` or something like that. (Floating point does guarantee that `x - x == 0.0` for finite `x` and that `x + 0 == x` exactly, and that’s all we should need.)