Hi all, I’m trying to match (or ideally beat) the performance of a Python/C library (anonlink) that computes the Sørensen–Dice coefficient, but I’m struggling to get close — except when calling the same C code to carry out the inner loop.

The end goal is an efficient cuda implementation of this rather simple computation, but I figured if I can’t get close to matching the C speed I’m probably missing something fundamental.

My fastest Julia version is ~20 times slower than C speed:

```
using StatsBase
using SparseArrays
# Count the set bits of a single packed bit vector.
popcount(ba::BitVector) = count(ba)

# Collection method: one popcount per bit vector, returned as a Vector{Int}.
function popcount(bitvectors::Vector{BitVector})
    return map(count, bitvectors)
end
# Element-wise AND of two bit vectors; allocates and returns a new BitVector.
function and(b1::BitVector, b2::BitVector)
    return b1 .& b2
end
"""
    inner_dice_cmp(b1, b2, popcount_b1, popcount_b2)

Sørensen–Dice coefficient of two bit vectors whose popcounts are already
known: `2 |b1 ∧ b2| / (|b1| + |b2|)`.

Counts the intersection directly over the packed 64-bit chunks of the
BitVectors with `count_ones`, instead of materialising `b1 .& b2`. The
original allocated a temporary BitVector per pair, which dominated the
runtime of the pairwise benchmark; this version allocates nothing.
(BitArray guarantees the unused trailing bits of the last chunk are zero,
so ANDing whole chunks is safe. `b1` and `b2` must have equal length —
the same precondition the broadcast in the original imposed.)
"""
function inner_dice_cmp(b1::BitVector, b2::BitVector, popcount_b1::Int64, popcount_b2::Int64)::Float64
    c1 = b1.chunks
    c2 = b2.chunks
    intersection = 0
    @inbounds for k in eachindex(c1, c2)
        intersection += count_ones(c1[k] & c2[k])
    end
    return 2.0 * intersection / (popcount_b1 + popcount_b2)
end
"""
Pairwise Sørensen-Dice Coefficient with thresholding.
"""
function filtered_pairwise_dice_comparison!(results::R, A::V, B::V, threshold::Float64) where {V<:AbstractVector{BitArray{1}}, R<:AbstractArray{Float64, 2}}
popcount_a_i::Int64 = 0
A_popcounts = popcount(A)
B_popcounts = popcount(B)
for i ∈ 1:size(A,1)
b1 = A[i]
popcount_a_i = A_popcounts[i]
for j ∈ 1:size(B,1)
d = inner_dice_cmp(b1, B[j], popcount_a_i, B_popcounts[j])
if d >= threshold
@inbounds results[i, j] = d
end
end
end
return results
end
"""
    benchmark_pairwise(pairwise_function; narrays=5000)

Time `pairwise_function(as, bs)` on two samples of `narrays` random
1024-bit vectors (the `bs` sample is perturbed so matches are imperfect)
and print the comparison rate in millions of comparisons per second.
A warm-up call on a tiny dataset triggers compilation first, so the
timing excludes JIT overhead.
"""
function benchmark_pairwise(pairwise_function; narrays = 5000)
    array_bytes = 128
    function generate_random_bitarrays(narrays, array_bytes)
        bitarrays = [BitArray(rand(Bool, array_bytes * 8)) for i in 1:narrays];
    end
    # Re-randomise `modifications` positions in-place (sampled without
    # replacement); some positions may keep their old value by chance.
    function perturb!(bitarray, modifications=512)
        indices_to_modify = sample(1:length(bitarray), modifications, replace=false, ordered=true)
        bitarray[indices_to_modify] = rand(Bool, modifications)
    end
    population = generate_random_bitarrays(2 * narrays, array_bytes)
    tiny = sample(population, 100)
    as = sample(population, narrays)
    bs = sample(population, narrays)
    for b in bs
        perturb!(b)
    end
    println(size(as), size(as[1]))
    # Call the function with a tiny dataset to ensure it is compiled
    pairwise_function(tiny, tiny)
    time_elapsed = @elapsed results = pairwise_function(as, bs)
    println("Total comparisons: $(length(as)*length(bs))")
    println("time elapsed: ", time_elapsed)
    println("Comparison rate: $(length(as)*length(bs)/(1e6*time_elapsed)) million/second")
end
@info "Benchmarking pure Julia sparse vector implementation"
function f5(a, b)
results = spzeros(Float64, length(a), length(b))
filtered_pairwise_dice_comparison!(results, a, b, 0.95)
return results
end
benchmark_pairwise(f5, narrays=10000);
```

On my machine I get around 9 million comparisons per second from that code, and 100 to 120 million/s with Python+C (`pip install anonlink; python -m anonlink.benchmark`).

Appreciate any hints to speed it up. Early CUDA version is currently running around 800 million/second without data transfer, and 200 million/second including copying (sparse) data back to host.