Iβd do it the following way: First, you pad the relevant matrix sizes to multiples of 64. Then

```
julia> using LinearAlgebra, Random, BenchmarkTools
julia> function bitmulAndTranspose(A, B)
res = falses((size(A,1), size(B,2)))
AT = falses((size(A,2), size(A,1)))
transpose!(AT, A)
bitmul(res, AT, B)
res
end
bitmulAndTranspose (generic function with 1 method)
julia> function bitmul(res, AT, B)
(size(B,1) % 64 == 0 && size(AT,1) == size(B,1) && size(res) == (size(AT,2), size(B,2))) || begin
@show size(AT), size(B), size(res)
throw("dim mismatch")
end
ATc = reshape(AT.chunks, (cld(size(AT, 1), 64), size(AT,2)))
Bc = reshape(B.chunks, (cld(size(B, 1), 64), size(B,2)))
#here you can do blocking / recursion / hilbert-curve / multithreading. Make it cache oblivious and make all cores busy.
@inbounds for i in 1:size(AT, 2)
for j in 1:size(B,2)
r=0
@simd for k = 1:size(ATc, 1)
r += count_ones(ATc[k, i] & Bc[k, j])
end
res[i, j] = isodd(r)
end
end
end
bitmul (generic function with 1 method)
julia> B0=bitrand(64, 3); A0=bitrand(4, 64); reference = isodd.(A0*B0); reference == bitmulAndTranspose(A0,B0)
true
julia> basemul(A,B)=isodd.(A*B);
julia> N=1<<10; B0=bitrand(N,N); A0=bitrand(N,N);
julia> @benchmark basemul($A0,$B0)
BenchmarkTools.Trial: 18 samples with 1 evaluation.
Range (min β¦ max): 285.791 ms β¦ 289.420 ms β GC (min β¦ max): 0.00% β¦ 0.00%
Time (median): 287.569 ms β GC (median): 0.00%
Time (mean Β± Ο): 287.544 ms Β± 1.063 ms β GC (mean Β± Ο): 0.02% Β± 0.04%
β β β β β β β ββ βββ ββ β β
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
286 ms Histogram: frequency by time 289 ms <
Memory estimate: 8.14 MiB, allocs estimate: 9.
julia> @benchmark bitmulAndTranspose($A0, $B0)
BenchmarkTools.Trial: 774 samples with 1 evaluation.
Range (min β¦ max): 6.095 ms β¦ 9.673 ms β GC (min β¦ max): 0.00% β¦ 0.00%
Time (median): 6.431 ms β GC (median): 0.00%
Time (mean Β± Ο): 6.460 ms Β± 226.942 ΞΌs β GC (mean Β± Ο): 0.03% Β± 0.36%
ββββββββ
βββ
ββββββββββ
βββ
ββββββββββββββββ
ββββββββββββββββββββββββββββββ β
6.1 ms Histogram: frequency by time 7.01 ms <
Memory estimate: 256.38 KiB, allocs estimate: 10.
julia> function nobitmulAndTranspose(A, B)
res = falses((size(A,1), size(B,2)))
AT = falses((size(A,2), size(A,1)))
transpose!(AT, A)
#bitmul(res, AT, B)
res
end
julia> @benchmark nobitmulAndTranspose($A0, $B0)
BenchmarkTools.Trial: 8794 samples with 1 evaluation.
Range (min β¦ max): 548.395 ΞΌs β¦ 1.398 ms β GC (min β¦ max): 0.00% β¦ 32.02%
Time (median): 561.562 ΞΌs β GC (median): 0.00%
Time (mean Β± Ο): 567.101 ΞΌs Β± 35.237 ΞΌs β GC (mean Β± Ο): 0.35% Β± 2.80%
β
ββ ββββ
βββββ β
βββββββββββββββββββββββββββ
ββββ
ββββββββ
ββ
ββ
βββ
ββββββββββββββ β
548 ΞΌs Histogram: log(frequency) by time 678 ΞΌs <
Memory estimate: 256.19 KiB, allocs estimate: 6.
julia> function mult2(x ::BitMatrix, y ::BitMatrix) ::BitMatrix
l, m = size(x)
m_, n = size(y)
@assert m==m_
z = falses(l, n)
xt = BitMatrix(x')
colsy = BitVector.(eachcol(y))
colx = falses(m)
@inbounds for i=1:l
colx .= @view xt[:,i]
for k in 1:n
zik = 0
for j in eachindex(colx.chunks)
zik β»= colsy[k].chunks[j] & colx.chunks[j]
end
z[i,k] = count_ones(zik) %2
end
end
return z
end
mult2 (generic function with 1 method)
julia> @benchmark mult2($A0, $B0)
BenchmarkTools.Trial: 365 samples with 1 evaluation.
Range (min β¦ max): 13.225 ms β¦ 15.297 ms β GC (min β¦ max): 0.00% β¦ 7.62%
Time (median): 13.599 ms β GC (median): 0.00%
Time (mean Β± Ο): 13.721 ms Β± 390.461 ΞΌs β GC (mean Β± Ο): 0.84% Β± 2.31%
βββββββββ
βββββ
ββ
βββββββββββββββ
ββββββββββββββββββββββββββββββββββββββ β
13.2 ms Histogram: frequency by time 14.9 ms <
Memory estimate: 4.66 MiB, allocs estimate: 3081.
```

The julia implementation of `transpose!`

for bit-matrices is surprisingly good! I was very close to passing on this question because doing fast bitmatrix transpose is very annoying, but itβs batteries included for that.

PS. Oh, shiny trick in mult2: Accumulate with `xor`

instead of `+count_ones`

, only do the POPCNT at the end.

Alas, thatβs slower on my machine. It should not be slower (popcount is hard to vectorize on x86, the built-in instruction only exists for 64bit words, not for big fat 256 or 512 bit registers). This is very very wrong and indicates that llvm messes up somewhere. You may need to write the inner loop by hand, using SIMD.jl