I ran an experiment, and the two-bit-array approach was indeed very fast.

using Missings
"""
    TFM(tf, ms)

Three-valued (`true` / `false` / `missing`) array stored as two bit arrays of
identical shape.

- `tf`: the `Bool` value of each element.
- `ms`: mask whose set bits mark the elements that are `missing`.

Both arguments are converted to `BitArray{N}`, so plain `Array{Bool}` inputs are
accepted as well as `BitArray`s. Throws `DimensionMismatch` when the two arrays
differ in size.
"""
struct TFM{N}
    # `BitArray{N}` is concrete (a bare `BitArray` is not), which keeps field
    # access type-stable.
    tf::BitArray{N}
    ms::BitArray{N}

    function TFM{N}(tf, ms) where {N}
        tfbits = convert(BitArray{N}, tf)
        msbits = convert(BitArray{N}, ms)
        # The two arrays are read position-by-position, so they must line up.
        size(tfbits) == size(msbits) || throw(DimensionMismatch(
            "tf has size $(size(tfbits)) but ms has size $(size(msbits))"))
        return new{N}(tfbits, msbits)
    end
end

# Infer `N` from the arguments so that `TFM(tf, ms)` works without spelling it out.
TFM(tf::AbstractArray{<:Any,N}, ms::AbstractArray{<:Any,N}) where {N} = TFM{N}(tf, ms)
"""
    fcount_bitarray2(tfm1::TFM)

Tally the values encoded by the two bit arrays of `tfm1`.

Returns a `Dict{Union{Bool, Missing}, Int}` with one entry each for `true`,
`missing` and `false`.
"""
function fcount_bitarray2(tfm1::TFM)
    nmissing = count(tfm1.ms)
    # A set `tf` bit only counts as `true` where the element is not flagged
    # missing, so remove the overlap from the raw number of set bits.
    ntrue = count(tfm1.tf) - count(tfm1.tf .& tfm1.ms)
    # Whatever is neither missing nor true must be false.
    nfalse = length(tfm1.ms) - ntrue - nmissing
    return Dict{Union{Bool, Missing}, Int}(
        true => ntrue,
        missing => nmissing,
        false => nfalse
    )
end
# create the data: `x` is the three-valued source vector, `y` / `ms` its
# two-bit-array encoding
a = [missing, true, false]
x = rand(a, 200_000_000)
# Build each mask once and reuse it; every pass over `x` touches 200M elements.
ms = ismissing.(x)
notms = .!ms
# Uninitialised Bool vector with the same length as `x`; the two masked
# assignments below together write every slot.
y = similar(x, Bool)
y[ms] = rand(Bool, sum(ms)) # arbitrary filler at the missing positions
y[notms] = x[notms]
tfm1 = TFM(y, ms)
## run the below benchmarks on next.juliabox.com
## (the numbers in the trailing comments are the timings recorded there)
## gc() is called before each measurement, presumably so that garbage left by the
## previous step is collected outside the timed call.
## NOTE(review): `fcount2` and `my_count` are not defined in this snippet; they must
## already be loaded (presumably from earlier in the discussion) — confirm before running.
## NOTE(review): each `@time` below times a single call, so a first run may include
## compilation time.
gc()
@time res4 = fcount_bitarray2(tfm1) # 0.044 on next.juliabox.com
gc()
@time res2 = fcount2(x) # 4.28
gc()
@time res3 = my_count(x) # 3.24
gc()
# sanity check: all three implementations must report equal counts
res3 == res2 && res2 == res4