So here is the cache-less variant (posted mainly so that I can grab it again from this thread when needed):
# Division-free binomial(n, k) for a compile-time k: unrolls the falling
# factorial n*(n-1)*...*(n-k+1) and divides once by the constant k!.
# Because k! is a literal in the generated code, LLVM can strength-reduce
# the division to a multiply-and-shift.  NOTE(review): the intermediate
# product can overflow Int64 even when binomial(n, k) itself fits; callers
# are expected to validate the range (see bincache in this snippet).
@inline function vbinomial(n::Int64, ::Val{k}) where {k}
    if @generated
        # Build the unrolled product at code-generation time.
        body = Expr(:block)
        stmts = body.args
        push!(stmts, :(acc = 1))
        for j in 0:(k - 1)
            push!(stmts, :(acc *= n - $j))
        end
        kfact = factorial(k)  # constant folded into the generated code
        push!(stmts, :(return div(acc, $kfact)))
        return body
    else
        # Fallback when the compiler chooses not to run the generator.
        return binomial(n, k)
    end
end
"""
    lazy_bincacherow{k}

One virtual "row" of the binomial cache: an `AbstractVector{Int}` that stores
nothing but its length; its `getindex` (defined separately in this snippet)
computes each entry on demand via `vbinomial(i, Val(k))`.
NOTE(review): lowercase type names break the Julia `UpperCamelCase` convention,
kept unchanged so existing callers keep working.
"""
struct lazy_bincacherow{k} <: AbstractVector{Int}
    len::Int  # number of virtual entries
end
# AbstractArray interface for lazy_bincacherow: only the length is stored,
# entries are computed on the fly by vbinomial.
Base.size(bc::lazy_bincacherow) = (bc.len,)
Base.length(bc::lazy_bincacherow) = bc.len
Base.getindex(::lazy_bincacherow{k}, i::Int) where {k} = vbinomial(i, Val(k))
"""
    lazy_bincache

Top level of the lazy binomial cache.  Indexing with `k` (via the separate
`getindex` method in this snippet) yields a `lazy_bincacherow{k}`, so no table
is ever materialized.
"""
struct lazy_bincache
    len::Int  # length handed to every row
end
# Indexing the cache with k wraps the stored length in a row type whose type
# parameter carries k to vbinomial at compile time.
@inline function Base.getindex(bc::lazy_bincache, k)
    return lazy_bincacherow{k}(bc.len)
end
"""
    bincache(max_k, len)

Construct a `lazy_bincache` after verifying that `vbinomial` agrees with
`binomial` at the most overflow-prone point `(len, max_k)`.  The division-free
product inside `vbinomial` can overflow `Int64` even when the binomial itself
fits, so a mismatch here means the lazy cache would silently return garbage.

Throws `OverflowError` when the check fails.  (`binomial(len, max_k)` itself
throws if even the true result does not fit in an `Int`.)
"""
function bincache(max_k, len)
    max_entry = binomial(len, max_k)
    alt = vbinomial(len, Val(max_k))
    # Bug fix: `OverflowError` has no zero-argument constructor, so the
    # original `throw(OverflowError())` raised a MethodError instead of the
    # intended OverflowError.  Construct it with a message.
    alt == max_entry ||
        throw(OverflowError("vbinomial($len, Val($max_k)) overflows Int64; reduce len or max_k"))
    return lazy_bincache(len)
end
With that:
julia> const bnc2_ = bincache(5, 1000);
julia> ac2 = i->idxtup(bnc2_, i, Val(4));
julia> (ac2.(r)) == ( ac.(r))
true
julia> @btime ac.($r); @btime ac2.($r);
2.005 ms (4 allocations: 312.61 KiB)
2.370 ms (4 allocations: 312.61 KiB)
With the optimized binomial, there is no significant difference in speed, regardless of whether we cache or not. It seems that I had failed to appreciate that the relevant parts of the cache mostly stay in the L1 cache (the `vbinomial` call is only a handful of cycles). This works because there are no integer divisions: LLVM turns `div(n, k)` into a multiplication and a shift when `k` is known at compile time.