Here is another implementation, using a different algorithm, which potentially could be better.
The idea is to use the generation of tagmap
, which makes the per tag vectors of ids sorted. So by walking the sorted vectors of all a post’s tags simultaneously, one can infer the amount of tag overlap.
Explanation is a little tricky, but here is the code:
function related_new(posts, topn = 5)
tagmap = Dict{String,Vector{Int}}()
maxtags = 0
for (idx, post) in enumerate(posts)
maxtags = max(maxtags, length(post.tags))
for tag in post.tags
setindex!(tagmap, push!(get(()->Int[], tagmap, tag), idx), tag)
end
end
topnids = Vector{Int}(undef, topn)
topnoverlaps = zeros(Int, topn)
p = zeros(Int, maxtags)
indxs = Vector{Int}(undef, maxtags)
ids = Vector{Int}(undef, maxtags)
tagvecs = Vector{Vector{Int}}(undef, maxtags)
res = Vector{RelatedPost}(undef, length(posts))
for (i, post) in enumerate(posts)
topnoverlaps .= 0
ntags = length(post.tags)
indxs[1:ntags] .= 1
for i in 1:ntags
tagvecs[i] = tagmap[post.tags[i]]
end
ids[1:ntags] .= getindex.(@view(tagvecs[1:ntags]), @view(indxs[1:ntags]))
sortperm!(@view(p[1:ntags]), @view(ids[1:ntags]))
lastid = ids[p[1]]
overlap = 1
while ids[p[1]] < length(posts)
indxs[p[1]] += 1
ids[p[1]] = indxs[p[1]] > length(tagvecs[p[1]]) ? length(posts)+1 : tagvecs[p[1]][indxs[p[1]]]
t = 2
while t<=ntags && ids[p[t-1]] > ids[p[t]]
p[t-1], p[t] = p[t], p[t-1]
t += 1
end
if ids[p[1]] == lastid
overlap += 1
else
if lastid != i
if overlap > topnoverlaps[1]
topnoverlaps[1] = overlap
topnids[1] = lastid
end
s = 2
while s <= topn && topnoverlaps[s] < topnoverlaps[s-1]
topnoverlaps[s-1], topnoverlaps[s] = topnoverlaps[s], topnoverlaps[s-1]
topnids[s-1], topnids[s] = topnids[s], topnids[s-1]
s += 1
end
end
overlap = 1
lastid = ids[p[1]]
end
end
reverse!(topnids)
res[i] = RelatedPost(post._id, post.tags, @view posts[topnids])
end
return res
end
Since it doesn’t have a taggedpostcount
vector and doesn’t need to sort anything, it has the potentioal to be faster, perhaps more so for larger post numbers.
The benchmark is not too good, but maybe there are some more optimizations.