I’m running some string comparisons using StringDistances
and have written some simple convenience functions that are loosely resembling the extract
function in the Python fuzzywuzzy
package. Essentially they compare a string to a list of strings and return either the best match alongside the similarity score, or the n
best matches and their similarity scores.
My question is: why does the function that returns the n
best matches and their scores take the same time as the function that only returns one match and score? I’m aware that the compiler does some pretty smart optimisations, but find it hard to believe that all of the additional work for returning multiple results is free, so I must be missing something?
Code below (results are obtained using juliabox):
using StringDistances, BenchmarkTools
master = [lowercase.(randstring(8)) for i = 1:1200]
to_match = [lowercase.(randstring(8)) for i = 1:1000];
function get_matches(s::String, masterlist::Vector{String}; criterion=Levenshtein(), matches=3)
closeness = [compare(criterion, s, masterlist[i]) for i = 1:length(masterlist)]
sort_close = sort(closeness, rev=true)
ret = Array{Tuple{String,Float64},1}(matches)
@inbounds for i ∈ 1:matches
ret[i] = (masterlist[find(closeness .== sort_close[i])[1]], sort_close[i])
end
return ret
end
function get_one_match(s::String, masterlist::Vector{String}; criterion=Levenshtein())
closeness = [compare(criterion, s, masterlist[i]) for i = 1:length(masterlist)]
return (masterlist[indmax(closeness)], maximum(closeness))
end
@btime get_one_match(to_match[1], master, criterion=TokenMax(RatcliffObershelp()))
# 9.063 ms (92513 allocations: 6.79 MiB)
# ("ksifwytx", 0.5)
@btime get_matches(to_match[1], master, criterion=TokenMax(RatcliffObershelp()))
# 9.031 ms (92542 allocations: 6.82 MiB)
# ("ksifwytx", 0.5)
# ("cpyztasz", 0.375)
# ("cpyztasz", 0.375)