using CSV
using StringDistances
using Unicode
data = CSV.File(raw"C:\Users\mthel\Julia\src_data\data.csv", normalizenames=true)
data = clean_strings(data[i][2] for i in 1:200)
function build_dict(data)
dict = Dict(data[1] => [])
for i in 2:length(data), (key, value) in dict
if compare(data[i], key, Levenshtein()) > 0.7
push!(value, data[i])
else
dict[data[i]] = []
end
end
dict
end
dict = build_dict(data)
When I examine the keys of dict
, I see that the following strings have ended up as keys:
TRAFFICCONTROLTECH
TRAFFICCONTROLTEHC
TRAFFICCONTROTECH
but the Levenshtein similarity scores are all higher than the specified threshold of 0.7 (on line 13).
julia> compare("TRAFFICCONTROLTECH", "TRAFFICCONTROLTEHC", Levenshtein())
0.8888888888888888
julia> compare("TRAFFICCONTROLTECH", "TRAFFICCONTROTECH", Levenshtein())
0.9444444444444444
Only one of these should have ended up as a key and the other two should have been detected in the if
clause and subsequently pushed to the value
associated with the existing key. I’m stumped as to why this is happening. There has to be a silly mistake somewhere that I’m missing…