A `Threads.@threads for ...` loop
seems faster than `pmap` over `pool = CachingPool(workers())`
on my architecture (Apple Silicon M1, arm64).
## Detection 1
### "Any code that is performance critical should be inside a function. Code inside functions tends to run much faster than top level code, due to how Julia's compiler works."
"""
    detectwordinphrase(words, phrases)

Count, for every phrase, how many entries of `words` occur in it.

Matching is done with `occursin`, i.e. plain substring search. The work is split
across the available Julia threads, one phrase per iteration.

# Arguments
- `words::Vector{String}`: the search terms.
- `phrases::Vector{String}`: the texts to search in.

# Returns
A `Vector{Int}` with one entry per phrase: element `j` is the number of entries of
`words` that occur in `phrases[j]`. If `words` is empty every count is `0`.
"""
function detectwordinphrase(words::Vector{String}, phrases::Vector{String})
    counts = Vector{Int}(undef, length(phrases))
    # Thread over phrases rather than words: every iteration writes only its own
    # slot of `counts`, so no synchronisation is needed, and no per-word BitVector
    # (nor the temporaries of summing them) is ever allocated.
    Threads.@threads for j in eachindex(phrases)
        phrase = phrases[j]
        counts[j] = count(word -> occursin(word, phrase), words)
    end
    return counts
end;
# Time the thread-based detector. `reference` (the words) and `textdata_textlong`
# (the phrases) are not defined in this snippet and must exist beforehand.
textdata_occurs = @time detectwordinphrase(reference, textdata_textlong)
# Single threaded:
# 10k phrases 2k words : 0.929850 seconds (40.01 M allocations: 2.544 GiB, 7.86% gc time)
# 100k phrases 2k words : 11.996917 seconds (400.01 M allocations: 25.363 GiB, 7.66% gc time)
# 100k phrases 20k words : 112.146993 seconds (4.00 G allocations: 253.637 GiB, 11.36% gc time)
# Multithreaded 4 threads
# 100k phrases 2k words : 5.061899 seconds (400.01 M allocations: 25.363 GiB, 37.20% gc time)
# 100k phrases 20k words : 50.659987 seconds (4.00 G allocations: 253.655 GiB, 35.53% gc time)
# Multithreaded 8 Threads
# 100k phrases 20k words : 37.712681 seconds (4.00 G allocations: 253.655 GiB, 51.64% gc time, 0.16% compilation time)
## Detection 2:
using Distributed
"""
    detectwordinphrase2(words, phrases; batch_size=1)

Count, for every phrase, how many entries of `words` occur in it, using `pmap`
over a `CachingPool` built from the current `workers()`.

Matching is done with `occursin`, i.e. plain substring search.

# Arguments
- `words::Vector{String}`: the search terms.
- `phrases::Vector{String}`: the texts to search in.

# Keywords
- `batch_size::Integer=1`: forwarded to `pmap`; values above 1 send several phrases
  to a worker per request instead of one.

# Returns
The collection returned by `pmap`, holding one count per phrase in the order of
`phrases`. If `words` is empty every count is `0`.
"""
function detectwordinphrase2(
    words::Vector{String}, phrases::Vector{String}; batch_size::Integer=1
)
    pool = CachingPool(workers())
    try
        counts = pmap(pool, phrases; batch_size=batch_size) do phrase
            # `count` yields 0 for an empty `words`; summing an empty generator throws.
            return count(word -> occursin(word, phrase), words)
        end
        return counts
    finally
        # Release the closure (and the `words` it captures) cached on the workers
        # now instead of waiting for `pool` to be garbage-collected.
        Distributed.clear!(pool)
    end
end
# Time the `pmap`-based detector. `reference` (the words) and `textdata_textlong`
# (the phrases) are not defined in this snippet and must exist beforehand.
textdata_occurs = @time detectwordinphrase2(reference, textdata_textlong)
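# Multithreaded 4 Threads
# NOTE(review): `pmap` schedules work on Distributed worker processes (`addprocs` / `julia -p N`), not on threads; if no workers were added it runs on the master process only, which would explain the near-identical 4- and 8-thread timings — confirm.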
# 100k phrases 20k words : 88.694093 seconds (4.00 G allocations: 238.460 GiB, 21.67% gc time, 0.18% compilation time)
# Multithreaded 8 Threads
# 100k phrases 20k words : 90.496826 seconds (4.00 G allocations: 238.460 GiB, 21.99% gc time, 0.17% compilation time)
But this is still nowhere near the under-2-second runtime that the R version achieves for 1M phrases and 200k words (see "Text Mining: Detect Strings: Very Fast Word Lookup in a Large Dictionary in R with data.table and matrixStats" on Maps and Spaces).