BTW, your code returns 0 for every word because you’re doing get!(counter, word, 0) + 1 but not actually putting the increased count back into the dict.
function wc(lines, counter=Dict{String, Int}())
for line in lines, word in split(line)
l = lowercase(word)
counter[l] = get(counter, l, 0) + 1
end
counter
end
The following avoids doing the hash lookup for the dict twice by retrieving a Ref value and updating that
function wc(lines, counter=Dict{String, Ref{Int}}())
for line in lines, word in split(line)
l = lowercase(word)
c = get(counter, l, Ref{Int}(0))
c[] += 1
end
counter
end
@btime wc(readlines("/home/fredrikb/Downloads/kjvbible.txt"))
# 281.450 ms (2918978 allocations: 162.72 MiB) original
# 228.156 ms (3740085 allocations: 173.83 MiB) Ref
EDIT: See @ericphanson s post below, get should be replaced by get! for the behaviour to be correct, after which this implementation is not actually faster Thanks for pointing it out though Eric
You could also implement a SubString wrapper type with custom isequal and hash functions that ignore case (and maybe cache the hash in the struct). And you could avoid allocating the array of split results by simply looping within the line and creating your substring objects as you go along. And you could avoid allocating strings for the lines by simply mmapping the file and using a StringView of the mmap array or portions thereof, and looping through the bytes looking for whitespace.
i.e. you could cut down on the number of heap allocations enormously if you really cared about optimizing this (in the same way that CSV.jl hugely optimized parsing csv files). (You could even specialize things for ASCII if you know the text is ASCII-only.)
I think the spirit of the challenge is to first solve the problem in the easiest idiomatic way. The second step is to provide the optimized version which may not be straightforward to implement.
I do not know, this one looks rather idiomatic and simple (sorry, I didn’t use Ref trick, it is cool, but more advanced)
function wc(io, counter=Dict{String, Int}())
for line in eachline(io)
for word in split(line)
word = lowercase(word)
counter[word] = get!(counter, word, 0) + 1
end
end
counter
end
sortdict(counter) = sort!(collect(counter), by=x->x[2], rev=true)
wc(stdin) |> sortdict .|> kv->println(kv[1], " ", kv[2])
These changes reduced time from 1.2s to 0.97 on my laptop, when I am running this command.
time julia simple.jl < /tmp/kjvbible.txt >> /dev/null
P.S.: during Advent of Code someone wrote iterating version of split, would be nice to have it somewhere accessible (maybe even stdlib?), then it would be possible to write for word in Iterators.split(line) instead of for word in split(line)
In particular, here is one that avoids allocating any strings until the end, by using a custom type for the keys that does a case-insensitive hash in-place on a view of the mmapped file, and specializing for ASCII data. On my machine, @btime wordcount3("kjvbible.txt") is 2–3x faster than those above.
using Mmap
# custom array-like ASCII-case-insensitive bytes-view object
# with fast case-insensitive hash
struct MyStringView{V<:AbstractVector{UInt8}} <: AbstractVector{UInt8}
data::V
end
lc(byte::UInt8) = UInt8('A') ≤ byte ≤ UInt8('Z') ? byte + (UInt8('a')-UInt8('A')) : byte
Base.size(a::MyStringView) = size(a.data)
Base.@propagate_inbounds Base.getindex(a::MyStringView, i::Int) = lc(a.data[i])
const FNV_prime = UInt(sizeof(UInt) == 8 ? 1099511628211 : 16777619)
const FNV_offset = UInt(sizeof(UInt) == 8 ? 14695981039346656037 : 2166136261)
function Base.hash(a::MyStringView, h::UInt)
ha = FNV_offset
@inbounds for i = 1:length(a)
ha = (ha * FNV_prime) ⊻ a[i] # FNV-1 hash
end
return ha ⊻ h
end
iswordbyte(b::UInt8) = (b != UInt8(' ')) & (b != UInt8('\n'))
@views function wc3(io)
bytes = Mmap.mmap(io)
words = Dict{MyStringView{typeof(bytes[1:0])},typeof(Ref(0))}()
sizehint!(words, length(bytes) ÷ 10)
s = 1
while true
s = findnext(iswordbyte, bytes, s)
isnothing(s) && break
e = something(findnext(!iswordbyte, bytes, s+1), length(bytes)+1)
word = MyStringView(bytes[s:e-1])
c = get!(words, word, Ref(0))
c[] += 1
s = e+1
end
return sort!([String(k) => v[] for (k,v) in words], by=x->x[2], rev=true)
end
wordcount3(filename) = open(wc3, filename)
It could be optimized in further ways. For example, I experimented with implementing a get!-like function that increments the dictionary contents in-place (or initializes the count to 1) without Ref, by copying the get! source code from base/dict.jl, but it only gained me an additional 10%. Probably you could gain additional speed for hashing etcetera by examining several bytes at once — you can do lots of optimizations if you know you have ASCII data.
Update: simplified the above code by making MyStringView a subtype of AbstractVector, which cuts down on the number of methods I needed to implement. Also updated to use findnext instead of explicit loops, which is shorter while being just as fast.
I also just tried ascii optimizations, although it seems the bottleneck is the dict insertions. My idea was that it shouldn’t matter much to apply lowercasing to every string, maybe it’s better to just do that at the end when the number strings is really small.
using Mmap
function wordcount()
arr = Mmap.mmap(expanduser("~/Downloads/kjvbible.txt"))
start = 0
finish = 0
within_word = false
counts = Dict{typeof(view(arr, 1:2)), Int}()
for (i, byte) in enumerate(arr)
# A-Z, a-z
if (0x41 <= byte <= 0x5a) || (0x61 <= byte <= 0x7a)
if !within_word
start = i
within_word = true
end
elseif within_word
finish = i - 1
within_word = false
lastword = view(arr, start:finish)
counts[lastword] = get!(counts, lastword, 0) + 1
end
end
stringcounts = Dict{String, Int}()
for (key, value) in counts
lckey = lowercase(String(key))
stringcounts[lckey] = get!(stringcounts, lckey, 0) + value
end
sort!(collect(stringcounts), by = last, rev = true)
end
@time wordcount()
0.237530 seconds (67.86 k allocations: 8.770 MiB)
I tried using Automata.jl along with your custom string type, and it benchmarked basically exactly the same as your code when I used a view, but without a view it was almost twice as slow as your code. Adding a view to your code doesn’t seem to change the timings there for me though. (I know using packages is against the rules of the blog post, I just wanted to try it out).
# needs the code in https://discourse.julialang.org/t/count-words-challenge/57221/17 to be run first
using Automa, BenchmarkTools, Mmap
using Automa.RegExp: space, neg, opt, rep
word = neg(space())
doc = opt(word) * rep(space() * word) * space()
word.actions[:exit] = [:exit_word]
word.actions[:enter] = [:enter_word]
machine = Automa.compile(doc)
actions = Dict(:enter_word => quote
mark = p
end,
:exit_word => quote
c = get!(counter, MyStringView(@view data[mark:p-1]), Ref{Int}(0))
c[] += 1
end)
# Generate a function using @eval.
context = Automa.CodeGenContext()
@eval function wordcount4(filename)
data = Mmap.mmap(filename)
counter = Dict{MyStringView{typeof(@view data[2:3])},typeof(Ref(0))}()
mark = 0
# generate code to initialize variables used by FSM
$(Automa.generate_init_code(context, machine))
# set end and EOF positions of data buffer
p_end = p_eof = lastindex(data)
# generate code to execute FSM
$(Automa.generate_exec_code(context, machine, actions))
# check if FSM properly finished
if cs != 0
error("failed to count words on byte $p")
end
# assemble the result with SGJ's code
return sort!([String(k) => v[] for (k,v) in counter], by=x->(x[2],x[1]), rev=true)
end
@btime wordcount4("input.txt")