This looks really neat, I’ve separately been thinking about implementing a fancy line-breaking algorithm for paragraph display in the terminal.
Seeing that you have a list of hyphenation patterns, I’ve given making a hyphenator a go. In the process I noticed that you have "% hyphen.tex patterns end here, and additional patterns begin:"
in your patterns
array.
You also don’t seem to implement the hyphenation weight overlaying/maximisation. I’ve implemented my own little hyphenation matcher using a Trie that may be of some interest:
include("hyphen-patterns.jl")
struct HyphenTrie
children::NTuple{length('a':'z')+1, Union{HyphenTrie, Nothing}}
breakpoints::Union{Nothing, Vector{Pair{UInt8, UInt8}}}
end
HyphenTrie() = HyphenTrie(ntuple(_ -> nothing, length('a':'z')+1), nothing)
function charindex(ord::Integer)
if ord == Int('.')
1
elseif Int('A') <= ord <= Int('Z')
2 + ord - Int('A')
elseif Int('a') <= ord <= Int('z')
2 + ord - Int('a')
else
0
end
end
charindex(c::Char) = charindex(Int(c))
charindex(s::AbstractString, i::Int) = charindex(codeunit(s, i))
function parsepattern(pat::AbstractString)
indices = UInt8[]
breakweights = UInt8[0x00]
lastwasweight = false
for char in pat
if isdigit(char)
breakweights[end] = parse(UInt8, char)
else
push!(indices, charindex(char))
char != '.' && push!(breakweights, 0)
end
end
breakpoints = [(i - 1) % UInt8 => w for (i ,w) in enumerate(breakweights) if w > 0]
indices, breakpoints
end
function trieadd(node::HyphenTrie, indices::Vector{UInt8}, breakpoints::Vector{Pair{UInt8, UInt8}}, depth::Int = 1)
depth > length(indices) &&
return HyphenTrie(node.children, breakpoints)
ind = indices[depth]
child = node.children[ind]
newchild = trieadd(@something(child, HyphenTrie()),
indices, breakpoints, depth + 1)
newchildren = Base.setindex(node.children, newchild, ind)
HyphenTrie(newchildren, node.breakpoints)
end
function HyphenTrie(library::Vector{<:AbstractString})
root = HyphenTrie()
for word in library
indices, breakpoints = parsepattern(word)
root = trieadd(root, indices, breakpoints)
end
root
end
function Base.show(io::IO, ::MIME"text/plain", ht::HyphenTrie)
indent = get(io, :ht_indent, 0)
if !isnothing(ht.breakpoints)
for (i, (o, w)) in enumerate(ht.breakpoints)
i > 1 && print(io, ", ")
printstyled(io, "[$o]", color = :cyan)
printstyled(io, " => ", color = :light_black)
printstyled(io, w, color = :red)
end
print(io, '\n', ' '^indent) # Runs once more than it should at the end, but oh well
end
for (i, char) in enumerate(vcat('.', 'a':'z'))
child = ht.children[i]
if !isnothing(child)
print(io, "-> ")
printstyled(io, char, bold = true, color = :blue)
print(io, ' ')
show(IOContext(io, :ht_indent => indent + 5), MIME("text/plain"), child)
end
end
end
function descend!(breakpoints::Vector{UInt8}, ht::HyphenTrie, word::AbstractString, start::Int = 1, pos::Int = ifelse(start == 1, 0, start))
if pos == 0
if !isnothing(ht.children[1])
return descend!(breakpoints, ht.children[1], word, start, 1)
else
pos += 1
end
end
letteridx = charindex(word, pos)
letteridx < 1 && return
child = ht.children[letteridx]
isnothing(child) && return
if !isnothing(child.breakpoints)
for (offset, weight) in child.breakpoints
breakpoints[start + offset - 1] = max(breakpoints[start + offset - 1], weight)
end
end
npos = nextind(word, pos)
if npos <= ncodeunits(word)
descend!(breakpoints, child, word, start, npos)
end
breakpoints
end
Base.getindex(ht::HyphenTrie, c::Char) = ht.children[charindex(c)]
function Base.getindex(ht::HyphenTrie, word::AbstractString)
bps = zeros(UInt8, ncodeunits(word))
cumwidth = zeros(UInt16, ncodeunits(word))
pos = firstindex(word)
while pos < ncodeunits(word)
descend!(bps, ht, word, pos)
posnext = nextind(word, pos)
for i in pos:posnext-1
cumwidth[i] = cumwidth[max(1, pos-1)] + textwidth(word[pos])
end
pos = posnext
end
[(; index = i % UInt16, prewidth = cumwidth[i], weight = w)
for (i, w) in enumerate(bps) if !iseven(w) && i < length(bps)]
end
Example output:
julia> ht["hyphenation"]
1-element Vector{@NamedTuple{index::UInt16, prewidth::UInt16, weight::UInt8}}:
(index = 0x0006, prewidth = 0x0006, weight = 0x05)
julia> ht["antidisestablishemntarianism"]
10-element Vector{@NamedTuple{index::UInt16, prewidth::UInt16, weight::UInt8}}:
(index = 0x0002, prewidth = 0x0002, weight = 0x05)
(index = 0x0007, prewidth = 0x0007, weight = 0x01)
(index = 0x0009, prewidth = 0x0009, weight = 0x05)
(index = 0x000c, prewidth = 0x000c, weight = 0x03)
(index = 0x0012, prewidth = 0x0012, weight = 0x01)
(index = 0x0013, prewidth = 0x0013, weight = 0x01)
(index = 0x0016, prewidth = 0x0016, weight = 0x03)
(index = 0x0017, prewidth = 0x0017, weight = 0x03)
(index = 0x0019, prewidth = 0x0019, weight = 0x03)
(index = 0x001b, prewidth = 0x001b, weight = 0x01)
julia> ht["5α-dihydrotestosterone"]
3-element Vector{@NamedTuple{index::UInt16, prewidth::UInt16, weight::UInt8}}:
(index = 0x0006, prewidth = 0x0005, weight = 0x03)
(index = 0x000b, prewidth = 0x000a, weight = 0x05)
(index = 0x0011, prewidth = 0x0010, weight = 0x05)