Uahgi - an experimential typesetting engine/compiler generating PDF document

Announcing uahgi, which is an experimental typesetting engine/compiler like “the little plainTeX” (but actually its functions are not as rich as plainTeX).

It’s a practice of the Knuth-liang hyphenation algorithm and a simplified line-wrapping algorithm. Besides, it supports Latin (but without combined diacritics) and CJK. It generate PDF with libharu, a lightweight C pdf generating library supports UTF-8.

The syntax is inspired by TeX(comment), MediaWiki template language, and LISP.

It’s a practice and like draught, so there are lots of function not implemented, including:

7 Likes

This looks really neat, I’ve separately been thinking about implementing a fancy line-breaking algorithm for paragraph display in the terminal.

Seeing that you have a list of hyphenation patterns, I’ve given making a hyphenator a go. In the process I noticed that you have "% hyphen.tex patterns end here, and additional patterns begin:" in your patterns array.

You also don’t seem to implement the hyphenation weight overlaying/maximisation. I’ve implemented my own little hyphenation matcher using a Trie that may be of some interest:

include("hyphen-patterns.jl")

struct HyphenTrie
    children::NTuple{length('a':'z')+1, Union{HyphenTrie, Nothing}}
    breakpoints::Union{Nothing, Vector{Pair{UInt8, UInt8}}}
end

HyphenTrie() = HyphenTrie(ntuple(_ -> nothing, length('a':'z')+1), nothing)

function charindex(ord::Integer)
    if ord == Int('.')
        1
    elseif Int('A') <= ord <= Int('Z')
        2 + ord - Int('A')
    elseif Int('a') <= ord <= Int('z')
        2 + ord - Int('a')
    else
        0
    end
end

charindex(c::Char) = charindex(Int(c))
charindex(s::AbstractString, i::Int) = charindex(codeunit(s, i))

function parsepattern(pat::AbstractString)
    indices = UInt8[]
    breakweights = UInt8[0x00]
    lastwasweight = false
    for char in pat
        if isdigit(char)
            breakweights[end] = parse(UInt8, char)
        else
            push!(indices, charindex(char))
            char != '.' && push!(breakweights, 0)
        end
    end
    breakpoints = [(i - 1) % UInt8 => w for (i ,w) in enumerate(breakweights) if w > 0]
    indices, breakpoints
end

function trieadd(node::HyphenTrie, indices::Vector{UInt8}, breakpoints::Vector{Pair{UInt8, UInt8}}, depth::Int = 1)
    depth > length(indices) &&
        return HyphenTrie(node.children, breakpoints)
    ind = indices[depth]
    child = node.children[ind]
    newchild = trieadd(@something(child, HyphenTrie()),
                       indices, breakpoints, depth + 1)
    newchildren = Base.setindex(node.children, newchild, ind)
    HyphenTrie(newchildren, node.breakpoints)
end

function HyphenTrie(library::Vector{<:AbstractString})
    root = HyphenTrie()
    for word in library
        indices, breakpoints = parsepattern(word)
        root = trieadd(root, indices, breakpoints)
    end
    root
end

function Base.show(io::IO, ::MIME"text/plain", ht::HyphenTrie)
    indent = get(io, :ht_indent, 0)
    if !isnothing(ht.breakpoints)
        for (i, (o, w)) in enumerate(ht.breakpoints)
            i > 1 && print(io, ", ")
            printstyled(io, "[$o]", color = :cyan)
            printstyled(io, " => ", color = :light_black)
            printstyled(io, w, color = :red)
        end
        print(io, '\n', ' '^indent) # Runs once more than it should at the end, but oh well
    end
    for (i, char) in enumerate(vcat('.', 'a':'z'))
        child = ht.children[i]
        if !isnothing(child)
            print(io, "-> ")
            printstyled(io, char, bold = true, color = :blue)
            print(io, ' ')
            show(IOContext(io, :ht_indent => indent + 5), MIME("text/plain"), child)
        end
    end
end

function descend!(breakpoints::Vector{UInt8}, ht::HyphenTrie, word::AbstractString, start::Int = 1, pos::Int = ifelse(start == 1, 0, start))
    if pos == 0
        if !isnothing(ht.children[1])
            return descend!(breakpoints, ht.children[1], word, start, 1)
        else
            pos += 1
        end
    end
    letteridx = charindex(word, pos)
    letteridx < 1 && return
    child = ht.children[letteridx]
    isnothing(child) && return
    if !isnothing(child.breakpoints)
        for (offset, weight) in child.breakpoints
            breakpoints[start + offset - 1] = max(breakpoints[start + offset - 1], weight)
        end
    end
    npos = nextind(word, pos)
    if npos <= ncodeunits(word)
        descend!(breakpoints, child, word, start, npos)
    end
    breakpoints
end

Base.getindex(ht::HyphenTrie, c::Char) = ht.children[charindex(c)]

function Base.getindex(ht::HyphenTrie, word::AbstractString)
    bps = zeros(UInt8, ncodeunits(word))
    cumwidth = zeros(UInt16, ncodeunits(word))
    pos = firstindex(word)
    while pos < ncodeunits(word)
        descend!(bps, ht, word, pos)
        posnext = nextind(word, pos)
        for i in pos:posnext-1
            cumwidth[i] = cumwidth[max(1, pos-1)] + textwidth(word[pos])
        end
        pos = posnext
    end
    [(; index = i % UInt16, prewidth = cumwidth[i], weight = w)
     for (i, w) in enumerate(bps) if !iseven(w) && i < length(bps)]
end

Example output:


julia> ht["hyphenation"]
1-element Vector{@NamedTuple{index::UInt16, prewidth::UInt16, weight::UInt8}}:
 (index = 0x0006, prewidth = 0x0006, weight = 0x05)

julia> ht["antidisestablishemntarianism"]
10-element Vector{@NamedTuple{index::UInt16, prewidth::UInt16, weight::UInt8}}:
 (index = 0x0002, prewidth = 0x0002, weight = 0x05)
 (index = 0x0007, prewidth = 0x0007, weight = 0x01)
 (index = 0x0009, prewidth = 0x0009, weight = 0x05)
 (index = 0x000c, prewidth = 0x000c, weight = 0x03)
 (index = 0x0012, prewidth = 0x0012, weight = 0x01)
 (index = 0x0013, prewidth = 0x0013, weight = 0x01)
 (index = 0x0016, prewidth = 0x0016, weight = 0x03)
 (index = 0x0017, prewidth = 0x0017, weight = 0x03)
 (index = 0x0019, prewidth = 0x0019, weight = 0x03)
 (index = 0x001b, prewidth = 0x001b, weight = 0x01)

julia> ht["5α-dihydrotestosterone"]
3-element Vector{@NamedTuple{index::UInt16, prewidth::UInt16, weight::UInt8}}:
 (index = 0x0006, prewidth = 0x0005, weight = 0x03)
 (index = 0x000b, prewidth = 0x000a, weight = 0x05)
 (index = 0x0011, prewidth = 0x0010, weight = 0x05)
2 Likes

The hyphenation weight overlaying might be around there:
uahgi/src/hyphenating.jl at main · Yoxem/uahgi · GitHub (Line 42 or function hyphenate_aux)

Is this the algorithm that measures the line-breaks and hyphenation using a “badness” score?

1 Like

Yes. I have read the simplified minimalizing badness line-wrapping algorithm (in Chinese Wikipedia “自動換行#最小破損度” (auto linewrapping # Mininal raggedness)) before implementing it.

The related function is there:

1 Like

I can’t say at a glance I understand everything going on there, but I don’t see any max / maximum calls.

Update: I think I see it

1 Like

BTW, this may not be something you’re interested in optimising, but I’ve been giving more thought to time/space efficient hyphenation rule tables. Let me know if this is of interest and I’ll keep on sharing my work :slight_smile:

Sorry, if I want to add the other features, the optimization of hyphenation is not in hight priority.
However, it’s welcome if you give me the pull request.

Cool, no trouble. I just wanted to make the offer :slight_smile:

2 Likes