String indices : byte indexing feels wrong

To demonstrate codeunits, here’s an example.

julia> function test()
           # Print the code units of a three-character string: how many there are,
           # three individual units, and finally the whole code-unit vector.
           # The string is '€' (U+20AC), '𖣠' (U+168E0) and '߷' (U+07F7).
           s = "\u20ac\U168E0\u07F7"
           # Wrap the string's underlying code units in a vector-like view.
           c = codeunits(s)
           # Number of code units, which is larger than the number of characters.
           println(length(c))
           # Code units 1, 4 and 8 are the first code unit of each character
           # (the same indices that eachindex(s) yields).
           println(c[1])
           println(c[4])
           println(c[8])
           display(c)
       end
test (generic function with 1 method)

julia> test()
9
226
240
223
9-element Base.CodeUnits{UInt8, String}:
 0xe2
 0x82
 0xac
 0xf0
 0x96
 0xa3
 0xa0
 0xdf
 0xb7

Another point, which may not have been made clearly above, is that you can easily iterate through the characters of a String with a for loop. (Here s is the same string as in test, defined at the top level.)

julia> for c in s
           println(c)
       end
€
𖣠
߷

julia> eachindex(s)
Base.EachStringIndex{String}("€𖣠߷")

julia> for i in eachindex(s)
           println(i, ": ", s[i])
       end
1: €
4: 𖣠
8: ߷

Part of your question suggests that you mainly want to address interface issues. Sometimes we overthink deeper issues when people just want to customize the interface. Let’s create a type that wraps around a Vector{Char} as demonstrated above. The main advantage of making a wrapper type like this is that you can write your own convert methods for it without committing type piracy.

julia> begin
           """
               EagerStringChars(s::AbstractString)

           Vector-of-characters wrapper around a string. The characters are collected
           into a `Vector{Char}` once, at construction, and indexing is forwarded to
           that vector.
           """
           struct EagerStringChars <: AbstractVector{Char}
               s::Vector{Char}
           end

           function EagerStringChars(str::AbstractString)
               return EagerStringChars(collect(str))
           end

           # Minimal array interface, forwarded to the wrapped vector:
           # https://docs.julialang.org/en/v1/manual/interfaces/#man-interface-array
           Base.IndexStyle(::Type{EagerStringChars}) = IndexLinear()
           Base.size(chars::EagerStringChars) = size(chars.s)
           function Base.getindex(chars::EagerStringChars, i)
               return chars.s[i]
           end

           # Conversions in both directions, so the wrapper can be stored in a
           # Vector{String} and a string can be stored in a Vector{EagerStringChars}.
           function Base.convert(::Type{String}, chars::EagerStringChars)
               return String(chars.s)
           end
           function Base.convert(::Type{EagerStringChars}, str::AbstractString)
               return EagerStringChars(str)
           end
       end

julia> s = "\u20ac\U000168E0\u07F7"
"€𖣠߷"

julia> esc = EagerStringChars(s)
3-element EagerStringChars:
 '€': Unicode U+20AC (category Sc: Symbol, currency)
 '𖣠': Unicode U+168E0 (category Lo: Letter, other)
 '߷': Unicode U+07F7 (category Po: Punctuation, other)

julia> esc[1]
'€': Unicode U+20AC (category Sc: Symbol, currency)

julia> esc[2]
'𖣠': Unicode U+168E0 (category Lo: Letter, other)

julia> esc[3]
'߷': Unicode U+07F7 (category Po: Punctuation, other)

julia> strings = String[esc]
1-element Vector{String}:
 "€𖣠߷"

julia> push!(strings, esc)
2-element Vector{String}:
 "€𖣠߷"
 "€𖣠߷"

julia> string_chars = EagerStringChars[s]
1-element Vector{EagerStringChars}:
 ['€', '𖣠', '߷']

We can also create a lazy version of this. Each time we index the wrapper, we will iterate through the underlying string and return the nth character.

julia> begin
           """
               LazyStringChars(s)

           Vector-of-characters wrapper that keeps the original string `s` and looks
           up the `i`-th character on demand by iterating over `s`. Nothing is copied
           at construction; each indexing operation walks the string from the start.
           """
           struct LazyStringChars{S} <: AbstractVector{Char}
               s::S
           end

           # `Type{<:LazyStringChars}` is required so that the method also applies to
           # concrete instantiations such as `LazyStringChars{String}`.
           Base.IndexStyle(::Type{<:LazyStringChars}) = IndexLinear()

           # Scalar indexing only; ranges, colons, masks etc. are handled by Base's
           # generic AbstractArray fallbacks, which call this method per element.
           # Throws `BoundsError` when `i` is outside `1:length(lsc)`.
           function Base.getindex(lsc::LazyStringChars, i::Int)
               @boundscheck checkbounds(lsc, i)
               # Skip the first i - 1 characters and take the next one.
               return first(Iterators.drop(lsc.s, i - 1))
           end
           Base.size(lsc::LazyStringChars) = (length(lsc.s),)

           Base.convert(::Type{String}, lsc::LazyStringChars) = String(lsc.s)
           Base.convert(::Type{LazyStringChars}, s::AbstractString) = LazyStringChars(s)
       end

julia> lsc = LazyStringChars(s)
3-element LazyStringChars{String}:
 '€': Unicode U+20AC (category Sc: Symbol, currency)
 '𖣠': Unicode U+168E0 (category Lo: Letter, other)
 '߷': Unicode U+07F7 (category Po: Punctuation, other)

julia> lsc[1]
'€': Unicode U+20AC (category Sc: Symbol, currency)

julia> lsc[2]
'𖣠': Unicode U+168E0 (category Lo: Letter, other)

julia> lsc[3]
'߷': Unicode U+07F7 (category Po: Punctuation, other)

julia> push!(strings, lsc)
3-element Vector{String}:
 "€𖣠߷"
 "€𖣠߷"
 "€𖣠߷"

julia> lazy_string_chars = LazyStringChars[s]
1-element Vector{LazyStringChars}:
 ['€', '𖣠', '߷']
4 Likes