To demonstrate `codeunits`, here’s an example.
julia> function test()
           # Three non-ASCII characters written as Unicode escapes.
           text = "\u20ac\U168E0\u07F7"
           # View the string as its underlying code units.
           units = codeunits(text)
           println(length(units))
           # Print the code units at positions 1, 4 and 8.
           for k in (1, 4, 8)
               println(units[k])
           end
           display(units)
       end
test (generic function with 1 method)
julia> test()
9
226
240
223
9-element Base.CodeUnits{UInt8, String}:
0xe2
0x82
0xac
0xf0
0x96
0xa3
0xa0
0xdf
0xb7
Another point I want to make, which I’m not sure was made clearly above, is that you can easily iterate through the characters in a `String` via a `for` loop.
julia> for c in s
println(c)
end
€
𖣠
߷
julia> eachindex(s)
Base.EachStringIndex{String}("€𖣠߷")
julia> for i in eachindex(s)
println(i, ": ", s[i])
end
1: €
4: 𖣠
8: ߷
Part of your question implies that you mainly want to address interface issues. Sometimes we overthink deeper issues when people just want to customize the interface. Let’s create a type that wraps around a `Vector{Char}` as demonstrated above. The main advantage of making a wrapper type like this is that you can write your own `convert` methods for it without committing type piracy.
julia> begin
# Eager character vector: the characters of a string are collected up front into
# a `Vector{Char}`, and indexing is forwarded to that vector.
struct EagerStringChars <: AbstractVector{Char}
    s::Vector{Char}
end

# Build the wrapper from any string by materialising its characters.
function EagerStringChars(s::AbstractString)
    return EagerStringChars(collect(s))
end

# Minimal array interface (size, getindex, IndexStyle), as described in
# https://docs.julialang.org/en/v1/manual/interfaces/#man-interface-array
Base.size(chars::EagerStringChars) = size(chars.s)

function Base.getindex(chars::EagerStringChars, i)
    return chars.s[i]
end

Base.IndexStyle(::Type{EagerStringChars}) = IndexLinear()

# Conversions in both directions. Both types involved are either `String` or
# this wrapper, which we own, so defining them is not type piracy.
function Base.convert(::Type{String}, chars::EagerStringChars)
    return String(chars.s)
end

function Base.convert(::Type{EagerStringChars}, s::AbstractString)
    return EagerStringChars(s)
end
end
julia> s = "\u20ac\U000168E0\u07F7"
"€𖣠߷"
julia> esc = EagerStringChars(s)
3-element EagerStringChars:
'€': Unicode U+20AC (category Sc: Symbol, currency)
'𖣠': Unicode U+168E0 (category Lo: Letter, other)
'߷': Unicode U+07F7 (category Po: Punctuation, other)
julia> esc[1]
'€': Unicode U+20AC (category Sc: Symbol, currency)
julia> esc[2]
'𖣠': Unicode U+168E0 (category Lo: Letter, other)
julia> esc[3]
'߷': Unicode U+07F7 (category Po: Punctuation, other)
julia> strings = String[esc]
1-element Vector{String}:
"€𖣠߷"
julia> push!(strings, esc)
2-element Vector{String}:
"€𖣠߷"
"€𖣠߷"
julia> string_chars = EagerStringChars[s]
1-element Vector{EagerStringChars}:
['€', '𖣠', '߷']
We can also create a lazy version of this. Each time we index into the wrapper, we iterate through the underlying string and return the nth character.
julia> begin
# Lazy character vector: wraps `s` without copying it. Nothing is cached, so
# every scalar lookup re-scans the wrapped string from its beginning.
struct LazyStringChars{S} <: AbstractVector{Char}
    s::S
end

# The type is parametric, so the trait must be declared on `Type{<:LazyStringChars}`.
# A plain `Type{LazyStringChars}` matches only the UnionAll itself and would leave
# concrete types such as `LazyStringChars{String}` on the default index style.
Base.IndexStyle(::Type{<:LazyStringChars}) = IndexLinear()

# Scalar indexing only. Ranges, vectors and other index types are left to the
# generic AbstractArray fallbacks, which call this method once per element.
# (An untyped `i` would accept a range, never match it, and return `nothing`.)
# Throws `ArgumentError` when `i` is outside `1:length(lsc.s)`.
function Base.getindex(lsc::LazyStringChars, i::Int)
    # consider IterTools.nth
    checkindex(Bool, 1:length(lsc.s), i) ||
        throw(ArgumentError("Out of bounds index $i"))
    # Skip the first i - 1 characters and take the next one; the bounds check
    # above guarantees that it exists.
    return first(Iterators.drop(lsc.s, i - 1))
end

# One element per character of the wrapped string.
Base.size(lsc::LazyStringChars) = (length(lsc.s),)

# Conversions to and from strings, so typed containers such as `String[...]`
# and `LazyStringChars[...]` can hold either representation.
Base.convert(::Type{String}, lsc::LazyStringChars) = String(lsc.s)
Base.convert(::Type{LazyStringChars}, s::AbstractString) = LazyStringChars(s)
end
julia> lsc = LazyStringChars(s)
3-element LazyStringChars{String}:
'€': Unicode U+20AC (category Sc: Symbol, currency)
'𖣠': Unicode U+168E0 (category Lo: Letter, other)
'߷': Unicode U+07F7 (category Po: Punctuation, other)
julia> lsc[1]
'€': Unicode U+20AC (category Sc: Symbol, currency)
julia> lsc[2]
'𖣠': Unicode U+168E0 (category Lo: Letter, other)
julia> lsc[3]
'߷': Unicode U+07F7 (category Po: Punctuation, other)
julia> push!(strings, lsc)
3-element Vector{String}:
"€𖣠߷"
"€𖣠߷"
"€𖣠߷"
julia> lazy_string_chars = LazyStringChars[s]
1-element Vector{LazyStringChars}:
['€', '𖣠', '߷']