This code seems to work, but I haven’t checked it carefully yet and it’s not complete. I should really get this into CategoricalArrays or Arrow.jl.
# Arrow extension-type name under which categorical values are registered.
const CATARRAY_ARROWNAME = Symbol("JuliaLang.CategoricalArray")

# Every `CategoricalValue` type reports the extension name above; the reference type `R`
# is recorded as the metadata string (e.g. "UInt32").
ArrowTypes.arrowname(::Type{<:CategoricalValue}) = CATARRAY_ARROWNAME
ArrowTypes.arrowmetadata(::Type{CategoricalValue{T, R}}) where {T, R} = string(R)

# Same registration for element types that also allow `missing`.
# `Type{Union{<:CategoricalValue, Missing}}` is the `Type` of a single UnionAll, so that
# method only applies to the unparameterised `Union{CategoricalValue, Missing}`. It is kept
# for compatibility; the parameterised method below is the one that matches concrete
# element types such as `Union{CategoricalValue{String, UInt32}, Missing}`, mirroring the
# `arrowmetadata` signature.
ArrowTypes.arrowname(::Type{Union{<:CategoricalValue, Missing}}) = CATARRAY_ARROWNAME
function ArrowTypes.arrowname(::Type{Union{CategoricalValue{T, R}, Missing}}) where {T, R}
    return CATARRAY_ARROWNAME
end
ArrowTypes.arrowmetadata(::Type{Union{CategoricalValue{T, R}, Missing}}) where {T, R} = string(R)

# Maps the metadata string written by `arrowmetadata` back to the integer reference type.
const REFTYPES = Dict(string(T) => T for T in (Int128, Int16, Int32, Int64, Int8, UInt128, UInt16, UInt32, UInt64, UInt8))
# Reconstruct the Julia element type for the categorical extension type.
# `S` is the type passed in by the caller for the stored values and `meta` is the metadata
# string naming the reference type; it is looked up in `REFTYPES`, so a string that is not
# one of its keys raises a `KeyError`.
function ArrowTypes.JuliaType(::Val{CATARRAY_ARROWNAME}, ::Type{S}, meta::String) where {S}
    return CategoricalValue{S, REFTYPES[meta]}
end
# Build the dictionary encoding for `CategoricalValue{T, R}` elements whose dictionary
# values arrive as an `Arrow.List`.
# The list is rebuilt from the same fields with element type `T`, then wrapped in a
# `CategoricalVector{T, R}` that uses those values as its levels. The returned encoding is
# parameterised on the type of that categorical vector rather than on the requested `A`.
function Arrow.DictEncoding{V,S,A}(id, data::Arrow.List{U, O, B}, isOrdered, metadata) where {T, R, V<:CategoricalValue{T,R}, S, O, A, B, U}
    retyped = Arrow.List{T, O, B}(
        data.arrow, data.validity, data.offsets, data.data, data.ℓ, data.metadata
    )
    dictionary = CategoricalVector{T,R}(retyped; levels=retyped)
    return Arrow.DictEncoding{V,S,typeof(dictionary)}(id, dictionary, isOrdered, metadata)
end
# Build the dictionary encoding for `CategoricalValue{T, R}` elements whose dictionary
# values arrive as an `Arrow.Primitive`.
# The primitive array is rebuilt from the same fields with element type `T`, then wrapped
# in a `CategoricalVector{T, R}` that uses those values as its levels. The returned
# encoding is parameterised on the type of that categorical vector rather than on `A`.
function Arrow.DictEncoding{V,S,A}(id, data::Arrow.Primitive{U, B}, isOrdered, metadata) where {T, R, V<:CategoricalValue{T,R}, S, A, B, U}
    retyped = Arrow.Primitive{T, B}(
        data.arrow, data.validity, data.data, data.ℓ, data.metadata
    )
    dictionary = CategoricalVector{T,R}(retyped; levels=retyped)
    return Arrow.DictEncoding{V,S,typeof(dictionary)}(id, dictionary, isOrdered, metadata)
end
# Build the dictionary encoding for `Union{Missing, CategoricalValue{T, R}}` elements
# whose dictionary values arrive as an `Arrow.List`.
# The list is rebuilt from the same fields with element type `Union{Missing, T}`; its
# non-missing entries, in order, become the levels of the wrapping `CategoricalVector`.
# The returned encoding is parameterised on the type of that categorical vector.
function Arrow.DictEncoding{Union{Missing,V},S,A}(id, data::Arrow.List{U, O, B}, isOrdered, metadata) where {T, R, V<:CategoricalValue{T,R}, S, O, A, B, U}
    retyped = Arrow.List{Union{Missing,T}, O, B}(
        data.arrow, data.validity, data.offsets, data.data, data.ℓ, data.metadata
    )
    present = collect(skipmissing(retyped))
    dictionary = CategoricalVector{Union{Missing,T},R}(retyped; levels=present)
    return Arrow.DictEncoding{Union{Missing,V},S,typeof(dictionary)}(
        id, dictionary, isOrdered, metadata
    )
end
# Build the dictionary encoding for `Union{Missing, CategoricalValue{T, R}}` elements
# whose dictionary values arrive as an `Arrow.Primitive`.
# The primitive array is rebuilt from the same fields with element type
# `Union{Missing, T}`; its non-missing entries, in order, become the levels of the wrapping
# `CategoricalVector`. The returned encoding is parameterised on the type of that
# categorical vector rather than on the requested `A`.
function Arrow.DictEncoding{Union{Missing,V},S,A}(id, data::Arrow.Primitive{U, B}, isOrdered, metadata) where {T, R, V<:CategoricalValue{T,R}, S, A, B, U}
    newdata = Arrow.Primitive{Union{Missing,T}, B}(
        data.arrow, data.validity, data.data, data.ℓ, data.metadata
    )
    present = collect(skipmissing(newdata))
    catdata = CategoricalVector{Union{Missing,T},R}(newdata; levels=present)
    # Keep the requested second type parameter `S`, exactly as the other three
    # constructors do; the categorical reference type `R` does not belong in that slot.
    return Arrow.DictEncoding{Union{Missing,V},S,typeof(catdata)}(
        id, catdata, isOrdered, metadata
    )
end
# Materialise a dictionary-encoded column of `CategoricalValue{T, R}` as a
# `CategoricalVector{T}`.
# The pool is built from the encoding's dictionary data, and each stored index is shifted
# up by one to form the reference codes.
function Base.copy(x::Arrow.DictEncoded{V}) where {T, R, V<:CategoricalValue{T, R}}
    pool = CategoricalArrays.CategoricalPool{T, R}(x.encoding.data)
    shift = one(R)
    refs = similar(x.indices, R)
    refs .= x.indices .+ shift
    return CategoricalVector{T}(refs, pool)
end
# Materialise a dictionary-encoded column of `Union{Missing, CategoricalValue{T, R}}` as a
# `CategoricalVector{Union{Missing, T}}`.
#
# The pool holds the dictionary's non-missing entries in order. Each stored index is
# treated as a 0-based position into the dictionary and translated to a reference code:
# zero when that dictionary slot is `missing`, otherwise the 1-based rank of the slot
# among the non-missing entries. This covers `missing` in any slot (first, last, middle),
# a dictionary with no `missing` at all, and an empty dictionary.
function Base.copy(x::Arrow.DictEncoded{Union{Missing, V}}) where {T, R, V<:CategoricalValue{T, R}}
    dict = x.encoding.data
    pool = CategoricalArrays.CategoricalPool{T, R}(collect(skipmissing(dict)))

    # lookup[i + 1] is the reference code for stored index i.
    lookup = Vector{R}(undef, length(dict))
    rank = zero(R)
    for (pos, value) in enumerate(dict)
        if ismissing(value)
            lookup[pos] = zero(R)
        else
            rank += one(R)
            lookup[pos] = rank
        end
    end

    inds = x.indices
    refs = similar(inds, R)
    map!(i -> lookup[i + 1], refs, inds)
    return CategoricalVector{Union{Missing,T}}(refs, pool)
end
EDIT: I’ve improved the implementation a bit