I have this code in FixedEffectModels where a function takes a DataFrame as its argument and returns a PooledDataArray. The goal of this function is to create a vector that indexes the groups defined by each distinct combination of values across the input vectors. For observations where one of the values is missing, the corresponding entry of the result is missing. For instance, we have
group([1, 2, 3], [1, 2, 3]) = [1, 2, 3]
group([4, 5, 6], [7, 8, 9]) = [1, 2, 3]
group([4, 5, 6], [NA, 8, 9]) = [NA, 1, 2]
I don’t know how to translate this code to use CategoricalArray. Could someone help me translate it?
"""
    group(x::AbstractVector)

Pool `x` and return a `PooledDataArray` that keeps the pooled reference codes but
whose pool is replaced by the integers `1:k`, with `k` the size of the pool of `x`.
"""
function group(x::AbstractVector)
    pooled = PooledDataArray(x)
    nlevels = length(pooled.pool)
    # The original levels are discarded; only their integer positions are kept.
    labels = collect(1:nlevels)
    return PooledDataArray(RefArray(pooled.refs), labels)
end
"""
    pool_combine!(x, dv, ngroups)

Fold the reference codes of `dv` into the running combined codes `x`, in place.

For each position `i`, the combined code becomes zero when either `x[i]` or
`dv.refs[i]` is zero (zero is the NA marker); otherwise it becomes
`x[i] + (dv.refs[i] - 1) * ngroups`.

Returns the tuple `(x, ngroups * length(dv.pool))`, i.e. the updated codes together
with the new upper bound on the number of distinct combined codes.
"""
function pool_combine!(x::Array{UInt64, T}, dv::PooledDataVector, ngroups::Integer) where {T}
    codes = dv.refs
    @inbounds for i in eachindex(x)
        if codes[i] == 0 || x[i] == zero(UInt64)
            # NA in the accumulated code or in this column: the row stays NA.
            x[i] = zero(UInt64)
        else
            x[i] += (codes[i] - 1) * ngroups
        end
    end
    return (x, ngroups * length(dv.pool))
end
"""
    group(df::AbstractDataFrame)

Build one combined integer code per row of `df` from the pooled reference codes of
its columns, then relabel those codes with `factorize!` and return the result.

With a single column this defers to `group(df[1])`. Otherwise the first column's
reference codes are taken as `UInt64` values and every further column is folded in
with `pool_combine!`, which keeps a code at zero (NA) as soon as one column has a
zero reference for that row.

Throws the string `"df is empty"` when `isempty(df)`.
"""
function group(df::AbstractDataFrame)
    isempty(df) && throw("df is empty")
    ncols = size(df, 2)
    v = df[1]
    ncols == 1 && return group(v)
    if v isa PooledDataVector
        # Always build a fresh UInt64 vector here. `convert(Array{UInt64}, v.refs)`
        # returns `v.refs` itself when it already has that type, and the in-place
        # combine / factorize steps below would then overwrite the caller's column.
        x = UInt64[r for r in v.refs]
    else
        v = PooledDataArray(v, v.na, UInt64)
        x = v.refs
    end
    ngroups = length(v.pool)
    for j in 2:ncols
        v = PooledDataArray(df[j])
        (x, ngroups) = pool_combine!(x, v, ngroups)
    end
    return factorize!(x)
end
"""
    reftype(sz)

Return the first of `UInt8`, `UInt16`, `UInt32` whose `typemax` is at least `sz`,
or `UInt64` when `sz` exceeds `typemax(UInt32)`.
"""
function reftype(sz)
    for T in (UInt8, UInt16, UInt32)
        sz <= typemax(T) && return T
    end
    return UInt64
end
"""
    factorize!(refs::Array)

Relabel the codes in `refs` in place with consecutive integers, following the sorted
order of the distinct codes, and wrap `refs` in a `PooledDataArray` whose pool is
`1:k`.

When the smallest distinct code is zero it is mapped to zero and the remaining codes
become `1, 2, …, k`; otherwise all codes become `1, 2, …, k`. An empty `refs` no
longer raises a `BoundsError` while inspecting the smallest code.
"""
function factorize!(refs::Array)
    uu = unique(refs)
    sort!(uu)
    # `uu` is empty when `refs` is, so guard the peek at its smallest element.
    has_na = !isempty(uu) && uu[1] == 0
    nlevels = length(uu) - has_na
    T = reftype(nlevels)
    # Sorted distinct codes are paired with 0:k when zero is present, else with 1:k.
    dict = Dict{eltype(refs), T}(zip(uu, (1 - has_na):convert(T, nlevels)))
    @inbounds for i in eachindex(refs)
        refs[i] = dict[refs[i]]
    end
    return PooledDataArray(RefArray(refs), collect(1:nlevels))
end