I need to reshape large dataframes from Wide to Long format.
The input looks like this, with a unique ID per row, and multiple columns with a date in the name and multiple columns without the date.
# R code
# Example wide table: one row per ID, measure columns named NAME_YEAR, plus a plain id column XX.
my <- data.table(ID=1:5, AA_2001=rnorm(5), BB_2001=rnorm(5), BB_2002=rnorm(5), CC_2001=rnorm(5), CC_2003=rnorm(5), XX=sample(5,5))
# Julia Code
# Same example wide table, built with DataFrames.jl: NAME_YEAR measure columns plus the XX id column.
WideDF1 = DataFrame(ID=1:5,AA_2001=randn(5), BB_2001=randn(5), BB_2002=randn(5), CC_2001=randn(5), CC_2003=randn(5), XX=rand(1:5,5))
ID AA_2001 BB_2001 BB_2002 CC_2001 CC_2003 XX
1 1.6045664 -1.0564860 0.4180849 2.05674938 -0.2889240 4
2 0.2872760 0.6051825 -0.9855341 -1.07268299 -0.4726669 2
3 -0.6698151 0.8730640 -0.7799342 -1.42078540 0.7218626 5
4 1.1806024 1.0190259 -1.5307503 -0.01904991 -1.0250547 1
5 -0.3484850 0.7155088 -0.8453928 -1.10998317 1.4003273 3
And I want to convert it to:
ID XX year AA BB CC
1 4 2001 1.6045664 -1.0564860 2.05674938
1 4 2002 NA 0.4180849 NA
1 4 2003 NA NA -0.28892400
2 2 2001 0.2872760 0.6051825 -1.07268299
2 2 2002 NA -0.9855341 NA
2 2 2003 NA NA -0.47266691
3 5 2001 -0.6698151 0.8730640 -1.42078540
3 5 2002 NA -0.7799342 NA
3 5 2003 NA NA 0.72186256
4 1 2001 1.1806024 1.0190259 -0.01904991
4 1 2002 NA -1.5307503 NA
4 1 2003 NA NA -1.02505470
5 3 2001 -0.3484850 0.7155088 -1.10998317
5 3 2002 NA -0.8453928 NA
5 3 2003 NA NA 1.40032729
I was doing it with R. But the real data is about 1GB or more; it loads fine, but reshaping it produces memory errors. So I have had to split it, reshape each chunk and merge them back. With this method I can achieve it, but it's too slow.
Now I'm trying to move to Julia.
In order to compare times and memory I first show you two codes in R:
# Direct version but slow and needs a lot of memory.
# Direct version: melt everything, split the "NAME_YYYY" column names,
# then dcast back to one column per NAME. Slow and memory-hungry because
# the intermediate long table holds one row per cell.
RlongA <- function(my) {
  # id columns are the ones WITHOUT a "_YYYY" suffix
  idvars <- grep("_20[0-9][0-9]$", names(my), invert = TRUE, value = TRUE)
  temp <- melt(my, id.vars = idvars)
  # split "NAME_YYYY" into the measure name and the year, drop the fused column
  temp[, `:=`(var = sub("_20[0-9][0-9]$", "", variable),
              yyear = sub(".*_", "", variable),
              variable = NULL)]
  # factor in first-appearance order so dcast keeps the original column order
  temp[, var := factor(var, levels = unique(var))]
  dcast(temp, ... ~ var, value.var = "value")
}
# Alternative, less memory.
# This method keeps the original column order
# Memory-lighter alternative: melt only the measure columns with a single
# id column, reshape, then join the untouched columns back on by ID.
# This method keeps the original column order.
RLongB <- function(my) {
  mcols <- grep("_20[0-9][0-9]$", names(my), value = TRUE)    # columns to be reshaped
  temp <- melt(my, id.vars = "ID", measure.vars = mcols)      # melt with only the ID column
  temp[, c("variable", "yyear") := tstrsplit(variable, "_")]  # split "NAME_YYYY" names
  # factor in first-appearance order so dcast keeps the same order as `my`
  temp[, variable := factor(variable, levels = unique(variable))]
  # final dcast, then right join the non-reshaped columns back on ID
  my[, .SD, .SDcols = -mcols][dcast(temp, ... ~ variable), on = "ID"]
}
I'm new to Julia, but I managed a rough translation to Julia.
# Direct version: stack every "NAME_YYYY" column, split the fused names,
# then unstack to one column per NAME.
function JLongA(Wide)
    tempnames = names(Wide)
    # id columns are the ones WITHOUT a "_YYYY" suffix
    scols = tempnames[.!occursin.(r"_20[0-9][0-9]$", tempnames)]
    # `stack` is the current DataFrames API (`melt` was deprecated and removed)
    tempDF = stack(Wide, Not(scols), scols)
    # Split "NAME_YYYY" in a single pass with ByRow instead of building a
    # temporary matrix via reduce(vcat, permutedims.(...)), which allocates
    # one tiny matrix per row and dominated the memory use.
    transform!(tempDF, :variable => ByRow(v -> Tuple(split(v, '_'))) => [:name, :year])
    select!(tempDF, Not(:variable))
    unstack(tempDF, vcat(scols, "year"), :name, :value)
end
# Alternative
# Alternative: stack only the measure columns keyed by :ID, reshape,
# then join the untouched columns back on ID.
function JLongB(Wide)
    tempnames = names(Wide)
    ismeasure = occursin.(r"_20[0-9][0-9]$", tempnames)  # scan the regex once
    mcols = tempnames[ismeasure]     # columns to be reshaped
    scols = tempnames[.!ismeasure]   # columns carried over unchanged
    tempDF = stack(Wide, mcols, :ID)
    # Split "NAME_YYYY" row-wise; avoids the per-row matrix allocations of
    # reduce(vcat, permutedims.(split.(...))) used before.
    transform!(tempDF, :variable => ByRow(v -> Tuple(split(v, '_'))) => [:name, :year])
    select!(tempDF, Not(:variable))
    tempDF2 = unstack(tempDF, [:ID, :year], :name, :value)
    return innerjoin(tempDF2, Wide[!, scols], on = :ID)
end
In order to benchmark it I have also created a larger toy example:
# R
nn <- 200  # number of NAME_YEAR measure columns (and of plain columns)
mm <- 200  # number of rows
WideDF <- as.data.table(matrix(rnorm(2 * nn * mm), mm, 2 * nn))
# Half the columns get a random name with a "_YYYY" suffix, half a plain "Z..." name.
# Note: sampling years WITH replacement, spelled out (was positional `T`).
nam1 <- paste0(stri_rand_strings(nn, 6, pattern = "[a-z]"), "_", sample(2000:2010, nn, replace = TRUE))
nam2 <- paste0("Z", stri_rand_strings(nn, 6, pattern = "[a-z]"))
setnames(WideDF, c(nam1, nam2))
WideDF[, ID := 1:mm]
#Julia Version.
NN = 200  # number of NAME_YEAR measure columns (and of plain columns)
MM = 200  # number of rows
# Be careful with possible duplicated names, because they are generated randomly.
nam = vcat([randstring('a':'z', 6) for _ in 1:NN] .* '_' .* string.(sample(2000:2010, NN)),
           'Z' .* [randstring('a':'z', 6) for _ in 1:NN])
WideDF2 = DataFrame(randn(MM, 2 * NN), nam)
# insertcols! is the DataFrames API for adding a column at a position
# (Base.insert! is not defined for a DataFrame).
insertcols!(WideDF2, 1, :ID => 1:MM)
With nn=mm=200 and the "direct code", my computer needs 1.3 sec and 80MB in R, and 23 sec and 14GB in Julia.
How can I improve it?
The Julia "direct code" can't be applied to larger datasets on my computer.
And with nn=mm=200, the "alternative code" needs 100ms and 22MB in R, and 29ms and 43MB in Julia.
How can I improve it?
With the "alternative code" with nn=2000 and mm=2000, R needs 6 secs and 2.1GB, and Julia 4.6 secs and 3.9GB. The direct code doesn't work here.
But what I really need to do is nn=2000 and mm=200000. The problem is memory, not time.
I think the best option would be to do it using a database program or something like JuliaDB directly on disk, but they don't seem to provide these commands.