It’s not a big problem to make everything manually, but that wouldn’t save you from the lack of memory.
Here is somewhat hairy code
using BenchmarkTools
using DataFrames
using BenchmarkTools
using Random
using StableRNGs
using StatsBase
function gen_example(; NN = 200, MM = 200, rng = Random.GLOBAL_RNG)
# Be careful with possible duplicated names, because they are generated randomly.
nam = vcat([randstring(rng, 'a':'z',6) for _ ∈ 1:NN] .* '_' .* string.(sample(rng, 2000:2010, NN)), 'Z' .* [randstring(rng, 'a':'z',6) for _ ∈ 1:(NN)])
WideDF2 = DataFrame( randn(rng, MM, 2*NN), nam)
insert!(WideDF2, 1,1:MM, :ID )
return WideDF2
end
function repin(v, n)
w = Vector{eltype(v)}(undef, length(v) * n)
idx = 1
@inbounds @simd for x in v
for j in 1:n
w[idx] = x
idx += 1
end
end
return w
end
function repout(v, n)
w = Vector{eltype(v)}(undef, length(v) * n)
idx = 1
@inbounds @simd for j in 1:n
for x in v
w[idx] = x
idx += 1
end
end
return w
end
function repmerge(v1, v2, val)
w = Vector{Union{eltype(v1), Missing}}(undef, length(v1)*length(v2))
idx = 1
@inbounds @simd for v in v1
for x in v2
w[idx] = x == val ? v : missing
idx += 1
end
end
return w
end
function repmerge!(w, v1, v2, val)
idx = 1
@inbounds @simd for v in v1
for x in v2
if x == val
w[idx] = v
end
idx += 1
end
end
return w
end
function JLongC(df)
tempnames = names(df)
mcols=tempnames[occursin.(r"_20[0-9][0-9]$",tempnames)]
scols=tempnames[Not(occursin.(r"_20[0-9][0-9]$",tempnames))]
splitted = map(x -> (x[1], parse(Int, x[2])), split.(mcols, "_"))
years = unique(map(x -> x[2], splitted))
resdf = DataFrame(scols[1] => repin(df[!, scols[1]], length(years)))
for i in 2:length(scols)
resdf[scols[i]] = repin(df[!, scols[i]], length(years))
end
resdf[:year] = repout(years, size(df, 1))
for i in 1:length(mcols)
colname, year = splitted[i]
origname = mcols[i]
if colname in names(resdf)
repmerge!(resdf[!, colname], df[!, origname], years, year)
else
resdf[colname] = repmerge(df[!, origname], years, year)
end
end
resdf
end
df = gen_example(NN = 200, MM = 200);
julia> @btime JLongB($df);
30.448 ms (515721 allocations: 49.56 MiB)
julia> @btime JLongC($df);
7.081 ms (306583 allocations: 23.59 MiB)
I think, you can try to wrap all vectors in Mmap, maybe this will help you to overcome memory problem.