Memory efficient melting and stacking? Reshaping Wide to Long

It’s not a big problem to make everything manually, but that wouldn’t save you from the lack of memory.

Here is somewhat hairy code

using BenchmarkTools
using DataFrames
using BenchmarkTools
using Random
using StableRNGs
using StatsBase

function gen_example(; NN = 200, MM = 200, rng = Random.GLOBAL_RNG)
    # Be careful with possible duplicated names, because they are generated randomly.
    nam = vcat([randstring(rng, 'a':'z',6) for _ ∈ 1:NN] .* '_' .* string.(sample(rng, 2000:2010, NN)), 'Z' .* [randstring(rng, 'a':'z',6) for _ ∈ 1:(NN)])
    WideDF2 = DataFrame( randn(rng, MM, 2*NN), nam)
    insert!(WideDF2, 1,1:MM, :ID )

    return WideDF2
end

function repin(v, n)
    w = Vector{eltype(v)}(undef, length(v) * n)
    idx = 1
    @inbounds @simd for x in v
        for j in 1:n
            w[idx] = x
            idx += 1
        end
    end
    return w
end

function repout(v, n)
    w = Vector{eltype(v)}(undef, length(v) * n)
    idx = 1
    @inbounds @simd for j in 1:n
        for x in v
            w[idx] = x
            idx += 1
        end
    end
    return w
end

function repmerge(v1, v2, val)
    w = Vector{Union{eltype(v1), Missing}}(undef, length(v1)*length(v2))
    idx = 1
    @inbounds @simd for v in v1
        for x in v2
            w[idx] = x == val ? v : missing
            idx += 1
        end
    end
    return w
end

function repmerge!(w, v1, v2, val)
    idx = 1
    @inbounds @simd for v in v1
        for x in v2
            if x == val
                w[idx] = v
            end
            idx += 1
        end
    end
    return w
end

function JLongC(df)
    tempnames = names(df)
    mcols=tempnames[occursin.(r"_20[0-9][0-9]$",tempnames)]
    scols=tempnames[Not(occursin.(r"_20[0-9][0-9]$",tempnames))]
    splitted = map(x -> (x[1], parse(Int, x[2])), split.(mcols, "_"))
    years = unique(map(x -> x[2], splitted))

    resdf = DataFrame(scols[1] => repin(df[!, scols[1]], length(years)))
    for i in 2:length(scols)
        resdf[scols[i]] = repin(df[!, scols[i]], length(years))
    end
    resdf[:year] = repout(years, size(df, 1))
    for i in 1:length(mcols)
        colname, year = splitted[i]
        origname = mcols[i]
        if colname in names(resdf)
            repmerge!(resdf[!, colname], df[!, origname], years, year)
        else
            resdf[colname] = repmerge(df[!, origname], years, year)
        end
    end
    resdf
end

df = gen_example(NN = 200, MM = 200);

julia> @btime JLongB($df);
  30.448 ms (515721 allocations: 49.56 MiB)

julia> @btime JLongC($df);
  7.081 ms (306583 allocations: 23.59 MiB)

I think, you can try to wrap all vectors in Mmap, maybe this will help you to overcome memory problem.

1 Like