Error when processing data from file for linear regression

Leticia-maria · June 3, 2023, 11:13pm

Dear all,
I am trying to run the following code:

I extract the data from a Matlab file
I split train/test
I try to run linear regression

the following issues:

I wanna reduce allocations
I wanna make It faster

using MAT
using LinearAlgebra
using Statistics
using GLM
using CSV
using DataFrames
using Distributed
using Pkg
using MLDataUtils

# Reading files
files = readdir()
filename = [file for file in files if endswith(file, ".mat")][1]
println("filename: $filename")

inputs = matread(filename)
basename = filename[1:end-4]
subsets = [Int.(_sets) for _sets in inputs["subsetTab"]]
splits = inputs["splits"]
numSplits = Int(inputs["numSplits"][1])
cnSel = Int.(inputs["cnSel"])
nnSel = Int.(inputs["nnSel"])

X = []
for i in cnSel
    temp = []
    for j in nnSel
        push!(temp, inputs["tabSel"][i, :, :][j, :])
    end
    push!(X, temp)
end

y = []
for i in cnSel
    push!(y, inputs["h2OrdS"][i, :, 3])
end

function train_test_split(mango, subset)
    I = Int.(splits[mango][1])
    J = Int.(splits[mango][2])
    K = Int.(splits[mango][3])
    L = Int.(splits[mango][4])
    train, test = [I, J], [K, L]

    X_train = []

    y_train = []
    y_test = []

    for i in train[1]
        set = []
        for j in train[2]
            push!(set, X[i][j][subset])
        end
        push!(X_train, set)
    end

    for i in train[1]
        set = []
        for j in train[2]
            push!(set, y[i][j])
        end
        push!(y_train, set)
    end

    for i in test[1]
        set = []
        for j in test[2]
            push!(set, y[i][j])
        end
        push!(y_test, set)
    end

    X_test = []

    for i in test[1]
        set = []
        for j in test[2]
            push!(set, X[i][j][subset])
        end
        push!(X_test, set)
    end
    println(size(y_train))

    X_train = transpose(hcat(X_train...))
    println(size(X_train))
    y_train = transpose(hcat(y_train...))
    println(y_train)

    X_test = transpose(hcat(X_test...))
    y_test = transpose(hcat(y_test...))
    return X_train, y_train, X_test, y_test
end

function linear_regression(mango, subset)
    X_train, y_train, X_test, y_test = train_test_split(mango, subset)
    array = zeros(6)
    model = lm(X_train, y_train)
    y_train_pred, y_test_pred = predict(model, X_train), predict(model, X_test)
    array[1] = r2(y_train, y_train_pred)
    array[2] = r2(y_test, y_test_pred)
    y_train_pred, y_test_pred = reshape(y_train_pred, a, b), reshape(y_test_pred, c, d)
    array[3] = r2(mean(y_train, dims=1), mean(y_train_pred, dims=1))
    array[4] = r2(mean(y_test, dims=1), mean(y_test_pred, dims=1))
    array[5] = r2(mean(y_train, dims=2), mean(y_train_pred, dims=2))
    array[6] = r2(mean(y_test, dims=2), mean(y_test_pred, dims=2))
    return array
end

function r2_median(subset)
    scores = [linear_regression(mango, subset) for mango in 1:numSplits]
    return median(hcat(scores...), dims=2)
end

@everywhere function job(job_k, _some_sets)
    _some_results = [r2_median(subset) for subset in _some_sets]
    return (job_k, _some_results)
end

function eval_subsets(_sets)
    num_sets = length(_sets)
    _dim = length(_sets[1])
    num_cores = nworkers()
    if num_sets < 50
        _results = job(0, _sets)
        _results = _results[2]
    else
        _set_length = ceil(length(_sets)/num_cores)
        _results = @distributed (vcat) for i in 1:num_cores
            job(i, _sets[(i-1)*_set_length+1:min(i*_set_length, end)])
        end
        _results = [_result for _result in _results if length(_result[2]) > 0]
        _results = vcat([_result[2] for _result in sort(_results, by = x -> x[1])]...)
    end
    new_name = basename * "_{$dim}_medR2.csv"
    CSV.write(new_name, DataFrame(_results))
end

for _sets in subsets
    eval_subsets(_sets)
end

the error

ERROR: LoadError: MethodError: no method matching fit(::Type{LinearModel}, ::Transpose{Float64, Matrix{Float64}}, ::Transpose{Float64, Matrix{Float64}}, ::Nothing)

Closest candidates are:
  fit(::Type{LinearModel}, ::AbstractMatrix{<:Real}, !Matched::AbstractVector{<:Real}, ::Union{Nothing, Bool}; wts, dropcollinear)
   @ GLM ~/.julia/packages/GLM/6ycOV/src/lm.jl:134
  fit(::Type{T}, !Matched::FormulaTerm, ::Any, ::Any...; contrasts, kwargs...) where T<:RegressionModel
   @ StatsModels ~/.julia/packages/StatsModels/Wzvuu/src/statsmodel.jl:78
  fit(::Type{T}, !Matched::FormulaTerm, ::Any, ::Any...; contrasts, kwargs...) where T<:StatisticalModel
   @ StatsModels ~/.julia/packages/StatsModels/Wzvuu/src/statsmodel.jl:78
  ...

Stacktrace:
  [1] lm(X::Transpose{Float64, Matrix{Float64}}, y::Transpose{Float64, Matrix{Float64}}, allowrankdeficient_dep::Nothing; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ GLM ~/.julia/packages/GLM/6ycOV/src/lm.jl:157
  [2] lm(X::Transpose{Float64, Matrix{Float64}}, y::Transpose{Float64, Matrix{Float64}}, allowrankdeficient_dep::Nothing)
    @ GLM ~/.julia/packages/GLM/6ycOV/src/lm.jl:157
  [3] lm(X::Transpose{Float64, Matrix{Float64}}, y::Transpose{Float64, Matrix{Float64}})
    @ GLM ~/.julia/packages/GLM/6ycOV/src/lm.jl:157
  [4] linear_regression(mango::Int64, subset::Int64)
    @ Main ~/increase_dimensions/test.jl:98
  [5] #7
    @ ./none:0 [inlined]
  [6] iterate
    @ ./generator.jl:47 [inlined]
  [7] collect(itr::Base.Generator{UnitRange{Int64}, var"#7#8"{Int64}})
    @ Base ./array.jl:782
  [8] r2_median(subset::Int64)
    @ Main ~/increase_dimensions/test.jl:111
  [9] #9
    @ ./none:0 [inlined]
 [10] iterate
    @ ./generator.jl:47 [inlined]
 [11] collect(itr::Base.Generator{Matrix{Int64}, var"#9#10"})
    @ Base ./array.jl:782
 [12] job
    @ ~/increase_dimensions/test.jl:116 [inlined]
 [13] eval_subsets(_sets::Matrix{Int64})
    @ Main ~/increase_dimensions/test.jl:125
 [14] top-level scope
    @ ~/increase_dimensions/test.jl:140
in expression starting at /Users/leticiamadureira/increase_dimensions/test.jl:139

nilshg · June 4, 2023, 8:29am

Why do you think your code is slow/allocates too much? What level of performance were you expecting?

Your code is quite long, does different things, and isn’t reproducible as it relies on some Matlab file which only you have access to. If you want to make it easier for people to help you I would first of all work out which part of the code you think is slow and why you think it is slow (eg x times slower than Matlab/C/rust or whatever), and then make a reproducible example, which might mean writing a few lines of code to generate fake data for people to use.

Generally speaking though this pattern should be avoided in performance critical code:

Leticia-maria:

X_train = []

    y_train = []
    y_test = []

    for i in train[1]
        set = []
        for j in train[2]
            push!(set, X[i][j][subset])
        end
        push!(X_train, set)
    end

Here [] creates an array of type Any, which means the compiler can’t optimise the code it generates.

There’s more stuff in your code like all the transpose(hcat(x...)) which looks pretty Matlabby and is probably suboptimal, but it’s hard to say if and how that could be improved without some dummy data.

Leticia-maria · June 4, 2023, 2:31pm

I was unable to find the specific formatting of the inputs in the GLM package using the lm function to run the linear regression. This is the part I want to accelerate (the linear regression), but it is not running bc I am getting this error:

ERROR: LoadError: MethodError: no method matching fit(::Type{LinearModel}, ::Transpose{Float64, Matrix{Float64}}, ::Transpose{Float64, Matrix{Float64}}, ::Nothing)

Closest candidates are:
  fit(::Type{LinearModel}, ::AbstractMatrix{<:Real}, !Matched::AbstractVector{<:Real}, ::Union{Nothing, Bool}; wts, dropcollinear)
   @ GLM ~/.julia/packages/GLM/6ycOV/src/lm.jl:134
  fit(::Type{T}, !Matched::FormulaTerm, ::Any, ::Any...; contrasts, kwargs...) where T<:RegressionModel
   @ StatsModels ~/.julia/packages/StatsModels/Wzvuu/src/statsmodel.jl:78
  fit(::Type{T}, !Matched::FormulaTerm, ::Any, ::Any...; contrasts, kwargs...) where T<:StatisticalModel
   @ StatsModels ~/.julia/packages/StatsModels/Wzvuu/src/statsmodel.jl:78
  ...

Stacktrace:
  [1] lm(X::Transpose{Float64, Matrix{Float64}}, y::Transpose{Float64, Matrix{Float64}}, allowrankdeficient_dep::Nothing; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ GLM ~/.julia/packages/GLM/6ycOV/src/lm.jl:157
  [2] lm(X::Transpose{Float64, Matrix{Float64}}, y::Transpose{Float64, Matrix{Float64}}, allowrankdeficient_dep::Nothing)
    @ GLM ~/.julia/packages/GLM/6ycOV/src/lm.jl:157
  [3] lm(X::Transpose{Float64, Matrix{Float64}}, y::Transpose{Float64, Matrix{Float64}})
    @ GLM ~/.julia/packages/GLM/6ycOV/src/lm.jl:157
  [4] linear_regression(mango::Int64, subset::Int64)
    @ Main ~/increase_dimensions/test.jl:98
  [5] #7
    @ ./none:0 [inlined]
  [6] iterate
    @ ./generator.jl:47 [inlined]
  [7] collect(itr::Base.Generator{UnitRange{Int64}, var"#7#8"{Int64}})
    @ Base ./array.jl:782
  [8] r2_median(subset::Int64)
    @ Main ~/increase_dimensions/test.jl:111
  [9] #9
    @ ./none:0 [inlined]
 [10] iterate
    @ ./generator.jl:47 [inlined]
 [11] collect(itr::Base.Generator{Matrix{Int64}, var"#9#10"})
    @ Base ./array.jl:782
 [12] job
    @ ~/increase_dimensions/test.jl:116 [inlined]
 [13] eval_subsets(_sets::Matrix{Int64})
    @ Main ~/increase_dimensions/test.jl:125
 [14] top-level scope
    @ ~/increase_dimensions/test.jl:140
in expression starting at /Users/leticiamadureira/increase_dimensions/test.jl:139

kellertuer · June 4, 2023, 3:07pm

Since I can not run the code, I can maybe at least provide an idea. it seems within r2_median you have the line

    scores = [linear_regression(mango, subset) for mango in 1:numSplits]

which in the end calls the fit function it seems, but with transposes of matrices, not with matrices.
When you work with transposes, this often does not make much a difference, but it seems the fit function requires matrices.

So take your transpose matrix, and force it to be a matrix again, see

 A = transpose(zeros(3,3)) ' is a transpose (i.e. does not allocate memory and reuses the memory of the zeros
B =  Matrix(A) # is a matrix (allocates memory), but might be necessary in your code

I was not directly able to pin point where that happens in your code, but you have several transposes, that might cause this.

nilshg · June 4, 2023, 5:35pm

That’s why I made the point earlier that it is helpful to keep your questions narrowly focused and provide working examples. There are certainly nice people on here like Ronny who will try to guess at a solution from non-running code, but you’re making their life harder and decrease the likelihood of getting a useful answer.

In this case however it is possible to work out what’s wrong from the stack trace, the clue is here:

Closest candidates are:
  fit(::Type{LinearModel}, ::AbstractMatrix{<:Real}, !Matched::AbstractVector{<:Real}, ::Union{Nothing, Bool};

this tells you that the error is the second input into lm (which under the hood calls fit with LinearModel as the first argument) - the !Matched::AbstractVector{<:Real} says the closest available method to what you are trying to call needs a vector as an argument where you provided a matrix.

An MWE is:

julia> using GLM

julia> lm(rand(2, 2), rand(2, 1))
ERROR: MethodError: no method matching fit(::Type{LinearModel}, ::Matrix{Float64}, ::Matrix{Float64}, ::Nothing)

Closest candidates are:
  fit(::Type{LinearModel}, ::AbstractMatrix{<:Real}, ::AbstractVector{<:Real}, ::Union{Nothing, Bool}; wts, dropcollinear)
   @ GLM ~/.julia/packages/GLM/6ycOV/src/lm.jl:134
  fit(::Type{D}, ::Any...) where D<:Distributions.Distribution
   @ Distributions ~/.julia/packages/Distributions/uREy8/src/genericfit.jl:34
  ...

julia> lm(rand(2, 2), rand(2))
LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}:

Coefficients:
────────────────────────────────────────────────────────────────
        Coef.  Std. Error      t  Pr(>|t|)  Lower 95%  Upper 95%
────────────────────────────────────────────────────────────────
x1  -0.513871         Inf  -0.00    1.0000       -Inf        Inf
x2   0.556908         Inf   0.00    1.0000       -Inf        Inf
────────────────────────────────────────────────────────────────

here rand(2,1) produces a 2-by-1 matrix, while rand(2) is a vector. The left hand side variable of the regression has to be a vector in GLM, and all your transpose(hcat(...)) business creates matrices where you shouldn’t be using matrices.

kellertuer · June 4, 2023, 5:46pm

Indeed, your analysis is more precise than mine, that is more probably the actual reason for the error.

Leticia-maria · June 5, 2023, 1:02am

Thank you so much for the help! And I apologize for the topic I have opened here if it was not following what was expected.

kellertuer · June 5, 2023, 6:22am

Oh, you do not have to apologise, sometimes it is indeed complicated to narrow things down to an MnWE (Minimal non-Working Example); but in my experience, trying to narrow down your error to something like 10-20 lines of codes is also helpful to describe where the error probably is.
I even had a few problems by now, where I found the solution while writing the forum post.

And – compared to Nils and my more like “guess-work” what the error might be – having 20 lines I can copy and run and see the error myself makes it easier to help.

Besides that – great that the tips from Nils helped!

Topic		Replies	Views
Accelerating linear methods Machine Learning question	11	689	June 3, 2023
Parallel/multithreading version of linear regression Performance performance , parallel , multithreading , distributed , regression	19	1963	October 28, 2020
Why is my code very slow General Usage performance	16	679	June 23, 2023
Three million linear regressions Performance	4	464	May 23, 2022
Error in mul!(c,a,b) if size(b) = (n,1) General Usage	28	1524	October 10, 2020

Error when processing data from file for linear regression

Related topics