Dear all,
I am trying to run the following code:
- I extract the data from a Matlab file
- I split train/test
- I try to run linear regression
the following issues:
- I wanna reduce allocations
- I wanna make It faster
using MAT
using LinearAlgebra
using Statistics
using GLM
using CSV
using DataFrames
using Distributed
using Pkg
using MLDataUtils
# Reading files
files = readdir()
filename = [file for file in files if endswith(file, ".mat")][1]
println("filename: $filename")
inputs = matread(filename)
basename = filename[1:end-4]
subsets = [Int.(_sets) for _sets in inputs["subsetTab"]]
splits = inputs["splits"]
numSplits = Int(inputs["numSplits"][1])
cnSel = Int.(inputs["cnSel"])
nnSel = Int.(inputs["nnSel"])
X = []
for i in cnSel
temp = []
for j in nnSel
push!(temp, inputs["tabSel"][i, :, :][j, :])
end
push!(X, temp)
end
y = []
for i in cnSel
push!(y, inputs["h2OrdS"][i, :, 3])
end
function train_test_split(mango, subset)
I = Int.(splits[mango][1])
J = Int.(splits[mango][2])
K = Int.(splits[mango][3])
L = Int.(splits[mango][4])
train, test = [I, J], [K, L]
X_train = []
y_train = []
y_test = []
for i in train[1]
set = []
for j in train[2]
push!(set, X[i][j][subset])
end
push!(X_train, set)
end
for i in train[1]
set = []
for j in train[2]
push!(set, y[i][j])
end
push!(y_train, set)
end
for i in test[1]
set = []
for j in test[2]
push!(set, y[i][j])
end
push!(y_test, set)
end
X_test = []
for i in test[1]
set = []
for j in test[2]
push!(set, X[i][j][subset])
end
push!(X_test, set)
end
println(size(y_train))
X_train = transpose(hcat(X_train...))
println(size(X_train))
y_train = transpose(hcat(y_train...))
println(y_train)
X_test = transpose(hcat(X_test...))
y_test = transpose(hcat(y_test...))
return X_train, y_train, X_test, y_test
end
function linear_regression(mango, subset)
X_train, y_train, X_test, y_test = train_test_split(mango, subset)
array = zeros(6)
model = lm(X_train, y_train)
y_train_pred, y_test_pred = predict(model, X_train), predict(model, X_test)
array[1] = r2(y_train, y_train_pred)
array[2] = r2(y_test, y_test_pred)
y_train_pred, y_test_pred = reshape(y_train_pred, a, b), reshape(y_test_pred, c, d)
array[3] = r2(mean(y_train, dims=1), mean(y_train_pred, dims=1))
array[4] = r2(mean(y_test, dims=1), mean(y_test_pred, dims=1))
array[5] = r2(mean(y_train, dims=2), mean(y_train_pred, dims=2))
array[6] = r2(mean(y_test, dims=2), mean(y_test_pred, dims=2))
return array
end
function r2_median(subset)
scores = [linear_regression(mango, subset) for mango in 1:numSplits]
return median(hcat(scores...), dims=2)
end
@everywhere function job(job_k, _some_sets)
_some_results = [r2_median(subset) for subset in _some_sets]
return (job_k, _some_results)
end
function eval_subsets(_sets)
num_sets = length(_sets)
_dim = length(_sets[1])
num_cores = nworkers()
if num_sets < 50
_results = job(0, _sets)
_results = _results[2]
else
_set_length = ceil(length(_sets)/num_cores)
_results = @distributed (vcat) for i in 1:num_cores
job(i, _sets[(i-1)*_set_length+1:min(i*_set_length, end)])
end
_results = [_result for _result in _results if length(_result[2]) > 0]
_results = vcat([_result[2] for _result in sort(_results, by = x -> x[1])]...)
end
new_name = basename * "_{$dim}_medR2.csv"
CSV.write(new_name, DataFrame(_results))
end
for _sets in subsets
eval_subsets(_sets)
end
the error
ERROR: LoadError: MethodError: no method matching fit(::Type{LinearModel}, ::Transpose{Float64, Matrix{Float64}}, ::Transpose{Float64, Matrix{Float64}}, ::Nothing)
Closest candidates are:
fit(::Type{LinearModel}, ::AbstractMatrix{<:Real}, !Matched::AbstractVector{<:Real}, ::Union{Nothing, Bool}; wts, dropcollinear)
@ GLM ~/.julia/packages/GLM/6ycOV/src/lm.jl:134
fit(::Type{T}, !Matched::FormulaTerm, ::Any, ::Any...; contrasts, kwargs...) where T<:RegressionModel
@ StatsModels ~/.julia/packages/StatsModels/Wzvuu/src/statsmodel.jl:78
fit(::Type{T}, !Matched::FormulaTerm, ::Any, ::Any...; contrasts, kwargs...) where T<:StatisticalModel
@ StatsModels ~/.julia/packages/StatsModels/Wzvuu/src/statsmodel.jl:78
...
Stacktrace:
[1] lm(X::Transpose{Float64, Matrix{Float64}}, y::Transpose{Float64, Matrix{Float64}}, allowrankdeficient_dep::Nothing; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ GLM ~/.julia/packages/GLM/6ycOV/src/lm.jl:157
[2] lm(X::Transpose{Float64, Matrix{Float64}}, y::Transpose{Float64, Matrix{Float64}}, allowrankdeficient_dep::Nothing)
@ GLM ~/.julia/packages/GLM/6ycOV/src/lm.jl:157
[3] lm(X::Transpose{Float64, Matrix{Float64}}, y::Transpose{Float64, Matrix{Float64}})
@ GLM ~/.julia/packages/GLM/6ycOV/src/lm.jl:157
[4] linear_regression(mango::Int64, subset::Int64)
@ Main ~/increase_dimensions/test.jl:98
[5] #7
@ ./none:0 [inlined]
[6] iterate
@ ./generator.jl:47 [inlined]
[7] collect(itr::Base.Generator{UnitRange{Int64}, var"#7#8"{Int64}})
@ Base ./array.jl:782
[8] r2_median(subset::Int64)
@ Main ~/increase_dimensions/test.jl:111
[9] #9
@ ./none:0 [inlined]
[10] iterate
@ ./generator.jl:47 [inlined]
[11] collect(itr::Base.Generator{Matrix{Int64}, var"#9#10"})
@ Base ./array.jl:782
[12] job
@ ~/increase_dimensions/test.jl:116 [inlined]
[13] eval_subsets(_sets::Matrix{Int64})
@ Main ~/increase_dimensions/test.jl:125
[14] top-level scope
@ ~/increase_dimensions/test.jl:140
in expression starting at /Users/leticiamadureira/increase_dimensions/test.jl:139