Here is my code to train all 49 regression models to predict house prices in Boston.
@tlienart helped w/ training all MLJ models
@mthelm85 helped w/ one_hot_encode
Functions to clean data, load & train all models:
Code
#load packages
using MLJ, RDatasets, TableView, DataFrames
using Statistics: mean   #mean is used in the custom R² below
################################################################################
#Helpers: one_hot_encode / AZ / load_m / train_m
#OHE: one 0/1 indicator column per (column, level) pair.
function one_hot_encode(d::DataFrame)
    encoded = DataFrame()
    for col in names(d), val in unique(d[!, col])
        lab = string(col) * "_" * string(val)
        encoded[!, Symbol(lab)] = ifelse.(d[!, col] .== val, 1, 0)
    end
    return encoded
end
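#Quick sanity check (mine, not part of the original run) on a made-up column:
#one_hot_encode(DataFrame(color = ["red", "blue", "red"]))
#-> 3×2 DataFrame: color_red = [1, 0, 1], color_blue = [0, 1, 0]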
#AZ: one-hot encode every non-numeric column (Strings, Counts, ...),
#keep the numeric columns, then coerce anything discrete to Continuous.
function AZ(X)
    sch = schema(X)
    numeric = [Int, Float16, Float32, Float64]
    vs = Symbol[]   #non-numeric columns to one-hot encode
    for (name, type) in zip(sch.names, sch.types)
        if type ∉ numeric
            push!(vs, name)
        end
    end
    Xd = DataFrame(X)
    X_ohe = one_hot_encode(Xd[:, vs])
    Xd = hcat(X_ohe, select(Xd, Not(vs)))
    Xd = coerce(Xd, autotype(Xd, :discrete_to_continuous))
    return Xd
end
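#Sketch (mine) of AZ on a hypothetical mixed table - the String column gets
#one-hot encoded, the Int column is kept, everything ends up Continuous:
#AZ(DataFrame(zone = ["a", "b", "a"], rooms = [3, 4, 5]))
#-> columns zone_a, zone_b (0/1) and rooms, all with scitype Continuous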
#Load each matching model & return the list of model names.
@inline function load_m(model_list)
    model_names = Vector{String}(undef, length(model_list))
    @inbounds for (i, model) in enumerate(model_list)
        load(model.name, pkg=model.package_name, verbosity=0)
        model_names[i] = model.name
    end
    return model_names
end
#Train & score one model on the test rows.
#NOTE: if we do target engineering we need to transform ŷ (and y) back via invtrans before scoring.
@inline function train_m(m::String, X, y, train, test, pr, meas; invtrans=identity)
    t1 = time_ns()
    println(m)
    #a couple of models need non-default iteration counts:
    if m == "XGBoostRegressor"
        mdl = eval(Meta.parse("$(m)(num_round=500)"))
    elseif m == "EvoTreeRegressor"
        mdl = eval(Meta.parse("$(m)(nrounds=1500)"))
    else
        mdl = eval(Meta.parse("$(m)()"))
    end
    mach = machine(mdl, X, y)
    fit!(mach, rows=train, verbosity=0)
    ŷ = pr(mach, rows=test)
    ŷ = invtrans.(ŷ)
    y = invtrans.(y)
    if meas == "Rsq"
        #AZ custom out-of-sample R² (in %): 1 - SSE/SST, where the benchmark
        #always predicts the training mean of y.
        ê = ŷ - y[test]                   #test residuals
        ẽ = mean(y[train]) .- y[test]     #benchmark residuals
        s = (1 - (ê'ê) / (ẽ'ẽ)) * 100
    elseif meas == rmsl
        s = meas(abs.(ŷ), abs.(y[test]))  #abs.() so rmsl is defined (negative predictions on Ames)
    else
        s = meas(ŷ, y[test])
    end
    t2 = time_ns()
    return [round(s, sigdigits=5), round((t2 - t1) / 1.0e9, sigdigits=5)]
end
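For a single model, a call looks like this (a sketch only: it assumes DecisionTreeRegressor is among the loaded models and uses X, y, train, test as defined in the next block; passing the string "Rsq" selects the custom out-of-sample R²):
Code
score, seconds = train_m("DecisionTreeRegressor", X, y, train, test, predict, rms)
r2, _ = train_m("DecisionTreeRegressor", X, y, train, test, predict, "Rsq")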
Applied to the Boston housing data:
Code
X, y = @load_boston;
train, test = partition(eachindex(y), .7, rng=333);
X = AZ(X)
m_match = models(matching(X, y), x -> x.prediction_type == :deterministic);
m_names = load_m(m_match);
#1. Raw features, raw target:
sc = [train_m(m, X, y, train, test, predict, rms) for m in m_names]
sc = hcat(sc...)';
showtable(hcat(m_names[sortperm(sc[:, 1])], sc[sortperm(sc[:, 1]), :]))
#2. Raw features, log target (scored back in levels via invtrans=exp):
sc = [train_m(m, X, log.(y), train, test, predict, rms, invtrans=exp) for m in m_names]
sc = hcat(sc...)';
showtable(hcat(m_names[sortperm(sc[:, 1])], sc[sortperm(sc[:, 1]), :]))
#3. Log(1 + x) features, raw target:
sc = [train_m(m, log.(X .+ 1), y, train, test, predict, rms) for m in m_names]
sc = hcat(sc...)';
showtable(hcat(m_names[sortperm(sc[:, 1])], sc[sortperm(sc[:, 1]), :]))
#4. Log(1 + x) features, log target:
sc = [train_m(m, log.(X .+ 1), log.(y), train, test, predict, rms, invtrans=exp) for m in m_names]
sc = hcat(sc...)';
showtable(hcat(m_names[sortperm(sc[:, 1])], sc[sortperm(sc[:, 1]), :]))
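The four experiments above differ only in the feature and target transforms, so they can be collapsed into one loop; a sketch (the transforms list and its labels are mine, not part of the original run):
Code
#(feature transform, target transform, inverse target transform)
transforms = [
    (identity,          identity, identity),  #1. raw features / raw target
    (identity,          log,      exp),       #2. raw features / log target
    (x -> log.(x .+ 1), identity, identity),  #3. log1p features / raw target
    (x -> log.(x .+ 1), log,      exp),       #4. log1p features / log target
]
for (fX, fy, finv) in transforms
    sc = [train_m(m, fX(X), fy.(y), train, test, predict, rms, invtrans=finv) for m in m_names]
    sc = hcat(sc...)'
    showtable(hcat(m_names[sortperm(sc[:, 1])], sc[sortperm(sc[:, 1]), :]))
end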
Note a lot more can be done:
- Moar models (not currently in MLJ):
  MLJFlux.jl (WIP);
  @xiaodai’s JLBoost.jl, @joshday’s SparseRegression.jl, @rakeshvar’s AnyBoost.jl.
  The MLJ roadmap also mentions Turing.jl, Gen.jl, and Soss.jl.
- HP tuning (I currently use default hyperparameters):
  MLJTuning.jl (WIP) looks promising; see the sketch after this list.
  I’d love to use @baggepinnen’s Hyperopt.jl to automatically tune all models w/ Bayesian optimization.
- Ensembling:
  MLJ has nice options for ensembles that I’d like to automate for a large number of models.
  In addition, @ppalmes’s AutoMLPipelines.jl is amazing (see discussion).
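As a taste of the tuning item above, a minimal grid-search sketch using MLJ's TunedModel (a sketch only: keyword names have shifted across MLJ/MLJTuning versions, and the model/hyperparameter choice is just an example):
Code
tree = DecisionTreeRegressor()                 #assumes this model is already loaded
r = range(tree, :max_depth, lower=2, upper=10)
tuned = TunedModel(model=tree, tuning=Grid(resolution=9),
                   resampling=CV(nfolds=5), range=r, measure=rms)
mach = machine(tuned, X, y)
fit!(mach, rows=train, verbosity=0)
#and a simple homogeneous ensemble of the same atom (EnsembleModel ships with MLJ):
ens = EnsembleModel(atom=tree, n=100)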
In addition to the Boston data I’ve run all models on:
- Regression: Ames Iowa housing / Diamonds / King County housing
- Classification: Crabs / Iris / Titanic / Pima
MLJ community: please let me know if this works for you, or if you have any feedback.
Paulito, would it be possible to use your package to automatically train every model on the Boston housing data, and to consider some stacked ensembles of the set of models?