Disclaimer: I am by no means an expert in Flux, MLJFlux, Julia or even machine learning. I may have done something very stupid without noticing.
I have a neural network classifier written in Flux. It works fine: it reaches an accuracy of about 80–81% and trains in ~10 minutes on a dataset with 175,000 rows and 30 columns. The problem is that I can’t seem to reach the 83+% accuracy that a colleague of mine gets using Python (TensorFlow, or sklearn’s MLPClassifier) on the same dataset. I’ve tinkered with the parameters a lot, but I can’t match that accuracy. I thought that maybe the difference lies in the fact that the Python models use L1 and L2 regularisation, but trying to modify the loss function in Flux.jl causes an error (“Mutating arrays are not allowed”, or something along those lines). I’ve then added Optimiser(WeightDecay(λ), opt), which, if I understood correctly, is a way to do L2 regularisation, but it didn’t change the accuracy I get.
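For reference, a non-mutating way to write that penalty by hand would be something like the following (just a sketch, using the model and λ names from the script below; the sqnorm pattern is the one shown in the Flux regularisation docs):
# L2 penalty written without in-place operations, so Zygote can differentiate it
sqnorm(x) = sum(abs2, x)
loss_l2(a, b) = Flux.Losses.crossentropy(model(a), b) + λ * sum(sqnorm, Flux.params(model))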
Long story short, thanks to a user here who pointed me to it, I’ve decided to try MLJFlux.jl, which seems to handle everything under the hood, regularisation included. The problem is that a neural network classifier built with MLJFlux, with the same parameters as the one built in Flux, seems a lot slower to train on the very same dataset, and it also seems to perform worse (though I may be measuring accuracy wrongly).
Running the Flux version on the full dataset of 175k rows for 100 epochs takes about 3–5 minutes and already reaches an accuracy of 80%. Running the MLJFlux version on a reduced dataset of 20k rows for 10 epochs (not 100, literally 10), with Holdout’s fraction_train = 0.7, takes a lot longer.
Since I’ve had trouble reading and understanding the docs, I may very well have made some mistake in trying to recreate the network I built in Flux.
Here is the code that uses Flux to build the neural network:
using DataFrames, DataFramesMeta, CSV, Alert, ProgressMeter, Plots, Flux
using Flux: onehotbatch, onecold, @epochs, Optimiser
using Flux.Data: DataLoader
using Chain: @chain
using StatsBase: standardize, ZScoreTransform
using MLDataUtils: splitobs, shuffleobs
function build_model(input, layers, output; activation = relu, use_softmax = true, use_last_activation = false)
    f = Any[]
    in_layer = input
    for out_layer in layers
        push!(f, Dense(in_layer, out_layer, activation))
        in_layer = out_layer
    end
    if use_last_activation
        push!(f, Dense(in_layer, output, activation))
    else
        push!(f, Dense(in_layer, output))
    end
    if use_softmax
        push!(f, softmax)
    end
    Chain(f...)
end
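# Example of what this builder produces with the parameters used below
# (30 input features, hidden layers [20, 10, 5], 2 output classes):
#   build_model(30, [20, 10, 5], 2)
#   # => Chain(Dense(30, 20, relu), Dense(20, 10, relu), Dense(10, 5, relu), Dense(5, 2), softmax)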
function score_accuracy(X_output, y_output, classes = [1, 0])
    X = onecold(X_output, classes)
    y = onecold(y_output, classes)
    comparison = X .== y
    return sum(comparison) / length(comparison)  # fraction of correct predictions
end
function run_model(model, loss, opt, X, y, n_epochs, batchsize, λ = 0.0001)
    ps = Flux.params(model)
    loader = DataLoader(
        (X, y),
        batchsize = batchsize,
        shuffle = true
    )
    tol_counter = 0
    prev = loss(X, y)  # use the function arguments, not the X_train/y_train globals
    loss_values = Float64[]
    acc_train_values = Float64[]
    p = Progress(n_epochs, dt = 1)
    generate_showvalues(i, n) = () -> [(:current_epoch, i), (:tot_epochs, n)]
    @alert "Training finished" for i in 1:n_epochs
        Flux.train!(loss, ps, loader, Optimiser(WeightDecay(λ), opt))
        l = loss(X, y)
        push!(loss_values, l)
        push!(acc_train_values, score_accuracy(model(X), y))
        # tol and iter_tol are globals set in the PARAMETERS block below
        if abs(l - prev) < tol
            tol_counter += 1
        else
            tol_counter = 0
        end
        if tol_counter == iter_tol
            break
        end
        prev = l
        ProgressMeter.next!(p; showvalues = generate_showvalues(i, n_epochs))
    end
    if tol_counter == iter_tol
        @warn "Terminated due to having reached tol = $tol for $iter_tol times in a row"
    end
    return loss_values, acc_train_values
end
# PARAMETERS ------------------------------------------------------------------
dataset_frac = 0.7 # fraction of dataset to use for training
hidden_layers = [20, 10, 5]
optimizer = "adam" # "momentum"
loss(a, b) = Flux.Losses.crossentropy(model(a), b)
η = 1e-3 # Learning Rate (for both `Momentum` and `ADAM` optimizers)
ρ = 0.99 # Momentum (for `Momentum` optimizer)
β₁ = 0.9 # for `ADAM` optimizer
β₂ = 0.999 # for `ADAM` optimizer
λ = 0.0002 # L2 regularization term
batchsize = 200
tol = 1e-4
iter_tol = 10
n_epochs = 100
# ----------------------------------------------------------------------------
filename = raw"E:\Università\2020-2021\Applicazioni di Machine Learning\atlas_data.csv"
df, labels = @chain begin
    CSV.read(filename, DataFrame)
    @where(_, :KaggleSet .== "t")
    select(_, Not([:Weight, :EventId, :KaggleSet, :KaggleWeight]))
    select(_, Not(:Label)), @chain _ begin
        select(_, :Label)
        Flux.onehotbatch(_.Label, ["s", "b"])
    end
end
N_input = length(names(df))
N_output = size(labels, 1)
opt_dict = Dict([
    ("momentum", Momentum(η, ρ)),
    ("adam", ADAM(η, (β₁, β₂)))
])
# standardize each feature column (dims = 1) to zero mean and unit variance, then
# transpose to the (features × observations) layout Flux expects; without dims = 1,
# StatsBase falls back (with a deprecation warning) to dims = 2, i.e. per-row statistics
X = transpose(standardize(ZScoreTransform, Matrix(df), dims = 1))
X_train, X_test = splitobs(X, at = dataset_frac)
y_train, y_test = splitobs(labels, at = dataset_frac)
model = build_model(N_input, hidden_layers, N_output)
losses, acc_train_values = @time run_model(
    model,
    loss,
    opt_dict[optimizer],
    X_train,
    y_train,
    n_epochs,
    batchsize,
    λ
)
acc_train = score_accuracy(model(X_train), y_train)
acc_test = score_accuracy(model(X_test), y_test)
println("Accuracy on training = $acc_train")
println("Accuracy on testing = $acc_test")
Here is the code that uses MLJFlux:
using MLJ
using Flux
import MLJFlux
using Random
using Chain: @chain
using DataFrames, DataFramesMeta, CSV
using Alert
using MLDataUtils: splitobs, shuffleobs
mutable struct MyNetwork{F <: Function} <: MLJFlux.Builder
    layers :: Vector{Int64}
    activation :: F
    use_softmax :: Bool
    use_last_activation :: Bool
end
function MLJFlux.build(nn::MyNetwork, n_in, n_out)
    layers = nn.layers
    activation = nn.activation
    use_softmax = nn.use_softmax
    use_last_activation = nn.use_last_activation
    f = Any[]
    in_layer = n_in
    for out_layer in layers
        push!(f, Dense(in_layer, out_layer, activation))
        in_layer = out_layer
    end
    if use_last_activation
        push!(f, Dense(in_layer, n_out, activation))
    else
        push!(f, Dense(in_layer, n_out))
    end
    if use_softmax
        push!(f, softmax)
    end
    Chain(f...)
end
filename = raw"E:\Università\2020-2021\Applicazioni di Machine Learning\atlas_data.csv"
# PARAMETERS
hidden_layers = [20, 10, 5]
η = 1e-3
β₁ = 0.9
β₂ = 0.999
λ = 0.0002
α = 0.0
batchsize = 200
n_epochs = 10
df = @chain begin
    CSV.read(filename, DataFrame)
    @where(_, :KaggleSet .== "t")
    select(_, Not([:Weight, :EventId, :KaggleSet, :KaggleWeight]))
end
# this is to limit the dataset to its first 20k rows
df = df[1:20000, :]
y, X = unpack(df, ==(:Label), colname -> true)
N = length(names(X))
X = coerce(X, Count => Continuous)
y = coerce(y, Multiclass)
# this is temporary
X_train = X
y_train = y
NeuralNetworkClassifier = @load NeuralNetworkClassifier pkg = MLJFlux  # pkg given explicitly to avoid ambiguity
clf = NeuralNetworkClassifier(
    builder = MyNetwork(
        hidden_layers,
        relu,
        false,  # use_softmax (the finaliser already applies softmax)
        false   # use_last_activation
    ),
    finaliser = softmax,
    optimiser = ADAM(η, (β₁, β₂)),  # the explicit IdDict state argument is ADAM's default
    loss = Flux.crossentropy,
    epochs = n_epochs,
    batch_size = batchsize,
    lambda = λ,  # regularisation strength
    alpha = α,   # L1/L2 mix; α = 0 means pure L2
    optimiser_changes_trigger_retraining = false
)
mach = machine(clf, X_train, y_train)
ev = @alert "Finished" @time evaluate!(
    mach,
    resampling = Holdout(
        rng = 123,
        fraction_train = 0.7
    ),
    operation = predict_mode,
    measure = accuracy
)
The labels to classify are “s” or “b”. In Flux I transformed them into a one-hot matrix of 0s and 1s, while in MLJFlux I left them as-is and just coerced them to Multiclass. The compiler gives no warnings, so I suppose this is not the problem, but I may as well point out the difference. Another difference is that in MLJFlux I don’t standardize the data, but again, this shouldn’t matter.
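For completeness, standardization could be replicated on the MLJ side with MLJ’s built-in Standardizer model; this is only a sketch I haven’t benchmarked, using the X_train and clf names from above:
# z-score standardization on the MLJ side, mirroring ZScoreTransform in the Flux script
std_mach = machine(Standardizer(), X_train)
fit!(std_mach)
X_train_std = MLJ.transform(std_mach, X_train)
# the classifier machine would then be built as machine(clf, X_train_std, y_train)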