Here is an MVE that I created on top of the following example: model-zoo/vision/conv_mnist at master · FluxML/model-zoo · GitHub
#!/bin/bash
# Bash/Julia polyglot header: when the file is executed directly, bash runs the
# `exec` line below, which re-launches this same file under julia with the given
# flags. Julia sees that line inside a `#= ... =#` block comment and skips it.
#=
exec julia --optimize=3 --threads=6 "${BASH_SOURCE[0]}" "$@"
=#
using Flux
using Flux.Data: DataLoader
using Flux.Optimise: Optimiser, WeightDecay
using Flux: onehotbatch, onecold, flatten
using Flux.Losses: logitcrossentropy
using Statistics, Random
using Logging: with_logger
using ProgressMeter: @showprogress
import MLDatasets
import BSON
using CUDA
# We set default values for the arguments for the function `train`:
# Hyper-parameters and run settings read by `train`. Every field has a default,
# so `Args()` is valid and individual settings can be overridden by keyword,
# e.g. `Args(; epochs=2)`. The struct is mutable so fields can be changed later.
Base.@kwdef mutable struct Args
    η = 3e-4             ## learning rate
    λ = 0                ## L2 regularizer param, implemented as weight decay
    batchsize = 128      ## batch size
    epochs = 50          ## number of epochs
    seed = 0             ## set seed > 0 for reproducibility
    use_cuda = true      ## if true use cuda (if available)
    infotime = 1         ## report every `infotime` epochs
    checktime = 5        ## Save the model every `checktime` epochs. Set to 0 for no checkpoints.
    savepath = "runs/"   ## results path
end
# ## Data
# We create the function `get_data` to load the MNIST train and test data from the MLDatasets package and reshape them so that they are in the shape that Flux expects.
"""
    get_data(args)

Load the MNIST train and test splits through `MLDatasets` and wrap them in
`DataLoader`s.

The images are reshaped to `28 × 28 × 1 × N` (a singleton channel dimension is
inserted), and the labels are one-hot encoded over the classes `0:9`.

# Arguments
- `args`: any object with a `batchsize` field (e.g. `Args`); it sets the
  mini-batch size of both loaders.

# Returns
A tuple `(train_loader, test_loader)`. Only the training loader is created with
`shuffle=true`.
"""
function get_data(args)
    xtrain, ytrain = MLDatasets.MNIST(:train)[:]
    xtest, ytest = MLDatasets.MNIST(:test)[:]

    # Insert the channel dimension expected by the convolutional layers.
    xtrain = reshape(xtrain, 28, 28, 1, :)
    xtest = reshape(xtest, 28, 28, 1, :)

    ytrain, ytest = onehotbatch(ytrain, 0:9), onehotbatch(ytest, 0:9)

    train_loader = DataLoader((xtrain, ytrain); batchsize=args.batchsize, shuffle=true)
    test_loader = DataLoader((xtest, ytest); batchsize=args.batchsize)

    return train_loader, test_loader
end
# The function `get_data` performs the following tasks:
# * **Loads MNIST dataset:** Loads the train and test set tensors. The shape of the train data is `28x28x60000` and the test data is `28x28x10000`.
# * **Reshapes the train and test data:** Notice that we reshape the data so that we can pass it as arguments for the input layer of the model.
# * **One-hot encodes the train and test labels:** Creates a batch of one-hot vectors so we can pass the labels of the data as arguments for the loss function. For this example, we use the `logitcrossentropy` function, which expects the labels to be one-hot encoded.
# * **Creates mini-batches of data:** Creates two DataLoader objects (train and test) that handle data mini-batches of size `128` (as defined above). We create these two objects so that each epoch can pass the entire data set through the loss function one mini-batch at a time when training our model. Also, the training loader shuffles the data points (`shuffle=true`).
# ## Model
# We create the LeNet5 "constructor". It uses Flux's built-in convolutional and pooling layers.
"""
    LeNet5(; imgsize=(28, 28, 1), nclasses=10, nmodels=5)

Build `nmodels` separately constructed LeNet-5 style networks and return them
as a vector of `Chain`s.

Each network is two `Conv((5, 5)) → MaxPool((2, 2))` stages, a `flatten`, and
three `Dense` layers ending in `nclasses` outputs (raw logits, no softmax).

# Keywords
- `imgsize`: `(width, height, channels)` of the input images.
- `nclasses`: number of output classes.
- `nmodels`: how many networks to construct (defaults to 5, the value that was
  previously hard-coded).
"""
function LeNet5(; imgsize=(28, 28, 1), nclasses=10, nmodels=5)
    # Spatial size left after the two conv + pool stages, times 16 channels.
    out_conv_size = (imgsize[1] ÷ 4 - 3, imgsize[2] ÷ 4 - 3, 16)

    return [
        Chain(
            Conv((5, 5), imgsize[end] => 6, relu),
            MaxPool((2, 2)),
            Conv((5, 5), 6 => 16, relu),
            MaxPool((2, 2)),
            # Collapse the feature maps to a matrix so the first Dense layer,
            # sized `prod(out_conv_size)`, receives matching input.
            flatten,
            Dense(prod(out_conv_size), 120, relu),
            Dense(120, 84, relu),
            Dense(84, nclasses),
        ) for _ in 1:nmodels
    ]
end
# ## Loss function
# We use the function `logitcrossentropy` to compute the difference between
# the predicted and actual values (loss).
# Training objective: forwards the predictions `ŷ` and the targets `y`
# unchanged to `logitcrossentropy`.
function loss(ŷ, y)
    return logitcrossentropy(ŷ, y)
end
# Also, we create the function `eval_loss_accuracy` to output the loss and the accuracy during training:
"""
    eval_loss_accuracy(loader, model, device)

Evaluate `model` over every batch in `loader` and return the named tuple
`(loss=..., acc=...)`.

`loss` is the batch-size-weighted mean of `loss(ŷ, y)`, and `acc` is the
percentage of samples whose `onecold` prediction matches the `onecold` label.
Both are rounded with `round4`.

# Arguments
- `loader`: iterable yielding `(x, y)` batches; the last dimension of `x` is
  taken as the batch size.
- `model`: called directly as `model(x)`, so it must be a single callable
  network rather than a collection of networks.
- `device`: function applied to each batch before the forward pass (e.g. `cpu`
  or `gpu`).
"""
function eval_loss_accuracy(loader, model, device)
    l = 0.0f0
    acc = 0
    ntot = 0
    for (x, y) in loader
        x, y = x |> device, y |> device
        ŷ = model(x)
        nbatch = size(x)[end]
        # Weight by batch size so a smaller final batch is averaged correctly.
        l += loss(ŷ, y) * nbatch
        # Move to the CPU before decoding and comparing the class indices.
        acc += sum(onecold(ŷ |> cpu) .== onecold(y |> cpu))
        ntot += nbatch
    end
    return (loss=round4(l / ntot), acc=round4(acc / ntot * 100))
end
# ## Utility functions
# We need a couple of functions to obtain the total number of the model's parameters. Also, we create a function to round numbers to four digits.
# Total number of scalar entries across all arrays collected by
# `Flux.params(model)`.
function num_params(model)
    parameter_arrays = Flux.params(model)
    return sum(length, parameter_arrays)
end
# Round `x` to four digits after the decimal point.
function round4(x)
    return round(x; digits=4)
end
# ## Train the model
# Finally, we define the function `train` that calls the functions defined above to train the model.
"""
    train(; kws...)

Train the networks returned by `LeNet5()` on MNIST, one network per iteration
of a `Threads.@threads` loop.

All keyword arguments are forwarded to `Args`; see that struct for the
available settings and their defaults. Each network gets its own parameter set
and its own optimiser. Every epoch is timed with `@time`.

The nested helper `report(epoch, model)` prints the train/test loss and
accuracy. The training loop does not call it, and it expects a single callable
network (e.g. `model[i]`), not the whole vector.

Returns `nothing`.
"""
function train(; kws...)
    args = Args(; kws...)
    args.seed > 0 && Random.seed!(args.seed)

    use_cuda = args.use_cuda && CUDA.functional()
    # Assign `device` exactly once: it is captured by the closures below, and a
    # captured variable that is assigned more than once gets boxed.
    device = use_cuda ? gpu : cpu
    if use_cuda
        @info "Training on GPU"
    else
        @info "Training on CPU"
    end

    train_loader, test_loader = get_data(args)

    model = LeNet5() |> device
    @info "LeNet5 model: $(num_params(model)) trainable params"

    # One parameter collection per network.
    ps = Flux.params.(model)

    # One optimiser per network. With λ > 0, weight decay (equivalent to L2
    # regularization) is composed with ADAM for each network, so `opt[p_i]`
    # is always that network's own optimiser.
    opt = [
        args.λ > 0 ? Optimiser(WeightDecay(args.λ), ADAM(args.η)) : ADAM(args.η) for
        _ in model
    ]

    function report(epoch, model)
        train_stats = eval_loss_accuracy(train_loader, model, device)
        test_stats = eval_loss_accuracy(test_loader, model, device)
        println("Epoch: $epoch Train: $(train_stats) Test: $(test_stats)")
    end

    @info "Start Training"
    for epoch in 1:args.epochs
        @time begin
            # NOTE(review): every thread iterates the same `train_loader`
            # (created with `shuffle=true`) concurrently. Confirm that sharing
            # one DataLoader across threads is safe, or give each task its own.
            Threads.@threads for p_i in eachindex(model)
                for (x, y) in train_loader
                    x, y = x |> device, y |> device
                    gs = Flux.gradient(ps[p_i]) do
                        ŷ = model[p_i](x)
                        loss(ŷ, y)
                    end
                    Flux.Optimise.update!(opt[p_i], ps[p_i], gs)
                end
            end
        end
    end
    return nothing
end
# ## Run the example
# We call the function `train`:
# Run `train` with its default settings only when this file is executed as a
# script, not when it is `include`d from another file or the REPL.
if abspath(PROGRAM_FILE) == @__FILE__
    train()
end
And here is the output:
conv_mnist % ./conv_mnist.jl
[ Info: Training on CPU
[ Info: LeNet5 model: 222130 trainable params
[ Info: Start Training
