Since this topic has come up again in a different thread, I took the liberty of writing out what this would look like. The following combines the Flux quick start with the example from ParameterSchedulers.jl.
using Flux, Optimisers, ParameterSchedulers
noisy = rand(Float32, 2, 1000) # 2×1000 Matrix{Float32}
truth = [xor(col[1]>0.5, col[2]>0.5) for col in eachcol(noisy)] # 1000-element Vector{Bool}
model = Chain(
    Dense(2 => 3, tanh),   # activation function inside layer
    BatchNorm(3),
    Dense(3 => 2),
    softmax)
target = Flux.onehotbatch(truth, [true, false]) # 2×1000 OneHotMatrix
loader = Flux.DataLoader((noisy, target), batchsize=64, shuffle=true);  # add |> gpu here and to the model if training on a GPU
const lr = 0.01
optim = Flux.setup(Flux.Adam(lr), model) # setup optimizer as usual
sched = Stateful(Step(lr, 0.9, 100)) # setup schedule of your choice; Step multiplies the rate by 0.9 every 100 steps
for epoch in 1:1_000
    for (x, y) in loader
        loss, grads = Flux.withgradient(model) do m
            y_hat = m(x)
            Flux.crossentropy(y_hat, y)
        end
        Flux.update!(optim, model, grads[1])
        # NEW
        nextlr = ParameterSchedulers.next!(sched) # advance schedule
        Optimisers.adjust!(optim, nextlr)         # update optimizer state; by default this changes the learning rate `eta`
    end
end
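A couple of notes to go with this. `Optimisers.adjust!` mutates the state tree returned by `Flux.setup`, so you can confirm the new rate took effect by peeking at a leaf, e.g. something like `optim.layers[1].weight.rule.eta` (the exact path depends on your model). Also, the loop above advances the schedule once per batch; if you would rather decay once per epoch, schedules can be iterated directly and the `Stateful` wrapper isn't needed. A minimal sketch of that variant (the `Step(lr, 0.9, 10)` parameters are arbitrary, and I'm assuming the iteration interface described in the ParameterSchedulers docs):

    nepochs = 100
    for (eta, epoch) in zip(Step(lr, 0.9, 10), 1:nepochs)
        Optimisers.adjust!(optim, eta)  # set the rate once at the start of each epoch
        for (x, y) in loader
            loss, grads = Flux.withgradient(model) do m
                Flux.crossentropy(m(x), y)
            end
            Flux.update!(optim, model, grads[1])
        end
    end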