Flux.jl: Different set of optimisation parameters per layer

Hi,

Just copying my question over from the GitHub issues in the hope of getting more visibility here.

In Flux.jl, I’d like to use a separate set of optimisation parameters for each hidden layer in a single Chain. For example, if I had a pre-trained network that I wanted to append new layers to, I’d want the learning rate for the pre-trained layers to be much lower than the learning rate for the new layers. Maybe I’d want the momentum, dropout, etc. to differ as well.

From the source code it looks like using multiple optimisers during training may once have been possible, but nothing in the current documentation (as far as I can tell) indicates that it still is. Am I missing something obvious? Thanks in advance!

Edit: Using Julia 1.0.3 and Flux 0.7.3

Update: the `distribute` helper and the modified `train!` below should work just fine for my purposes.

using Flux

# Repeat each layer's optimiser once per parameter array in that layer, so the result lines up one-to-one with params(m).
distribute(opts, m) = collect(Iterators.flatten([repeat([opt], length(params(l))) for (l, opt) in zip(m, opts)]))
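
Each Dense layer carries two parameter arrays (its weight matrix and its bias), so `distribute` repeats a layer's optimiser once per array and the result matches `params(m)` element for element. A quick sanity check (the names here are just illustrative):

m_check = Chain(Dense(2, 4, tanh), Dense(4, 2, tanh))
opts_check = distribute([Descent(0.3), Descent(0.1)], m_check)
length(opts_check) == length(params(m_check))   # true: 4 optimisers for 4 parameter arrays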

# Adapted from Flux's own train! in Flux.Optimise, but takes an array of
# optimisers (one per parameter array) instead of a single optimiser.
# runall, StopException, Params, gradient and update! are the same Flux
# internals the stock train! uses.
function train!(loss, ps, data, opts::Array; cb = () -> ())
  cb = runall(cb)
  ps = Params(ps)
  for d in data
    try
      gs = gradient(ps) do
        loss(d...)
      end
      # apply each parameter's own optimiser to its gradient
      for (p, opt) in zip(ps, opts)
        update!(opt, p, gs[p])
      end
      if cb() == :stop
        Base.depwarn("Use of `:stop` is deprecated; use `Flux.stop()` instead", :stop)
        break
      end
    catch ex
      if ex isa StopException
        break
      else
        rethrow(ex)
      end
    end
  end
end

m = Chain(Dense(2, 4, tanh), Dense(4, 2, tanh))

opts = distribute([Descent(0.3), Descent(0.1)], m) # [Descent(0.3), Descent(0.3), Descent(0.1), Descent(0.1)]

@epochs 500 train!(loss, params(m), data, opts)
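
(For completeness: `@epochs` comes from Flux, and `loss` and `data` above could be any usual Flux setup. Here's a made-up minimal one, purely as a placeholder rather than anything from my actual problem:)

using Flux: @epochs, mse

# hypothetical toy data and loss, only so the snippet above has something to run on
x = rand(2, 100)            # 100 random 2-feature inputs (columns are samples)
y = rand(2, 100)            # matching 2-dimensional targets
data = [(x, y)]             # a single batch per epoch
loss(x, y) = mse(m(x), y)   # mean squared error through the Chain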