OK, I think we've ended up with the final code now. Many thanks for all your help.
using ForwardDiff
n = 100
p = 10
x = randn(n, p)'                                # p×n data matrix (one column per example)
y = sum(x[1:5, :], dims=1) .+ randn(n)' * 0.1   # targets: sum of first 5 features plus noise
w = 0.0001 * randn(1, p)                        # 1×p weight row vector
b = 0.0                                         # scalar bias
# mean squared error loss
loss(w, b, x, y) = sum(abs2, y .- (w*x .+ b)) / size(y, 2)
# gradient w.r.t. `w`
loss∇w(w, b, x, y) = ForwardDiff.gradient(w -> loss(w, b, x, y), w)
# derivative w.r.t. `b` (`ForwardDiff.derivative` is used instead of
# `ForwardDiff.gradient` because `b` is a scalar, not an array)
lossdb(w, b, x, y) = ForwardDiff.derivative(b -> loss(w, b, x, y), b)
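# (quick illustration of the scalar/array distinction, added for reference:
#  ForwardDiff.derivative(t -> t^2, 3.0) returns 6.0, while
#  ForwardDiff.gradient(v -> sum(abs2, v), [1.0, 2.0]) returns [2.0, 4.0])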
# gradient descent update; both gradients are evaluated at the current
# (w, b) before either parameter is updated
function train(w, b, x, y; lr=0.1)
    ∇w = loss∇w(w, b, x, y)
    db = lossdb(w, b, x, y)
    w -= lr .* ∇w
    b -= lr * db
    return w, b
end
# training loop: update the weights and bias, printing the loss each iteration
for i in 1:50
    global w, b = train(w, b, x, y)
    println(loss(w, b, x, y))
end
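For good measure, one can append a small sanity check; this is just a sketch of my own (the `analytic∇w` helper is not part of the code above) that compares ForwardDiff's result against the closed-form gradient of the mean squared error:

# closed-form gradient of the MSE loss w.r.t. `w`: -2 (y - (w*x .+ b)) x' / n
analytic∇w(w, b, x, y) = -2 .* ((y .- (w*x .+ b)) * x') ./ size(y, 2)
# the two should agree up to floating-point error
@assert loss∇w(w, b, x, y) ≈ analytic∇w(w, b, x, y)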