In fact, here’s some code I wrote a while ago exploring the use of ForwardDiff. I think some of the syntax is suboptimal: for example, I recall learning there are better ways of passing arguments that ForwardDiff should not differentiate. But I think it demonstrates how easy the package is to learn and use.
## EXAMPLES WITH ForwardDiff package
using ForwardDiff
# Distributions is only for simulating data
using Distributions
# ForwardDiff.gradient does automatic differentiation of functions
# that take a single Vector argument and return a scalar
# Note that f only takes ONE argument
f(x::Vector) = sum(x.^2)
# Now we can make a function that takes the
# gradient of the function f with respect to x
g = x -> ForwardDiff.gradient(f, x)
# ...or the Hessian!
h = x -> ForwardDiff.hessian(f, x)
# Demonstration:
println("Gradient: ")
println( g([2,3]) ) # Gradient will be 2x
println( "Hessian: ")
println( h([2,3]) ) # Hessian will be 2 * identity matrix
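# (Side note, a quick sketch of the other entry points I'm aware of,
# in case "Vector in, scalar out" sounds limiting: ForwardDiff.derivative
# handles scalar -> scalar functions, and ForwardDiff.jacobian handles
# functions that return a vector.)
println( ForwardDiff.derivative(x -> x^3, 2.0) )        # Should be 3x^2 = 12
println( ForwardDiff.jacobian(x -> x.^2, [2.0, 3.0]) )  # Should be a diagonal matrix with 2x on the diagonal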
# INCLUDING OTHER CONSTANTS
# ForwardDiff expects us to provide a *single* vector
# containing all the values we want derivatives with respect to
# But how do we include other values that affect the derivative,
# which we do NOT want to differentiate (i.e., data in stats)?
# My answer so far: leave them lying around in the environment above!
# I don't love this method (a closure-based alternative is sketched
# after this example), but it's all I've got right now
# a, b will affect our function, but we do not want to take their derivatives
a = 5.0
b = 3.0
# This function takes a Vector, but accesses a, b,
# which are NOT provided as arguments
function f(x::Vector)
    ans = a * x[1] + b * x[2]
    return(ans)
end
# Taking the gradient with respect to x
g = x -> ForwardDiff.gradient(f, x)
# Checking
println("Gradient = ")
# Should be values of a, b defined in Global Environment
println( g([0,0]) )
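# ALTERNATIVE TO GLOBALS
# A sketch of one of the "better ways" mentioned at the top, as I
# understand it: write the function with the constants as explicit
# arguments, then fix them with a closure / anonymous function so that
# ForwardDiff still only sees the single Vector argument
f_ab(x::Vector, a, b) = a * x[1] + b * x[2]
g_closure = x -> ForwardDiff.gradient(z -> f_ab(z, 5.0, 3.0), x)
println("Gradient via closure = ")
println( g_closure([0.0, 0.0]) )   # Should again be [5.0, 3.0]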
# NESTING FUNCTIONS
# It seems like we can arbitrarily nest functions
# and it'll figure it out for us
f1(x) = x.^2
f2(x) = sum(x)
f3(x) = f2(f1(x))
g = x -> ForwardDiff.gradient(f3, x)
println("Gradient = ")
# Should be 2 .* x
println( g([3, 2, 1]) )
# USING CONTROL STATEMENTS
# This function is just the sum of absolute values
# The motivation for this was testing whether automatic differentiation
# could handle control statements (answer: yes!)
function f(x::Vector)
    k = length(x)
    ans = 0.0
    for i in 1:k
        if x[i] > 0.0
            ans = ans + x[i]
        else
            ans = ans - x[i]
        end
    end
    return(ans)
end
g = x -> ForwardDiff.gradient(f, x)
println("Gradient = ")
# Should just be sign(x)
println( g([-1.0,1.0, 2.0, -2.0]) )
# GLM Example!
# Now we will get a little fancier: we're going to fit a
# logistic regression using Newton-Raphson
# For transforming from logit to probabilities
expit(x::Real) = exp(x) / (1.0 + exp(x) )
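# (Side note: exp(x) overflows for large positive x, so the version above
# can return NaN out in the tails; an equivalent but numerically safer
# form would be something like the following.)
expit_stable(x::Real) = 1.0 / (1.0 + exp(-x))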
# For computing log-likelihood from estimated p-hats
function logistic_llk(p_hat, y)
    n = length(y)
    ans = 0.0
    for i in 1:n
        if y[i] == 1
            ans = ans + log(p_hat[i])
        else
            ans = ans + log(1.0 - p_hat[i])
        end
    end
    return(ans)
end
# For computing p-hats
# Assuming that X does not include an intercept column
function compute_p_hat(X, beta)
    k = length(beta)
    # The first value of beta is the intercept,
    # so we add it separately and use matrix
    # multiplication for the rest of the betas
    eta = beta[1] .+ (X * beta[2:k])
    ans = expit.(eta)
    return(ans)
end
# Final function that calls everything
function logReg(X, y, its = 5, verbose = true)
    # Extracting dimensions
    X_dims = size(X)
    nRows = X_dims[1]
    nCols = X_dims[2]
    # Setting initial values
    # We're assuming X has no intercept column,
    # so length(betas) = nCols + 1
    betas = zeros(nCols + 1)
    # Base function that we will call our
    # 1st and 2nd order derivatives on
    base_fxn = function(betas::Vector)
        # X is captured from the enclosing scope
        p_hats = compute_p_hat(X, betas)
        # As is y
        ans = logistic_llk(p_hats, y)
        return(ans)
    end
    # Creating gradient + hessian functions
    # Note: I think you can get the gradient as a by-product of the
    # Hessian computation (so g is redundant); see the DiffResults
    # sketch after this function.
    # But let's get the code working before we optimize it!
    g = x -> ForwardDiff.gradient(base_fxn, x)
    h = x -> ForwardDiff.hessian(base_fxn, x)
    current_llk = base_fxn(betas)
    if verbose
        println(string("Current llk: ", current_llk))
    end
    # Runs a fixed number of vanilla Newton-Raphson steps
    for i in 1:its
        # Grabbing current gradient and Hessian
        g_vec = g(betas)
        h_mat = h(betas)
        # h_mat \ g_vec solves the linear system h_mat * x = g_vec
        # (like solve(M, v) in R), so the line below is the
        # Newton-Raphson proposal step
        delta = -(h_mat \ g_vec)
        betas = betas + delta
        # Computes current log-likelihood
        current_llk = base_fxn(betas)
        if verbose
            # Prints out current log-likelihood at this step of the algorithm
            println(string("Current llk: ", current_llk))
        end
    end
    if verbose
        println("Final betas = ", betas)
    end
    return(betas)
end
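# SINGLE-PASS GRADIENT + HESSIAN
# A sketch of the optimization hinted at above (making the separate g
# closure redundant): ForwardDiff's companion DiffResults package lets
# one hessian! call return the value, gradient, and Hessian together.
# I haven't wired this into logReg above; it's just the idea.
using DiffResults
function value_grad_hess(f, x::Vector)
    result = DiffResults.HessianResult(x)
    result = ForwardDiff.hessian!(result, f, x)
    return (value = DiffResults.value(result),
            gradient = DiffResults.gradient(result),
            hessian = DiffResults.hessian(result))
end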
# Function for simulating data
function simData(nRow, nCol)
    X = randn(nRow, nCol)
    trueBetas = randn(nCol + 1)
    etas = X * trueBetas[2:(nCol + 1)] .+ trueBetas[1]
    p = expit.(etas)
    y = rand.(Binomial.(1, p))
    ans = (X = X, y = y, trueBetas = trueBetas)
    return(ans)
end
# Size of data
nCol = 2
nRow = 5000
sim_problem = simData(nRow, nCol)
println("STARTING LOGISTIC REGRESSION")
println(string("True values of beta = ", sim_problem[3]) )
start = time()
logReg(sim_problem[1], sim_problem[2])
finish = time()
println("Time = ", finish - start)
# Now let's try with 10x more rows
# If autodifferentiation scales well (linearly in the data),
# this should be about 10x slower; if not, closer to 100x slower
nRow = 50000
sim_problem = simData(nRow, nCol)
start = time()
logReg(sim_problem[1], sim_problem[2])
finish = time()
println("Time = ", finish - start)
# Looks like it is smart!
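# (Timing aside: time() around a single call includes compilation on the
# first run and is fairly noisy; if the numbers matter, BenchmarkTools is
# the usual tool. A sketch, assuming the package is installed:)
using BenchmarkTools
@btime logReg($(sim_problem[1]), $(sim_problem[2]), 5, false)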