Hello,
I am trying to maximize the log-likelihood of a message-passing Conditional Random Field using Optim.jl, and since I am optimizing over matrices I have to supply the gradient functions myself. Each of the individual functions works when called with test data, but when I call optimize I get the following error:
MethodError: no method matching zero(::Type{Matrix{Float64}})
I have tried all the suggestions I could find here and on StackExchange, to no avail. Any pointers would be greatly appreciated!
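If it helps, I believe the error can be reproduced with a toy problem like the one below, which makes me suspect it is the Vector{Matrix{Float64}} initial point rather than my gradient code (a minimal sketch with a made-up objective, not my actual model):

using Optim
toy(x) = sum(sum, x)              # x is a Vector of matrices here
x0 = [zeros(2, 2), zeros(2, 2)]   # Vector{Matrix{Float64}}, like my u0 below
optimize(toy, x0)                 # MethodError: no method matching zero(::Type{Matrix{Float64}})

My actual setup follows.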
using CSV, DataFrames, Optim, LogExpFunctions, BenchmarkTools

# parameter tables, stored transposed on disk:
# feature table is 321 features x 10 characters, transition table is 10 x 10
original_feature_table = CSV.read("./model/feature-params.txt", CSV.Tables.matrix; header=false, transpose=true);
original_transition_table = CSV.read("./model/transition-params.txt", CSV.Tables.matrix; header=false, transpose=true);

# data loaders defined elsewhere: each input is a sequence of 321-element
# feature vectors, each actual is the matching sequence of labels in 1:10
train_count = 50
train_actuals = get_actuals(train_count)
train_inputs = get_inputs(train_count)
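For testing the individual pieces below in isolation I use dummy data of the same shape (a sketch; the shapes are the ones implied by the 321x10 feature table):

# one dummy training example: a length-5 sequence of feature vectors and labels
dummy_input = [rand(321) for _ in 1:5]
dummy_actual = [rand(1:10) for _ in 1:5]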
function mp_calc_feature_difference(input, actual, feature_table, current_difference=zeros(321, 10))
    if isempty(input)
        return current_difference
    else
        # peel off the next feature vector and its observed character
        in_sequence = copy(input)
        actual_sequence = copy(actual)
        actual_character = popfirst!(actual_sequence)
        feature = popfirst!(in_sequence)
        # per-character energies and log-partition for this position
        feature_potentials = feature .* feature_table
        feature_Z = logsumexp(sum(feature_potentials, dims=1))
        unnormalized_feature_energy = sum(feature_potentials, dims=1)[actual_character]
        probability = exp(unnormalized_feature_energy - feature_Z)
        # accumulate the (empirical - expected) contribution for the observed character
        current_difference[:, actual_character] += ones(321) - probability * feature
        mp_calc_feature_difference(in_sequence, actual_sequence, feature_table, current_difference)
    end
end
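Called on its own with the dummy data above, this behaves as expected:

mp_calc_feature_difference(dummy_input, dummy_actual, original_feature_table)  # 321x10 Matrix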
function feature_gradient(inputs, actuals, feature_table)
    # average the per-sequence feature differences over the training set
    count = length(inputs)
    grad_matrix = zeros(321, 10)
    for i in 1:count
        grad_matrix += mp_calc_feature_difference(inputs[i], actuals[i], feature_table)
    end
    return grad_matrix / count
end
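(For context, if my derivation is right this is the usual CRF gradient: empirical feature counts minus model-expected counts, averaged over the training examples.) This also runs fine on its own:

feature_gradient([dummy_input], [dummy_actual], original_feature_table)  # 321x10 Matrix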
function mp_calc_transition_difference(actual, transition_table, previous_character=0, current_difference=zeros(10, 10))
    if isempty(actual)
        return current_difference
    else
        actual_sequence = copy(actual)
        current_character = popfirst!(actual_sequence)
        if previous_character == 0
            # first position has no predecessor; just advance
            mp_calc_transition_difference(actual_sequence, transition_table, current_character, current_difference)
        else
            trans_potentials = transition_table[previous_character, :]
            unnormalized_trans_energy = trans_potentials[current_character]
            # log-partition over successors (logsumexp, to match the feature case)
            trans_Z = logsumexp(trans_potentials)
            probability = exp(unnormalized_trans_energy - trans_Z)
            current_difference[previous_character, current_character] += 1 - probability
            mp_calc_transition_difference(actual_sequence, transition_table, current_character, current_difference)
        end
    end
end
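Again, this works in isolation:

mp_calc_transition_difference(dummy_actual, original_transition_table)  # 10x10 Matrix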
function transition_gradient(actuals, transition_table)
    # average the per-sequence transition differences over the training set
    count = length(actuals)
    grad_matrix = zeros(10, 10)
    for i in 1:count
        grad_matrix += mp_calc_transition_difference(actuals[i], transition_table)
    end
    return grad_matrix / count
end
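And likewise:

transition_gradient([dummy_actual], original_transition_table)  # 10x10 Matrix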
# in-place gradient of the negative average log-likelihood
function g!(G, x, p)
    G[1] = -feature_gradient(p[1], p[2], x[1])
    G[2] = -transition_gradient(p[2], x[2])
end
# objective: negative average log-likelihood (mp_avg_loglikelihood defined elsewhere)
f(x, p) = -mp_avg_loglikelihood(p[1], p[2], x[1], x[2])

u0 = [original_feature_table, original_transition_table]
params = [train_inputs, train_actuals]
optimize(x -> f(x, params), (G, x) -> g!(G, x, params), u0)
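One workaround I have been considering (a sketch, untested with my full model) is to flatten both matrices into a single Vector{Float64}, which Optim handles, and reshape inside the objective and gradient, but I would prefer to keep the matrix structure if there is a supported way:

# pack the two parameter matrices into one flat vector and back
pack(F, T) = vcat(vec(F), vec(T))
unpack(v) = (reshape(v[1:321*10], 321, 10), reshape(v[(321*10 + 1):end], 10, 10))

function f_flat(v, p)
    F, T = unpack(v)
    return -mp_avg_loglikelihood(p[1], p[2], F, T)
end

function g_flat!(G, v, p)
    F, T = unpack(v)
    G[1:321*10] .= vec(-feature_gradient(p[1], p[2], F))
    G[(321*10 + 1):end] .= vec(-transition_gradient(p[2], T))
end

optimize(v -> f_flat(v, params), (G, v) -> g_flat!(G, v, params), pack(original_feature_table, original_transition_table))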