Hello,
I am trying to fit a Conditional Random Field with Optim.jl by maximizing its average log-likelihood, which I compute with message passing. Since the parameters I am optimizing over are matrices, I have to supply the gradient functions myself. Each of the individual functions works when called with test data, but when I call optimize I get the following error:
MethodError: no method matching zero(::Type{Matrix{Float64}})
I’ve tried every suggestion I could find here and on StackExchange, to no avail.
Any pointers would be much appreciated!
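For what it's worth, I think I can reproduce the error outside of Optim.jl: Julia defines zero for a concrete matrix, but not for the Matrix{Float64} type itself, so calling zero on my vector-of-matrices starting point fails. My guess is that something inside optimize does exactly this, but I don't know how to work around it:

zero(zeros(321, 10))                   # fine: zero of a concrete matrix
zero([zeros(321, 10), zeros(10, 10)])  # MethodError: no method matching zero(::Type{Matrix{Float64}})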
using CSV, DataFrames, Optim, LogExpFunctions, BenchmarkTools
# Initial parameter matrices: 321×10 feature weights and 10×10 transition weights
original_feature_table = CSV.read("./model/feature-params.txt", CSV.Tables.matrix; header=false, transpose=true);
original_transition_table = CSV.read("./model/transition-params.txt", CSV.Tables.matrix; header=false, transpose=true);
train_count = 50
# get_actuals and get_inputs are defined elsewhere; they return vectors of
# label sequences and feature-vector sequences, respectively
train_actuals = get_actuals(train_count)
train_inputs = get_inputs(train_count)
function mp_calc_feature_difference(input, actual, feature_table, current_difference=zeros(321, 10))
	if isempty(input)
		return current_difference
	else
		in_sequence = copy(input)
		actual_sequence = copy(actual)
		actual_character = popfirst!(actual_sequence)

		# Per-position feature potentials and the model probability of the true character
		feature = popfirst!(in_sequence)
		feature_potentials = feature .* feature_table
		feature_Z = logsumexp(sum(feature_potentials, dims=1))
		unnormalized_feature_energy = sum(feature_potentials, dims=1)[actual_character]
		probability = exp(unnormalized_feature_energy - feature_Z)

		# Accumulate the gradient contribution in the true character's column
		current_difference[:, actual_character] += ones(321) - probability * feature

		mp_calc_feature_difference(in_sequence, actual_sequence, feature_table, current_difference)
	end
end
# Average the per-sequence feature differences over the training set
function feature_gradient(inputs, actuals, feature_table)
	count = length(inputs)
	grad_matrix = zeros(321, 10)
	for i in 1:count
		grad_matrix += mp_calc_feature_difference(inputs[i], actuals[i], feature_table)
	end
	return grad_matrix / count
end
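For reference, this is the kind of standalone call that works for me; the dummy data below is just to show the expected shapes (sequences of 321-element feature vectors and label sequences with values in 1:10):

dummy_inputs = [[rand(321) for _ in 1:5] for _ in 1:3]  # 3 sequences, 5 positions each
dummy_actuals = [rand(1:10, 5) for _ in 1:3]            # matching label sequences
feature_gradient(dummy_inputs, dummy_actuals, zeros(321, 10))  # returns a 321×10 matrix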
function mp_calc_transition_difference(actual, transition_table, previous_character=0, current_difference=zeros(10, 10))
	if isempty(actual)
		return current_difference
	else
		actual_sequence = copy(actual)
		current_character = popfirst!(actual_sequence)
		if previous_character == 0
			# First position has no incoming transition; just advance
			mp_calc_transition_difference(actual_sequence, transition_table, current_character, current_difference)
		else
			trans_potentials = transition_table[previous_character, :]
			unnormalized_trans_energy = trans_potentials[current_character]
			trans_Z = sum(trans_potentials)

			probability = exp(unnormalized_trans_energy - trans_Z)
			current_difference[previous_character, current_character] += 1 - probability

			mp_calc_transition_difference(actual_sequence, transition_table, current_character, current_difference)
		end
	end
end
# Average the per-sequence transition differences over the training set
function transition_gradient(actuals, transition_table)
	count = length(actuals)
	grad_matrix = zeros(10, 10)
	for i in 1:count
		grad_matrix += mp_calc_transition_difference(actuals[i], transition_table)
	end
	return grad_matrix / count
end
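The transition side checks out the same way on dummy labels:

dummy_actuals = [rand(1:10, 5) for _ in 1:3]
transition_gradient(dummy_actuals, randn(10, 10))  # returns a 10×10 matrix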
function g!(G, x, p)
	# Negate the gradients since f is the negative average log-likelihood
	G[1] = -feature_gradient(p[1], p[2], x[1])
	G[2] = -transition_gradient(p[2], x[2])
end
# mp_avg_loglikelihood is defined elsewhere and also works on test data
f(x, p) = -mp_avg_loglikelihood(p[1], p[2], x[1], x[2])
u0 = [original_feature_table, original_transition_table]
params = [train_inputs, train_actuals]
optimize(x -> f(x, params), (G, x) -> g!(G, x, params), u0)
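One workaround I have been wondering about, in case Optim.jl simply cannot optimize over a vector of matrices: flatten both parameter matrices into a single Float64 vector and unpack them inside the objective and gradient. A rough, untested sketch (pack, unpack, f_flat, and g_flat! are just names I made up):

pack(F, T) = vcat(vec(F), vec(T))  # 321*10 feature weights, then 10*10 transition weights
function unpack(x)
	F = reshape(x[1:3210], 321, 10)
	T = reshape(x[3211:3310], 10, 10)
	return F, T
end

function f_flat(x)
	F, T = unpack(x)
	return -mp_avg_loglikelihood(train_inputs, train_actuals, F, T)
end

function g_flat!(G, x)
	F, T = unpack(x)
	G[1:3210] .= vec(-feature_gradient(train_inputs, train_actuals, F))
	G[3211:3310] .= vec(-transition_gradient(train_actuals, T))
end

optimize(f_flat, g_flat!, pack(original_feature_table, original_transition_table))

Would something like this be the recommended approach, or is there a way to keep the matrices separate?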