Hi,
I am trying to write a function to compute the derivative of a cost function.
The cost function expresses how well a distance function separates labeled data into its respective classes. I'm trying to use the derivative to run gradient descent and find a good set of weights for this distance function. Since the derivative will be evaluated many times during the optimization, I want the calculation to be as fast as possible.
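For context, this is roughly how I intend to use it. `fit_weights` below is only an illustrative sketch (the step size, iteration count, and the projection back onto nonnegative, normalized weights are placeholders I made up), with `costd`, `normalize`, `S`, `Sp`, and `X` as defined further down:

function fit_weights(distance, S, Sp, X; η = 0.01, maxiter = 100)  # placeholder hyperparameters
    for _ in 1:maxiter
        dw, cost = costd(distance, S, Sp, X)                   # gradient and current cost (cost could be logged)
        w = normalize(max.(distance.weights .- η .* dw, 0.0))  # descend; keep weights nonnegative and summing to 1
        distance = WeightedPeriodicNormalizedMinkowski(w, distance.periods, distance.variances, distance.p)
    end
    distance
end
# would be called as: fit_weights(distance, S, Sp, X)   # names from the MWE below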
I am open to any packages or other work that could help solve this problem. I looked at some Julia optimization packages, but none seemed suited to a large hand-written derivative like this, so I wrote my own function for the derivative.
Here's the code:
"""
Computes the derivative of the cost function wrt the weights of the distance function
param dist : Distance object from Distances.jl package
param S : R+ -> R+, Score function, monotonically decreasing (such as E^-x)
param S′: R+ -> R, Derivative of score function
param X: Vector of feature matrices per class.
"""
function costd(dist::WeightedPeriodicNormalizedMinkowski, S::Function, S′::Function, X::Vector{Matrix})
d(p, q) = (x = dist(p, q); x == 0 ? 1e-6 : x)  # guard against 0^(negative power) when a point is compared with itself
cost = 0
dw = zeros(length(dist.weights))
for Xi in X # for class in X
for xi in eachcol(Xi) # for point in current class
# scored distances to points in the same class
DXi = sum(S(dist(xi, q)) for q = eachcol(Xi))
DXl = sum(S(dist(xi, q)) for Xl = X if Xl != Xi for q = eachcol(Xl) )
cost += DXl / DXi
for i in 1:length(dist.weights)
dDXi = sum(Distances.eval_op(dist, xi[i], q[i], dist.periods[i], dist.variances[i], 1) * d(xi, q) ^ (1 - dist.p) * S′(dist(xi, q)) for q = eachcol(Xi) )
dDXl = sum(Distances.eval_op(dist, xi[i], q[i], dist.periods[i], dist.variances[i], 1) * dist(xi, q) ^ (1 - dist.p) * S′(dist(xi, q)) for Xl in X if Xl != Xi for q = eachcol(Xl) )
dw[i] += 1/dist.p * (dDXl - DXl * dDXi / DXi ) / DXi
end
end
end
dw, cost
end
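To spell out what is being computed: for each point xi, DXi is the sum of scored distances to points in its own class and DXl the sum of scored distances to points in all other classes, and the cost is the sum over all points of DXl / DXi. The inner loop over i is just the quotient rule, d(DXl/DXi) = (dDXl - (DXl/DXi) * dDXi) / DXi, combined with the chain rule through the p-th root of the metric: since dist = (sum_j w_j * e_j)^(1/p), where e_j is the per-coordinate term, d(dist)/dw_i = 1/p * dist^(1-p) * e_i, which (if I haven't slipped up) is where the eval_op(..., 1) * dist^(1-p) factors come from.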
With two classes of 300 points each, this takes 30s to compute.
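For anyone who wants to reproduce the timing, something like this shows it (with distance, S, Sp, and X as in the MWE below); the first call also includes compilation:

costd(distance, S, Sp, X)         # first call, includes compilation
@time costd(distance, S, Sp, X)   # ≈ 30 s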
And here it is embedded in an MWE:
MWE code
module Optimizer
import Distances
struct WeightedPeriodicNormalizedMinkowski{W <: AbstractArray, T <: AbstractArray, V <: AbstractArray, P <: Real} <: Distances.UnionMetric
weights::W
periods::T
variances::V
p::P
end
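# Tiles the per-feature parameters r times; the weights are rescaled by 1/r so they still sum to 1.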
Base.repeat(wpm::WeightedPeriodicNormalizedMinkowski, r:: Integer) = WeightedPeriodicNormalizedMinkowski(
repeat(wpm.weights, r)/r, repeat(wpm.periods, r), repeat(wpm.variances, r), wpm.p)
# -------------------------------------------------------------------------------------
# Help me optimize this please!
# For reference, computing a full pairwise distance matrix takes only 2-3 s on a dataset with 20 classes of ~300 data points each.
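# (By "distance matrix" I mean roughly the following; Xall is just an illustrative name:
#     Xall = hcat(X...)                               # 48 x N matrix of all points
#     D = Distances.pairwise(distance, Xall, dims=2)  # N x N matrix of pairwise distances
# )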
"""
Computes the derivative of the cost function wrt the weights of the distance function
param dist : Distance object from Distances.jl package
param S : R+ -> R+, Score function, monotonically decreasing (such as E^-x)
param S′: R+ -> R, Derivative of score function
param X: Vector of feature matrices per class.
"""
function costd(dist::WeightedPeriodicNormalizedMinkowski, S::Function, S′::Function, X::Vector{Matrix})
d(p, q) = (x = dist(p, q); x == 0 ? 1e-6 : x)  # guard against 0^(negative power) when a point is compared with itself
cost = 0
dw = zeros(length(dist.weights))
for Xi in X # for class in X
for xi in eachcol(Xi) # for point in current class
# scored distances to points in the same class
DXi = sum(S(dist(xi, q)) for q = eachcol(Xi))
DXl = sum(S(dist(xi, q)) for Xl = X if Xl != Xi for q = eachcol(Xl) )
cost += DXl / DXi
for i in 1:length(dist.weights)
dDXi = sum(Distances.eval_op(dist, xi[i], q[i], dist.periods[i], dist.variances[i], 1) * d(xi, q) ^ (1 - dist.p) * S′(dist(xi, q)) for q = eachcol(Xi) )
dDXl = sum(Distances.eval_op(dist, xi[i], q[i], dist.periods[i], dist.variances[i], 1) * dist(xi, q) ^ (1 - dist.p) * S′(dist(xi, q)) for Xl in X if Xl != Xi for q = eachcol(Xl) )
dw[i] += 1/dist.p * (dDXl - DXl * dDXi / DXi ) / DXi
end
end
end
dw, cost
end
# ------------------------------------------------------------------------------------
# Example of how to use this function:
X = [ 100 * rand(Float64, (48, rand(150:400))), 75 * rand(Float64, (48, rand(300:400)))]
S(x) = exp(-x)
Sp(x) = -exp(-x)
normalize(v::Vector) = v / sum(v)
distance = repeat(WeightedPeriodicNormalizedMinkowski(
normalize([12.0, 2.0, 1.0, 6.0, 3.0, 2.0, 2.0, 6.0]), # weights
[Inf, 360.0, 180, Inf, Inf, Inf, Inf, 360], # periods
[1.0, 5, 10, 20, 20, 20, 20, 10], # measurement variance
2),
6)
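# quick sanity check on the tiled metric (illustrative)
@assert length(distance.weights) == 48   # one weight per feature
@assert sum(distance.weights) ≈ 1.0      # repeat rescales by 1/r, so the weights stay normalized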
# takes about 30s
# costd(distance, S, Sp, X)
# A real world use case would be about 50 classes with 500 data points each
# X = [ 100 * rand(Float64, (48, rand(150:400))) for _ = 1:50]
# costd(distance, S, Sp, X) # doesn't finish
# --------------------------------------------------------
# Everything below this just patches Distances.jl to support a metric with 3 parameters; you can ignore it
# --------------------------------------------------------
WeightedPeriodicNormalizedMinkowski(weights::AbstractArray) = (periods::AbstractArray) -> (variances::AbstractArray) -> (p :: Real) -> WeightedPeriodicNormalizedMinkowski(weights, periods, variances, p)
WeightedPeriodicNormalizedMinkowski(weights::AbstractArray, periods::AbstractArray, variances::AbstractArray) = WeightedPeriodicNormalizedMinkowski(weights)(periods)(variances)
WeightedPeriodicNormalizedMinkowski(wpm::WeightedPeriodicNormalizedMinkowski, p::Real) = WeightedPeriodicNormalizedMinkowski(wpm.weights, wpm.periods, wpm.variances, p)
Distances.parameters(wpm::WeightedPeriodicNormalizedMinkowski) = (wpm.periods, wpm.variances, wpm.weights)
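# Per-coordinate term: with s2 = mod(|ai - bi|, Ti), eval_op contributes wi * (min(s2, Ti - s2) / vi)^p,
# and eval_end takes the final p-th root, so the full metric is
#   ( sum_i w_i * (min(s2_i, T_i - s2_i) / v_i)^p )^(1/p)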
@inline function Distances.eval_op(d::WeightedPeriodicNormalizedMinkowski, ai, bi, Ti, vi, wi)
# Taken from the PeriodicEuclidean function
s1 = abs(ai - bi)
s2 = mod(s1, Ti)
# taken from the WeightedMinkowski function with some modifications.
(min(s2, Ti - s2) / vi)^d.p * wi
end
@inline Distances.eval_end(d::WeightedPeriodicNormalizedMinkowski, s) = s^(1/d.p)
wpnminkowski(a, b, w::AbstractArray, T::AbstractArray, v::AbstractArray, p::Real) = WeightedPeriodicNormalizedMinkowski(w, T, v, p)(a, b)
wpnminkowski(a, w::AbstractArray, T::AbstractArray, v::AbstractArray, p::Real) = wpnminkowski(a, a, w, T, v, p)
(w::WeightedPeriodicNormalizedMinkowski)(a,b) = Distances._evaluate(w, a, b)
Distances._eval_start(d::Distances.UnionMetrics, ::Type{Ta}, ::Type{Tb}, (p1, p2, p3)) where {Ta,Tb} =
zero(typeof(Distances.eval_op(d, oneunit(Ta), oneunit(Tb), oneunit(Distances.eltype(p1)), oneunit(Distances.eltype(p2)), oneunit(Distances.eltype(p3)) )))
function Distances._evaluate(dist::Distances.UnionMetrics, a::Number, b::Number, p1::Number, p2::Number, p3::Number)
Distances.eval_end(dist, Distances.eval_op(dist, a, b, p1, p2, p3))
end
Distances.result_type(dist::Distances.UnionMetrics, ::Type{Ta}, ::Type{Tb}, (p1, p2, p3)) where {Ta,Tb} =
typeof(Distances._evaluate(dist, oneunit(Ta), oneunit(Tb), oneunit(Distances.eltype(p1)), oneunit(Distances.eltype(p2)), oneunit(Distances.eltype(p3))))
Base.@propagate_inbounds function Distances._evaluate(d::Distances.UnionMetrics, a::AbstractArray, b::AbstractArray, (p1, p2, p3)::Tuple{AbstractArray, AbstractArray, AbstractArray})
@boundscheck if length(a) != length(b)
throw(DimensionMismatch("first array has length $(length(a)) which does not match the length of the second, $(length(b))."))
end
@boundscheck if length(a) != length(p1)
throw(DimensionMismatch("arrays have length $(length(a)) but parameter 1 has length $(length(p1))."))
end
@boundscheck if length(a) != length(p2)
throw(DimensionMismatch("arrays have length $(length(a)) but parameter 2 has length $(length(p2))."))
end
@boundscheck if length(a) != length(p3)
throw(DimensionMismatch("arrays have length $(length(a)) but parameter 3 has length $(length(p3))."))
end
if length(a) == 0
return zero(Distances.result_type(d, a, b))
end
@inbounds begin
s = Distances.eval_start(d, a, b)
if (IndexStyle(a, b, p1, p2, p3) === IndexLinear() && eachindex(a) == eachindex(b) == eachindex(p1) == eachindex(p2) == eachindex(p3)) ||
axes(a) == axes(b) == axes(p1) == axes(p2) == axes(p3)
@simd for I in eachindex(a, b, p1, p2, p3)
ai = a[I]
bi = b[I]
p1i = p1[I]
p2i = p2[I]
p3i = p3[I]
s = Distances.eval_reduce(d, s, Distances.eval_op(d, ai, bi, p1i, p2i, p3i))
end
else
for (ai, bi, p1i, p2i, p3i) in zip(a, b, p1, p2, p3)
s = Distances.eval_reduce(d, s, Distances.eval_op(d, ai, bi, p1i, p2i, p3i))
end
end
return Distances.eval_end(d, s)
end
end
end