Subset sum problem with SymbolicRegressions.jl

Trying to solve Digits: A Daily Math Puzzle - The New York Times with Julia and it’s clearly a NP-complete problem because it’s harder than the Subset sum problem which is known to be NP-complete.

The current attempt is:

julia> using SymbolicRegression

julia> function myloss(tree, dataset, options)
           @show tree
       end
myloss (generic function with 2 methods)

julia> options = SymbolicRegression.Options(
           binary_operators=[+, *, SymbolicRegression.div, -],
           npopulations=1,
           complexity_of_constants=99999,
           loss_function = myloss
       );

julia> X = reshape([1,2,4,5,10,25], 6, 1); y = [94.0]
1-element Vector{Float64}:
 94.0

julia> hof = EquationSearch(X, y; options=options, niterations=30)
tree = 94.0
ERROR: MethodError: Cannot `convert` an object of type Node{Float64} to an object of type Float64

The error makes sense but the showed tree = 94.0 is a constant solution which should have been prohibited due to large complexity_of_constants, any hint?

1 Like

You succeeded in your nerd snipe :grin:

Here’s the fix:

using SymbolicRegression

# Count operators:
function count_binary_operators(tree::Node)::Dict{Int,Int}
    if tree.degree == 0
        Dict{Int,Int}()
    elseif tree.degree == 1
        count_binary_operators(tree.l)
    else
        left = count_binary_operators(tree.l)
        right = count_binary_operators(tree.r)
        # Merge counts:
        for (k,v) in right
            left[k] = get(left, k, 0) + v
        end
        # Add count for this node:
        left[tree.op] = get(left, tree.op, 0) + 1
        left
    end
end

function myloss(tree, dataset::Dataset{T,L}, options) where {T,L}
    prediction, completed = eval_tree_array(tree, dataset.X, options)
    !completed && return L(Inf)
    loss = sum(abs2, prediction - dataset.y)

    counts = count_binary_operators(tree)
    penalty = L(0)
    for op in 1:4
        if !haskey(counts, op) || counts[op] != 1
            penalty += L(1)
        end
    end

    return loss + penalty
end

options = Options(;
    binary_operators=[+, *, /, -],
    complexity_of_constants=100,
    loss_function=myloss,
)

EquationSearch([1 2 4 5 10 25]', [94.0]; options)

This gave me the output:

(((x5 * x5) - (x1 + x4)) / x1)

Is that correct? Or maybe another constraint is needed?

1 Like

If you need to have at most 1 use of every integer as well, then here’s the fix:

using SymbolicRegression

function combine_counts(l, r)
    for (k, v) in r
        l[k] = get(l, k, 0) + v
    end
    l
end


# Count operators and features
function get_counts(tree::Node)
    if tree.degree == 0
        Dict{Int,Int}(), (tree.constant ? Dict{Int,Int}() : Dict([tree.feature => 1]))
    elseif tree.degree == 1
        get_counts(tree.l)
    else
        left_ops, left_features = get_counts(tree.l)
        right_ops, right_features = get_counts(tree.r)

        # Merge counts:
        ops = combine_counts(left_ops, right_ops)
        features = combine_counts(left_features, right_features)

        # Add count for this node:
        ops[tree.op] = get(ops, tree.op, 0) + 1

        ops, features
    end
end

function myloss(tree, dataset::Dataset{T,L}, options) where {T,L}
    prediction, completed = eval_tree_array(tree, dataset.X, options)
    !completed && return L(Inf)

    loss = sum(abs2, prediction .- dataset.y)

    operator_counts, feature_counts = get_counts(tree)
    penalty = L(0)

    for op in 1:4
        if haskey(operator_counts, op) && operator_counts[op] > 1
            penalty += L(100)
        end
    end

    for feature in 1:size(dataset.X, 1)
        if haskey(feature_counts, feature) && feature_counts[feature] > 1
            penalty += L(100)
        end
    end

    return loss + penalty
end

options = Options(;
    binary_operators=[+, *, /, -],
    complexity_of_constants=100,
    loss_function=myloss,
)

X = [1 2 4 5 10 25]
y = [94.0]

EquationSearch(X', y;
               options,
               niterations=1000,
               varMap=[string(x) for x in [X...]],
)

This gives me:

((4 * 25) - (5 + 1))

which looks correct

3 Likes

Solving today’s Digits problem:

I plug in:

EquationSearch([2 3 5 11 15 25]', [59.0]; options, niterations=1000, varMap=["2", "3", "5", "11", "15", "25"])

which gives me:

((3 * 15) + (25 - 11))

which is correct:

amazing. Sorry for the nerd snipe but this is amazing

1 Like

Fixed it allow some operators to not be used. So now it finds the minimal solution.

1 Like

btw why is there a conflict between Base.div and SR.div?

At one point I needed to have a separate SymbolicRegression.div to trigger a separate printing method. But it isn’t needed anymore and I could remove it.

(It automatically converts (/) => SymbolicRegression.div in the construction of Options)

1 Like