The main issue is that all(sum(combination) .== sum_target) allocates two arrays at each iteration of the for loop, which ends up being very costly. You could instead do the comparisons lazily, in a way that fully avoids allocations. Most importantly, as @Oscar_Smith suggested, this would also allow you to terminate the computations as soon as there is a disagreement, instead of having to compute the whole sum before doing the comparisons.
Below are a few alternatives. Note that I have moved the allocation of the large arrays to outside of the function.
function f2(array_1, array_2, sum_target)
solutions = Tuple{eltype(array_1), eltype(array_2)}[]
iterator_over_combinations = Iterators.product(array_1, array_2)
for combination in iterator_over_combinations
if all(sum(combination) .== sum_target)
push!(solutions, combination)
end
end
solutions
end
function f3(array_1, array_2, sum_target)
solutions = Tuple{eltype(array_1), eltype(array_2)}[]
iterator_over_combinations = Iterators.product(array_1, array_2)
for combination in iterator_over_combinations
success = all(
vs -> vs[1] + vs[2] == vs[3],
zip(combination..., sum_target),
)
if success
push!(solutions, combination)
end
end
solutions
end
function f4(array_1, array_2, sum_target)
solutions = Tuple{eltype(array_1), eltype(array_2)}[]
iterator_over_combinations = Iterators.product(array_1, array_2)
for combination in iterator_over_combinations
success = true
a, b = combination
for i ∈ eachindex(sum_target)
if a[i] + b[i] ≠ sum_target[i]
success = false
break
end
end
if success
push!(solutions, combination)
end
end
solutions
end
n_elements = 50
m = 500
array_1 = [Int64.(rand(Int8, m)) for i in 1:n_elements]
array_2 = [Int64.(rand(Int8, m)) for i in 1:n_elements]
sum_target = Int64.(rand(Int8, m))
@btime f() # 5.152 ms (10205 allocations: 20.98 MiB)
@btime f2($array_1, $array_2, $sum_target) # 5.142 ms (10001 allocations: 20.52 MiB)
@btime f3($array_1, $array_2, $sum_target) # 14.733 μs (1 allocation: 64 bytes)
@btime f4($array_1, $array_2, $sum_target) # 10.009 μs (1 allocation: 64 bytes)
Note that f2 is nearly the same as f, and the performance is the same, while the f3 and f4 variants are almost 3 orders of magnitude faster. Moreover, f4 is (in my opinion) more readable and ends up being the fastest option.