It is efficient! Case in point: these final Julia and Python versions are shorter, use only vectorized built-ins and broadcasting, and aren't too worried about allocating intermediate arrays outside of accumulating into the weights:
```julia
using LinearAlgebra, Random, Statistics
using MLDatasets

sigmoid(z) = 1 / (1 + exp(-z))

function sigmoid_prime(z)
    s = sigmoid(z)
    return s * (1 - s)
end

function predict(a, biases, weights)
    for (b, w) in zip(biases, weights)
        a = sigmoid.(w * a .+ b)
    end
    return a
end

function evaluate(test_x, test_y, biases, weights)
    test_results = predict(test_x, biases, weights)
    max_idxs = argmax(test_results, dims=1)
    # This is an array of 2d coordinates (CartesianIndex),
    # which means we can use it to check whether each one-hot vector matches
    return sum(test_y[max_idxs])
end

cost_prime(output, y) = output - y

function update_mini_batch(x, y, eta, biases, weights)
    batch_size = size(y, 2)
    a = x
    as = [a]
    zs = Matrix{Float64}[]
    for (b, w) in zip(biases, weights)
        z = w * a .+ b
        push!(zs, z)
        a = sigmoid.(z)
        push!(as, a)
    end

    delta = cost_prime.(as[end], y) .* sigmoid_prime.(zs[end])
    biases[end] .-= eta .* dropdims(mean(delta, dims=2), dims=2)
    weights[end] .-= (eta / batch_size) .* delta * as[end - 1]'

    for l in 1:length(zs) - 1
        delta = weights[end - l + 1]' * delta .* sigmoid_prime.(zs[end - l])
        biases[end - l] .-= eta .* dropdims(mean(delta, dims=2), dims=2)
        weights[end - l] .-= (eta / batch_size) .* delta * as[end - l - 1]'
    end
end

function SGD(train_x, train_y, epochs, mini_batch_size, eta, biases, weights, test_x, test_y)
    n_test = size(test_y, 2)
    n_train = size(train_y, 2)
    train_idxs = collect(1:n_train)

    for j in 1:epochs
        shuffle!(train_idxs)
        for batch_idxs in Iterators.partition(train_idxs, mini_batch_size)
            update_mini_batch(train_x[:, batch_idxs], train_y[:, batch_idxs], eta, biases, weights)
        end

        res = evaluate(test_x, test_y, biases, weights)
        println("Epoch $j: $res / $n_test")
    end

    return biases, weights
end

let
    # N.B. this would be much better off as Float32, but we're keeping it in Float64 for ease of comparison
    train_x = reshape(MNIST.traintensor(Float64, 1:50_000), 784, :)
    # I(10) allocates an identity matrix just like np.eye(10).
    # We collect it to a normal array and use fancy indexing to slice 50k one-hot vectors out of it
    train_y = collect(I(10))[:, MNIST.trainlabels(1:50_000) .+ 1]
    test_x = reshape(MNIST.testtensor(Float64), 784, :)
    test_y = collect(I(10))[:, MNIST.testlabels() .+ 1]

    layer_sizes = (784, 30, 10)
    biases = [randn(l) for l in layer_sizes[2:end]]
    weights = [randn(ll, l) for (l, ll) in zip(layer_sizes[1:end - 1], layer_sizes[2:end])]

    @time SGD(train_x, train_y, 10, 10, 3.0, biases, weights, test_x, test_y);
end
```
```python
import gzip
import pickle

import numpy as np


class Network:
    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.biases = [np.random.randn(y) for y in sizes[1:]]
        self.weights = [np.random.randn(x, y) for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(a @ w + b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data):
        test_x, test_y = test_data
        n_test = test_y.shape[0]
        train_x, train_y = training_data
        n_train = train_y.shape[0]
        train_idxs = np.arange(n_train)
        for j in range(epochs):
            np.random.shuffle(train_idxs)
            for k in range(0, n_train, mini_batch_size):
                batch_idxs = train_idxs[k : k + mini_batch_size]
                self.update_mini_batch(
                    train_x[batch_idxs, :], train_y[batch_idxs, :], eta
                )
            print(f"Epoch {j} : {self.evaluate(test_x, test_y)} / {n_test}")

    def update_mini_batch(self, x, y, eta):
        batch_size = y.shape[0]
        activation = x
        activations = [x]  # list to store all the activations, layer by layer
        zs = []  # list to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = activation @ w + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # backward pass
        delta = cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
        self.biases[-1] -= eta * delta.mean(0)
        self.weights[-1] -= (eta / batch_size) * (activations[-2].T @ delta)
        for l in range(2, self.num_layers):
            z = zs[-l]
            delta = (delta @ self.weights[-l + 1].T) * sigmoid_prime(z)
            self.biases[-l] -= eta * delta.mean(0)
            self.weights[-l] -= (eta / batch_size) * (activations[-l - 1].T @ delta)

    def evaluate(self, test_x, test_y):
        max_idxs = np.expand_dims(self.feedforward(test_x).argmax(-1), -1)
        return np.take_along_axis(test_y, max_idxs, -1).sum()


#### Miscellaneous functions
def cost_derivative(output_activations, y):
    return output_activations - y


def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))


def sigmoid_prime(z):
    x = sigmoid(z)
    return x * (1 - x)


def load_data(data_path):
    with gzip.open(data_path, "rb") as f:
        (train_x, train_y), _, (test_x, test_y) = pickle.load(f, encoding="latin1")
    train_x = train_x.reshape(-1, 784)
    train_y = np.eye(10)[train_y, :]
    test_x = test_x.reshape(-1, 784)
    test_y = np.eye(10)[test_y, :]
    return (train_x, train_y), (test_x, test_y)


if __name__ == "__main__":
    import time

    training_data, test_data = load_data("mnist.pkl.gz")
    training_data = list(training_data)
    net = Network((784, 30, 10))
    tstart = time.perf_counter()
    net.SGD(training_data, 10, 10, 3.0, test_data=test_data)
    tstop = time.perf_counter()
    print(f"{tstop - tstart:0.4f}")
```
Benchmarking on my machine, these are within ~200ms of each other and at least 2x faster than the next fastest solution so far. The trick? We make as much use of the efficient default matmul as possible by passing in a whole batch of data instead of feeding in one sample at a time. This is exactly how most ML frameworks do things, and the original author came so, so close to doing it too (but inexplicably decided not to). I'd argue the implementation isn't any more complex or obscure than the original either. For reference, most of the actual calculations were pulled from an old homework assignment.
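To make the batching point concrete, here's a minimal standalone sketch (not part of the solutions above; the sizes and the BenchmarkTools timing harness are purely illustrative) contrasting one matrix-vector product per sample against a single matrix-matrix product over the whole batch:

```julia
# Minimal sketch: per-sample matvecs vs. one batched matmul for a 784 -> 30 layer.
# Sizes are illustrative; assumes BenchmarkTools is installed.
using LinearAlgebra, BenchmarkTools

W = randn(30, 784)
X = randn(784, 10_000)                                  # 10k samples stored as columns

per_sample(W, X) = [W * X[:, i] for i in axes(X, 2)]    # one matrix-vector product per sample
batched(W, X)    = W * X                                # a single matrix-matrix product for the batch

@btime per_sample($W, $X);
@btime batched($W, $X);
```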
So TL;DR: the default matmul is the default because it's darned fast, and it's darned fast in most languages because they all wrap similar optimized libraries (OpenBLAS, MKL, BLIS, etc.). For better or worse, you're unlikely to squeeze more juice out of this benchmark because most of the load-bearing routines have already been optimized to the nines. Where a language that can perform aggressive compiler optimizations, like Julia, really shines is on problems that don't already have an optimized library ready to go; see Useful Algorithms That Are Not Optimized By Jax, PyTorch, or Tensorflow - Stochastic Lifestyle for a great example.
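As a rough illustration of that last point (again a standalone sketch, not taken from the benchmark), the built-in `*` on dense `Float64` matrices ends up in a BLAS gemm routine either way, which is why a hand-rolled loop has little room to win here:

```julia
# Standalone sketch: a naive triple loop vs. the BLAS-backed built-in matmul.
using LinearAlgebra

A, B = randn(512, 512), randn(512, 512)

function naive_matmul(A, B)
    C = zeros(size(A, 1), size(B, 2))
    @inbounds for j in axes(B, 2), k in axes(A, 2), i in axes(A, 1)
        C[i, j] += A[i, k] * B[k, j]
    end
    return C
end

C_loop = naive_matmul(A, B)
C_star = A * B                          # dispatches to the bundled BLAS (OpenBLAS by default)
C_inpl = mul!(similar(C_star), A, B)    # in-place form, same BLAS call underneath
@assert C_loop ≈ C_star && C_star ≈ C_inpl
```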