(I deleted the above post because I found an error in my code, it’s fixed now)
As others have mentioned, optimized Python code translated literally is often very slow Julia code. In Julia you want to write loops, while in Python you have to avoid them. After changing a few lines of vector operations into loops, the Julia implementation becomes faster than the Python one. Note that I did not change the algorithm; I only expanded what NumPy does internally. The new code runs in 30 s on my computer, compared to 40 s for Python. I only picked the lowest-hanging fruit; I believe that, with some effort, another factor of 2 might be possible.
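To make the idea concrete, here is a minimal sketch of the kind of rewrite (the names `update_vectorized` and `update_loop!` are only for illustration; the real changes are in `update_mini_batch` and `myproduct!` below):

```julia
# NumPy-style update: builds a brand-new array on every call.
update_vectorized(w, nw, eta, n) = w .- (eta / n) .* nw

# Loop version: mutates w in place, allocation-free, and idiomatic in Julia.
function update_loop!(w, nw, eta, n)
    for j in eachindex(w, nw)
        w[j] -= (eta / n) * nw[j]
    end
    return w
end
```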
```julia
using MLDatasets, Random, BenchmarkTools, LoopVectorization, LinearAlgebra
# One-hot encode a digit label y ∈ 0:9 as a length-10 vector
function vectorized_result(y)
    e = zeros(10)
    e[y + 1] = 1
    return e
end
# Flatten a 28×28 image into a 784×1 column and one-hot encode its label
process_sample(x, y) = collect(reshape(x, 28 * 28, 1)), vectorized_result(y)
@inline sigmoid(z) = 1 / (1 + exp(-z))
@inline function sigmoid_prime(z)
    s = sigmoid(z)
    return s * (1 - s)
end
# Forward pass: propagate the input through all layers
function predict(a, biases, weights)
    for (b, w) in zip(biases, weights)
        a = sigmoid.(w * a .+ b)
    end
    return vec(a)
end
# Count correct predictions; y is one-hot, so y[pred] is 1.0 exactly on a hit
function evaluate(test_data, biases, weights)
    test_results = [(argmax(predict(x, biases, weights)), y) for (x, y) in test_data]
    return sum(y[pred] for (pred, y) in test_results)
end
cost_prime(output, y) = output - y
# In-place out .= (A' * x) .* sigmoid_prime.(b): backpropagates the error x
# through the transposed weight matrix A without allocating temporaries
function myproduct!(out, A, x, b)
    (Nx, Ny) = size(A)
    for i = 1:Ny
        out_i = 0.0
        for j = 1:Nx
            out_i += A[j, i] * x[j]
        end
        out[i] = out_i
    end
    for i = 1:Ny
        out[i] *= sigmoid_prime(b[i])
    end
    return nothing
end
function backpropagation!(delta_nabla_b, delta_nabla_w, delta, x, y, biases, weights)
    # Forward pass: store activations (as) and pre-activations (zs) layer by layer
    a = x
    as = [a]
    zs = Matrix{Float64}[]
    for (b, w) in zip(biases, weights)
        z = w * a + b
        push!(zs, z)
        a = sigmoid.(z)
        push!(as, a)
    end
    # Output-layer error
    delta[1] .= cost_prime.(as[end], y) .* sigmoid_prime.(zs[end])
    delta_nabla_b[end] .= delta[1]
    delta_nabla_w[end] .= delta[1] * as[end - 1]'
    # Propagate the error back through the hidden layers
    for l in 1:length(zs) - 1
        myproduct!(delta[l + 1], weights[end - l + 1], delta[l], zs[end - l])
        delta_nabla_b[end - l] .= delta[l + 1]
        mul!(delta_nabla_w[end - l], delta[l + 1], as[end - l - 1]')
    end
    return delta_nabla_b, delta_nabla_w
end
function update_mini_batch(minibatch, eta, biases, weights, delta_nabla_b, delta_nabla_w, delta)
    for (x, y) in minibatch
        delta_nabla_b, delta_nabla_w = backpropagation!(delta_nabla_b, delta_nabla_w, delta, x, y, biases, weights)
        # Allocating, vectorized versions that the loops below replace:
        # biases = [b - (eta / length(minibatch)) * nb for (b, nb) in zip(biases, delta_nabla_b)]
        # weights = [w - (eta / length(minibatch)) * nw for (w, nw) in zip(weights, delta_nabla_w)]
        for i in eachindex(biases)
            for j in eachindex(biases[i])
                biases[i][j] -= (eta / length(minibatch)) * delta_nabla_b[i][j]
            end
        end
        for i in eachindex(weights)
            @turbo for j in eachindex(weights[i])
                weights[i][j] -= (eta / length(minibatch)) * delta_nabla_w[i][j]
            end
        end
    end
    return biases, weights
end
function SGD(train_data, epochs, mini_batch_size, eta, biases, weights, test_data, delta_nabla_b, delta_nabla_w, delta)
    n_train = length(train_data)
    for j in 1:epochs
        shuffle!(train_data)
        train_minibatches = [train_data[k:k + mini_batch_size - 1] for k in 1:mini_batch_size:n_train]
        for minibatch in train_minibatches
            (biases, weights) = update_mini_batch(minibatch, eta, biases, weights, delta_nabla_b, delta_nabla_w, delta)
        end
        res = evaluate(test_data, biases, weights)
        println("Epoch $j: $res / $(length(test_data))")
    end
    return biases, weights
end
let
    Random.seed!(1234)
    # Load MNIST: 50_000 training, 10_000 validation, 10_000 test samples
    train_x, train_y = MNIST.traindata(Float64, 1:50_000);
    train_data = process_sample.(eachslice(train_x, dims=3), train_y);
    valid_x, valid_y = MNIST.traindata(Float64, 50_001:60_000);
    valid_data = process_sample.(eachslice(valid_x, dims=3), valid_y);
    test_x, test_y = MNIST.testdata(Float64);
    test_data = process_sample.(eachslice(test_x, dims=3), test_y);
    # Network with one hidden layer of 30 neurons
    layer_sizes = [784, 30, 10];
    biases = [randn(l, 1) for l in layer_sizes[2:end]];
    weights = [randn(ll, l) for (l, ll) in zip(layer_sizes[1:end - 1], layer_sizes[2:end])];
    # Buffers preallocated once and reused by backpropagation!
    delta_nabla_b = [similar(b) for b in biases]
    delta_nabla_w = [similar(w) for w in weights]
    delta = [similar(b) for b in biases[end:-1:1]]
    # First call compiles everything; the second one is the timed run
    SGD(train_data, 1, 10, 3.0, biases, weights, test_data, delta_nabla_b, delta_nabla_w, delta);
    @time SGD(train_data, 10, 10, 3.0, biases, weights, test_data, delta_nabla_b, delta_nabla_w, delta);
    #@profview SGD(train_data, 1, 10, 3.0, biases, weights, test_data);
end
```
Edit: with an extra @turbo the time goes down to 17 s.
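The edit does not say where that extra @turbo went; one plausible spot (purely my assumption) is the bias update in update_mini_batch, mirroring the weights loop that already carries one. A sketch of that variant:

```julia
using LoopVectorization

# Hypothetical: also vectorize the bias update (assumption, not confirmed by the post)
function update_biases!(biases, delta_nabla_b, eta, n)
    for i in eachindex(biases)
        @turbo for j in eachindex(biases[i])
            biases[i][j] -= (eta / n) * delta_nabla_b[i][j]
        end
    end
    return biases
end
```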