Hello. Given the promises of performance, I’ve started my Julia work with a translation of a simple deep learning algo to Julia. My Julia code is below and the original Python code network.py is from the book by Michael Nielsen (GitHub - mnielsen/neural-networks-and-deep-learning: Code samples for my book "Neural Networks and Deep Learning"). The Julia performance is about 2.5 times slower than the Python one. Please help to understand where I am loosing my performance. I know the code is not optimized (both Julia and Python), but they are in line with one another. If I can at least match Python’s performance, that would be great, but, of course, it is desirable to outperform it. Thanks.

P.S. I’ve made changes based on several suggestions, but underperformance remains at the same level. I’ve been looking at CPU utilization and it is at about 30% with Julia code and 70% with Python code. Julia seems to be running on 1 core, while Python is not that clear, but looks like it utilizes more cores by default (does this make sense?). If that is the case, then I need to parallelize the Julia code to be compared with Python at the same CPU utilization. Any suggestions? Thanks guys.

===================================================================

```
using MLDatasets, Random
# Data
train_x_orig, train_y_orig = MNIST.traindata();
test_x_orig, test_y_orig = MNIST.testdata();
function vectorized_result(y)
e = zeros(10)
e[y+1] = 1
return e
end
train_x, train_y = Float64.(reshape(train_x_orig[:,:,1:50000], (28*28, :))), [vectorized_result(y) for y in train_y_orig[1:50000]];
valid_x, valid_y = Float64.(reshape(train_x_orig[:,:,50001:end], (28*28, :))), train_y_orig[50001:end];
test_x, test_y = Float64.(reshape(test_x_orig, (28*28, :))), test_y_orig;
function sigmoid(z)
return @. 1.0./(1.0 + exp(-z))
end
function sigmoid_prime(z)
s = sigmoid(z);
return s.*(1-s);
end
function predict(a, biases, weights)
for (b, w) in zip(biases, weights)
a = sigmoid.(w*a .+ b)
end
return vec(a)
end
function evaluate(test_x, test_y, biases, weights)
test_results = [argmax(predict(col, biases, weights))-1 for col in eachcol(test_x)]
return sum(test_results.==test_y)
end
function cost_prime(output, y)
return output - y
end
function backpropagation(x, y, biases, weights)
nabla_b = [zeros(size(b)) for b in biases];
nabla_w = [zeros(size(w)) for w in weights];
a = x;
as = [a];
zs = Vector{Float64}[];
for (b, w) in zip(biases, weights)
z = w * a + b;
push!(zs, z);
a = sigmoid.(z);
push!(as, vec(a));
end
delta = cost_prime.(as[end], y) .* sigmoid_prime.(zs[end]);
nabla_b[end] = delta;
nabla_w[end] = delta*as[end-1]';
for l=1:length(zs)-1
delta = weights[end-l+1]' * delta .* sigmoid_prime.(zs[end-l]);
nabla_b[end-l] = delta;
nabla_w[end-l] = delta * as[end-l-1]';
end
return (nabla_b, nabla_w);
end
function update_mini_batch(train_x_minibatch, train_y_minibatch, mini_batch_size, eta, biases, weights)
nabla_b = [zeros(size(b)) for b in biases];
nabla_w = [zeros(size(w)) for w in weights];
for i=1:mini_batch_size
(delta_nabla_b, delta_nabla_w) = backpropagation(train_x_minibatch[:, i], train_y_minibatch[i], biases, weights);
nabla_b = [nb + dnb for (nb, dnb) in zip(nabla_b, delta_nabla_b)]; nabla_w = [nw + dnw for (nw, dnw) in zip(nabla_w, delta_nabla_w)]; # Loop over layers
end
biases = [b - nb*eta/mini_batch_size for (b, nb) in zip(biases, nabla_b)];
weights = [w - nw*eta/mini_batch_size for (w, nw) in zip(weights, nabla_w)];
return (biases, weights);
end
function SGD(train_x, train_y, epochs, mini_batch_size, eta, biases, weights, test_x, test_y)
n_test = length(test_y);
n_train = length(train_y);
for j=1:epochs
train_x_minibatches = [train_x[:, k:k+mini_batch_size-1] for k=1:mini_batch_size:n_train];
train_y_minibatches = [train_y[k:k+mini_batch_size-1] for k=1:mini_batch_size:n_train];
for mini_batch=1:length(train_y_minibatches)
(biases, weights) = update_mini_batch(train_x_minibatches[mini_batch], train_y_minibatches[mini_batch], mini_batch_size, eta, biases, weights);
end
res = evaluate(test_x, test_y, biases, weights);
print("Epoch $j: $res / $n_test\n");
end
return (biases, weights);
end
layer_sizes = [784, 30, 10];
biases = [randn(l) for l in layer_sizes[2:end]];
weights = [randn(ll, l) for (l, ll) in zip(layer_sizes[1:end-1], layer_sizes[2:end])];
@time (biases_opt, weights_opt) = SGD(train_x, train_y, 5, 10, 3.0, biases, weights, test_x, test_y);
```