Julia Performance - Help Needed

(I deleted the post above because I found an error in my code; it's fixed now.)

As others have mentioned, optimized Python code translated literally is often very slow Julia code. In Julia you want to write loops, whereas in Python you have to avoid them. After changing a few lines of vector operations into explicit loops, the Julia implementation becomes faster than the Python one. Note that I did not change the algorithm; I only expanded what NumPy does internally. The new code runs in 30 s on my computer, compared to 40 s for the Python version. This is just the lowest-hanging fruit; I believe that, with some effort, another factor of 2 is possible.
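To make the transformation concrete: a literal translation of the NumPy-style backprop step delta = (w' * delta) .* sigmoid_prime.(z) allocates temporary arrays on every call. Writing the same computation as an explicit loop into a preallocated buffer removes those allocations. A minimal sketch (backprop_step! is an illustrative name, not part of the timed code below; it is essentially what myproduct! does there):

function backprop_step!(out, w, delta, z)
    for i in axes(w, 2)                     # out = w' * delta: one entry per column of w
        acc = 0.0
        for j in axes(w, 1)
            acc += w[j, i] * delta[j]
        end
        out[i] = acc * sigmoid_prime(z[i])  # fuse the sigmoid' factor into the same pass
    end
    return out
end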

Code:

using MLDatasets, Random, BenchmarkTools, LoopVectorization, LinearAlgebra

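# One-hot encode a digit label y ∈ 0:9 as a length-10 vector.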
function vectorized_result(y)
    e = zeros(10)
    e[y + 1] = 1
    return e
end

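# Flatten a 28×28 image into a 784×1 column and one-hot encode its label.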
process_sample(x, y) = collect(reshape(x, 28 * 28, 1)), vectorized_result(y)

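# Logistic activation and its derivative (broadcast elementwise over arrays).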
@inline sigmoid(z) = 1 / (1 + exp(-z))

@inline function sigmoid_prime(z)
    s = sigmoid(z)
    return s * (1 - s)
end

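# Forward pass: affine map followed by the sigmoid at each layer.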
function predict(a, biases, weights)
    for (b, w) in zip(biases, weights)
        a = sigmoid.(w * a .+ b)
    end
    return vec(a)
end

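# Number of correctly classified test samples; y is one-hot, so y[pred] is 1.0
# exactly when the prediction is right.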
function evaluate(test_data, biases, weights)
    test_results = [(argmax(predict(x, biases, weights)), y) for (x, y) in test_data]
    return sum(y[pred] for (pred, y) in test_results)
end

cost_prime(output, y) = output - y


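# Devectorized backprop step: computes out .= (A' * x) .* sigmoid_prime.(b) in
# place, with no temporary arrays (the explicit loops replace the A' * x product).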
function myproduct!(out, A, x, b)
    (nrow, ncol) = size(A)
    for i = 1:ncol
        out_i = 0.0
        for j = 1:nrow
            out_i += A[j, i] * x[j]
        end
        out[i] = out_i
    end
    for i = 1:ncol
        out[i] *= sigmoid_prime(b[i])
    end
    return out
end

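# Backpropagation for a single sample (x, y); the gradients are written into the
# preallocated buffers delta_nabla_b / delta_nabla_w, and delta holds the layer errors.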
function backpropagation!(delta_nabla_b, delta_nabla_w, delta, x, y, biases, weights)
    a = x
    as = [a]
    zs = Matrix{Float64}[]
    for (b, w) in zip(biases, weights)
        z = w * a + b
        push!(zs, z)
        a = sigmoid.(z)
        push!(as, a)
    end
    delta[1] .= cost_prime.(as[end], y) .* sigmoid_prime.(zs[end])
    delta_nabla_b[end] .= delta[1]
    mul!(delta_nabla_w[end], delta[1], as[end - 1]')
    for l in 1:length(zs) - 1
        myproduct!(delta[l+1], weights[end - l + 1], delta[l], zs[end - l])
        delta_nabla_b[end - l] .= delta[l+1]
        mul!(delta_nabla_w[end - l], delta[l+1], as[end - l - 1]')
    end
    return delta_nabla_b, delta_nabla_w
end

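# Apply one in-place SGD update per sample in the mini-batch; the commented
# lines preserve the original allocating version for comparison.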
function update_mini_batch(minibatch, eta, biases, weights, delta_nabla_b, delta_nabla_w, delta)
    scale = eta / length(minibatch)
    for (x, y) in minibatch
        backpropagation!(delta_nabla_b, delta_nabla_w, delta, x, y, biases, weights)
        # biases  = [b - (eta / length(minibatch)) * nb for (b, nb) in zip(biases, delta_nabla_b)]
        # weights = [w - (eta / length(minibatch)) * nw for (w, nw) in zip(weights, delta_nabla_w)]
        for i in eachindex(biases)
            for j in eachindex(biases[i])
                biases[i][j] -= scale * delta_nabla_b[i][j]
            end
        end
        for i in eachindex(weights)
            @turbo for j in eachindex(weights[i])
                weights[i][j] -= scale * delta_nabla_w[i][j]
            end
        end
    end
    return biases, weights
end

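# Mini-batch SGD: shuffle, split into mini-batches, update, and report test
# accuracy after every epoch.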
function SGD(train_data, epochs, mini_batch_size, eta, biases, weights, test_data, delta_nabla_b, delta_nabla_w, delta)
    n_train = length(train_data)
    for j in 1:epochs
        shuffle!(train_data)
        train_minibatches = [train_data[k:k + mini_batch_size - 1] for k in 1:mini_batch_size:n_train]
        for minibatch in train_minibatches
            (biases, weights) = update_mini_batch(minibatch, eta, biases, weights, delta_nabla_b, delta_nabla_w, delta)
        end
        res = evaluate(test_data, biases, weights)
        println("Epoch $j: $res / $(length(test_data))")
    end
    return biases, weights
end


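# Load MNIST, build a 784-30-10 network, run one warm-up epoch to compile,
# then time ten epochs.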
let
    Random.seed!(1234)
    train_x, train_y = MNIST.traindata(Float64, 1:50_000);
    train_data = process_sample.(eachslice(train_x, dims=3), train_y);

    valid_x, valid_y = MNIST.traindata(Float64, 50_001:60_000);
    valid_data = process_sample.(eachslice(valid_x, dims=3), valid_y);

    test_x, test_y = MNIST.testdata(Float64);
    test_data = process_sample.(eachslice(test_x, dims=3), test_y);

    layer_sizes = [784, 30, 10];
    biases = [randn(l, 1) for l in layer_sizes[2:end]];
    weights = [randn(ll, l) for (l, ll) in zip(layer_sizes[1:end - 1], layer_sizes[2:end])];
    delta_nabla_b = [similar(b) for b in biases]
    delta_nabla_w = [similar(w) for w in weights]
    delta = [similar(b) for b in reverse(biases)]
    SGD(train_data, 1, 10, 3.0, biases, weights, test_data, delta_nabla_b, delta_nabla_w, delta);
    @time SGD(train_data, 10, 10, 3.0, biases, weights, test_data, delta_nabla_b, delta_nabla_w, delta);
    #@profview SGD(train_data, 1, 10, 3.0, biases, weights, test_data);
end

Edit: with one extra @turbo, the running time drops to 17 s.
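For reference, one natural placement for that macro is the A' * x reduction in myproduct!, which is the hottest remaining loop. Treat this as a sketch of a plausible placement rather than an exact diff; LoopVectorization does handle this nested-reduction pattern (it is essentially a transposed matrix-vector product):

function myproduct!(out, A, x, b)
    (nrow, ncol) = size(A)
    @turbo for i = 1:ncol        # out = A' * x, vectorized across columns
        out_i = 0.0
        for j = 1:nrow
            out_i += A[j, i] * x[j]
        end
        out[i] = out_i
    end
    for i = 1:ncol
        out[i] *= sigmoid_prime(b[i])
    end
    return out
end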
