I just translated a neural network from Python to Julia. It is already significantly faster than the Python version, but the Pascal implementation is still almost 20 times faster than the Julia one. Using the @time macro I identified this code block as the main source of the slow performance:
function dosomething(weights::Vector{Array{Float64,2}}, bias::Vector{Array{Float64,1}}, lR::Real, dError::Vector{Array{Float64,1}}, inputs::Vector{Array{Float64,1}})
    lLay = length(dError)
    for i in 1:lLay
        for m in 1:size(weights[i], 1)
            bias[i][m] -= lR * dError[i][m]
            for n in 1:size(weights[i], 2)
                weights[i][m, n] -= lR * dError[i][m] * inputs[i][n]
            end
        end
    end
end
This code block takes 0.2 seconds on my computer, whereas all other parts of the NN together take less than 0.1 seconds. I tried to get better performance with @simd and @inbounds (roughly as sketched after the second version below), but that didn't achieve anything. Neither did changing the loop order from row-wise to column-wise:
function dosomething2(weights::Vector{Array{Float64,2}}, bias::Vector{Array{Float64,1}}, lR::Real, dError::Vector{Array{Float64,1}}, inputs::Vector{Array{Float64,1}})
    lLay = length(dError)
    for i in 1:lLay
        for m in 1:size(weights[i], 1)
            bias[i][m] -= lR * dError[i][m]
        end
    end
    for i in 1:lLay
        for n in 1:size(weights[i], 2)
            for m in 1:size(weights[i], 1)
                weights[i][m, n] -= lR * dError[i][m] * inputs[i][n]
            end
        end
    end
end
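For reference, the @simd/@inbounds attempt looked roughly like this (a sketch of the idea, not the exact code). Note that the inner loop walks along a row of weights[i], which is strided access in Julia's column-major arrays, so @simd has little contiguous memory to work with:

for i in 1:lLay
    @inbounds for m in 1:size(weights[i], 1)
        bias[i][m] -= lR * dError[i][m]
        # row-wise access is strided (stride = size(weights[i], 1)),
        # so vectorization gains are minimal here
        @simd for n in 1:size(weights[i], 2)
            weights[i][m, n] -= lR * dError[i][m] * inputs[i][n]
        end
    end
end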
Does it help if, rather than using a Vector{Vector{Float64}}, you use a Matrix{Float64}? Every time you access a different inner vector you follow a pointer, which is probably slowing things down.
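As a minimal sketch of that layout change, assuming every layer has the same width (biasMat and dErrMat are hypothetical names): storing the per-layer bias vectors as columns of one Matrix turns the bias update into a single pass over contiguous memory.

nNeurons, nLayers = 100, 20
biasMat = rand(nNeurons, nLayers)    # was: [rand(nNeurons) for _ in 1:nLayers]
dErrMat = rand(nNeurons, nLayers)
lR = 0.1
biasMat .-= lR .* dErrMat            # one fused, allocation-free update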
If you want to keep your data structured the same way, you could lift the pointer accesses out of the loop, e.g. convert
for m in 1:size(weights[i], 1)
    bias[i][m] -= lR * dError[i][m]
    for n in 1:size(weights[i], 2)
        weights[i][m, n] -= lR * dError[i][m] * inputs[i][n]
    end
end
to
w = weights[i]
b = bias[i]
d = dError[i]
inp = inputs[i]   # NB: don't reuse the loop variable i for this
for m in 1:size(w, 1)
    b[m] -= lR * d[m]
    for n in 1:size(w, 2)
        w[m, n] -= lR * d[m] * inp[n]
    end
end
I get a 5x improvement over the second version with the code below:
function dosomething2(weights::Vector{Array{Float64,2}}, bias::Vector{Array{Float64,1}}, lR::Real, dError::Vector{Array{Float64,1}}, inputs::Vector{Array{Float64,1}})
    lLay = length(dError)
    @inbounds for i = 1:lLay
        biasi = bias[i]
        dErrori = dError[i]
        for m = 1:size(biasi, 1)
            biasi[m] -= lR * dErrori[m]
        end
    end
    @inbounds for i = 1:lLay
        weightsi = weights[i]
        dErrori = dError[i]
        inputsi = inputs[i]
        for n = 1:size(weightsi, 2)
            for m = 1:size(weightsi, 1)
                weightsi[m, n] -= lR * dErrori[m] * inputsi[n]
            end
        end
    end
end
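For comparison, the same two updates can also be written with fused broadcasting, keeping the hoisted-array benefit without explicit loops. This is an untimed sketch; it assumes using LinearAlgebra so that inputs[i]' is the vector adjoint:

using LinearAlgebra

for i in 1:lLay
    bias[i] .-= lR .* dError[i]
    # dError[i] .* inputs[i]' broadcasts to a rank-1 (outer-product) update;
    # the dotted assignment fuses everything into one allocation-free pass
    weights[i] .-= lR .* dError[i] .* inputs[i]'
end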
julia> function dosomething(weights::Vector{Array{Float64,2}}, bias::Vector{Array{Float64,1}}, lR::Real, dError::Vector{Array{Float64,1}}, inputs::Vector{Array{Float64,1}})
           lLay = length(dError)
           for i in 1:lLay
               for m in 1:size(weights[i], 1)
                   bias[i][m] -= lR * dError[i][m]
                   for n in 1:size(weights[i], 2)
                       weights[i][m, n] -= lR * dError[i][m] * inputs[i][n]
                   end
               end
           end
       end
dosomething (generic function with 1 method)
julia> using LinearAlgebra

julia> function dosomething2(weights::Vector{Array{Float64,2}}, bias::Vector{Array{Float64,1}}, lR::Real, dError::Vector{Array{Float64,1}}, inputs::Vector{Array{Float64,1}})
           lLay = length(dError)
           for i in 1:lLay
               BLAS.axpy!(-lR, dError[i], bias[i])              # bias[i] .-= lR .* dError[i]
               BLAS.ger!(-lR, dError[i], inputs[i], weights[i]) # rank-1 weight update
           end
       end
dosomething2 (generic function with 1 method)
julia> w = [rand(100, 100) for _ in 1:20]; b = [rand(100) for _ in 1:20];
julia> i = [rand(100) for _ in 1:20]; e = [rand(100) for _ in 1:20];
julia> using BenchmarkTools
julia> @btime dosomething($w, $b, .1, $e, $i)
430.861 μs (0 allocations: 0 bytes)
julia> @btime dosomething2($w, $b, .1, $e, $i)
56.379 μs (0 allocations: 0 bytes)
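The speedup makes sense: BLAS.axpy!(a, x, y) computes y .= a .* x .+ y, and BLAS.ger!(α, x, y, A) performs the rank-1 update A ← α*x*y' + A, which is exactly the nested loop from the question, done by an optimized kernel. A quick sanity check (A, B, x, y are throwaway names):

using LinearAlgebra

A = rand(3, 3); x = rand(3); y = rand(3)
B = copy(A)
BLAS.ger!(-0.1, x, y, A)      # rank-1 update by BLAS
for n in 1:3, m in 1:3        # the same update, written out as scalar loops
    B[m, n] -= 0.1 * x[m] * y[n]
end
@assert A ≈ B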
Btw., for neural networks in Julia you may also be interested in looking at Flux or Knet.
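As a taste of the library route, here is a minimal Flux sketch (assuming a recent Flux version; the layer sizes, data, and learning rate are arbitrary placeholders):

using Flux

model = Chain(Dense(100 => 50, relu), Dense(50 => 10))  # arbitrary shape
x, y = rand(Float32, 100), rand(Float32, 10)            # dummy sample
opt = Flux.setup(Descent(0.1), model)                   # plain gradient descent

# one training step: differentiate the loss, then update the weights in place
grads = Flux.gradient(m -> Flux.Losses.mse(m(x), y), model)
Flux.update!(opt, model, grads[1])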