 # My own Feedforward Neural Network library :-)

I am learning ML, so I created this toy FNN library… Surely is not optimised like in real ML libraries, but it is already pretty flexible and its simplicity may come on hand in implementing custom solutions…

``````
using Random

## Some utility functions..
import Base.reshape
"""
reshape(myNumber, dims..) - Reshape a number as a n dimensional Array
"""
function reshape(x::T, dims...) where {T <: Number}
x = [x]
reshape(x,dims)
end
function makeColVector(x::T) where {T <: Number}
return [x]
end
function makeColVector(x::T) where {T <: AbstractArray}
reshape(x,length(x))
end
function makeRowVector(x::T) where {T <: Number}
return [x]'
end
function makeRowVector(x::T) where {T <: AbstractArray}
reshape(x,1,length(x))
end

"""
Layer

Representation of a layer in the network

# Fields:
* `w`:  Weigths matrix with respect to the input from previous layer or data (n pr. layer x n)
* `wb`: Biases (n)
* `f`:  Activation function
* `df`: Derivative of the activation function
"""
mutable struct Layer
w::Array{Float64,2}
wb::Array{Float64,1}
f::Function
df::Function
end

"""
FNN

Representation of a Forward Neural Network

# Fields:
* `layers`:  Array of layers objects
* `cf`:      Cost function
* `dcf`:     Derivative of the cost function
* `trained`: Control flag for trained networks
"""
mutable struct FNN
layers::Array{Layer,1}
cf::Function
dcf::Function
trained::Bool
end

"""
buildLayer(f,df,n,nₗ;w,wb)

Instantiate a new layer

Parameters:
* `f`:  Activation function
* `df`: Derivative of the activation function
* `n`:  Number of nodes
* `nₗ`: Number of nodes of the previous layer
* `w`:  Initial weigths with respect to input [default: `rand(nₗ,n)`]
* `wb`: Initial weigths with respect to bias [default: `rand(n)`]

"""
function buildLayer(f,df,n,nₗ;w=rand(nₗ,n),wb=rand(n))
# To be sure w is a matrix and wb a column vector..
w  = reshape(w,nₗ,n)
wb = reshape(wb,n)
return Layer(w,wb,f,df)
end

"""
buildNetwork

Instantiate a new Feedforward Neural Network

Parameters:
* `layers`:  Array of layers objects
* `cf`:      Cost function
* `dcf`:     Derivative of the cost function

# Notes:
* Even if the network ends with a single output note, the cost function and its
derivative should always expect y and ŷ as column vectors.
"""
function buildNetwork(layers,cf,dcf)
return FNN(layers,cf,dcf,false)
end

"""
predict(layer,x)

Layer prediction of a single data point

# Parameters:
* `layer`:  Worker layer
* `x`:      Input to the layer
"""
function predict(layer::Layer,x)
return layer.f.((reshape(x,1,length(x))*layer.w)' + layer.wb)
end

"""
predict(fnn,x)

Network prediction of a single data point

# Parameters:
* `fnn`:  Worker network
* `x`:    Input to the network
"""
function predict(fnn::FNN,x)
makeColVector(x)
values = x
for l in fnn.layers
values = predict(l,values)
end
return values
end

"""
error(fnn,x,y)

Compute network loss on a single data point

# Parameters:
* `fnn`: Worker network
* `x`:   Input to the network
* `y`:   Label input
"""
function error(fnn::FNN,x,y)
x = makeColVector(x)
y = makeColVector(y)
ŷ = predict(fnn,x)
return fnn.cf(ŷ,y)
end

"""
errors(fnn,x,y)

Compute avg. network loss on a test set

# Parameters:
* `fnn`: Worker network
* `x`:   Input to the network (n x d)
* `y`:   Label input (n) or (n x d)
"""
function errors(fnn::FNN,x,y)
fnn.trained ? "" : @warn "Seems you are trying to test a neural network that has not been trained. Use first `train!(rnn,x,y)`"
ϵ = 0
for i in 1:size(x)
xᵢ = x[i,:]'
yᵢ = y[i,:]'
ϵ += error(fnn,xᵢ,yᵢ)
end
return ϵ/size(x)
end

"""
getW(fnn)

Retrieve current weigthts

# Parameters:
* `fnn`: Worker network

# Notes:
* The output is a vector of tuples of each layer's input weigths and bias weigths
"""
function getW(fnn)
w = Tuple{Array{Float64,2},Array{Float64,1}}[]
for l in fnn.layers
push!(w,(l.w,l.wb))
end
return w
end

"""
getDW(fnn,x,y)

Retrieve the current gradient of the weigthts (i.e. derivative of the cost with respect to the weigths)

# Parameters:
* `fnn`: Worker network
* `x`:   Input to the network
* `y`:   Label input

#Notes:
* The output is a vector of tuples of each layer's input weigths and bias weigths
"""
function getDW(fnn,x,y)
x = makeColVector(x)
y = makeColVector(y)
lz = Array{Float64,1}[]
lo = Array{Float64,1}[]
dW = Tuple{Array{Float64,2},Array{Float64,1}}[]

push!(lz,x)
push!(lo,x)

for l in fnn.layers
x = lo[end]
z = dropdims((reshape(x,1,length(x))*l.w)' + l.wb,dims=2)
o = l.f.(z)
push!(lz, z)
push!(lo, o)
end
dc = fnn.dcf(lo[end],y)
δ = dc # derivative of the cost function with respect to the layer output

# backpropagation step
for lidx in length(fnn.layers):-1:1
l = fnn.layers[lidx]
# Note that lz and lo vectors includes x, so the second layer is the third element in the vector
dwb = l.df.(lz[lidx+1]) .* δ # derivative with respect to the layer biases
dw = lo[lidx] * dwb'         # derivative with respect to the layer input weigths
push!(dW,(dw,dwb))
# Computing derivatives of the cost function with respect of the output of the previous layer
δ = l.w * dwb
end
return dW[end:-1:1] # reversing it, to start from the first layer
end

"""
updateWeights!(fnn,w)

Update weigths of the network

# Parameters:
* `fnn`: Worker network
* `w`:   The new weights to set
"""
function updateWeights!(fnn,w)
for lidx in 1:length(fnn.layers)
fnn.layers[lidx].w = w[lidx]
fnn.layers[lidx].wb = w[lidx]
end
end

"""
train!(fnn,x,y;epochs,η,rshuffle)

Train a fnn with the given x,y data

# Parameters:
* `fnn`:      Worker network
* `x`:        Training input to the network (records x dimensions)
* `y`:        Label input (records)
* `epochs`:   Number of passages over the training set [def = `1000`]
* `η`:        Learning rate. If not provided 1/(1+epoch) is used [def = `nothing`]
* `rshuffle`: Whether to random shuffle the training set at each epoch [def = `true`]
"""
function train!(fnn,x,y;epochs=1000, η=nothing, rshuffle=true)
logStep = Int64(ceil(epochs/100))
dyn_η = η == nothing ? true : false
for t in 1:epochs
if rshuffle
# random shuffle x and y
ridx = shuffle(1:size(x))
x = x[ridx, :]
y = y[ridx , :]
end
ϵ = 0
η = dyn_η ? 1/(1+t) : η
for i in 1:size(x)
xᵢ = x[i,:]'
yᵢ = makeColVector(y[i])
w  = getW(fnn)
dW = getDW(fnn,xᵢ,yᵢ)
for (lidx,l) in enumerate(fnn.layers)
l.w  = l.w -  η .* dW[lidx]
l.wb = l.wb - η .* dW[lidx]
end
ϵ += error(fnn,xᵢ,yᵢ)
end
(t % logStep == 0) || t == 1 || t == epochs ? println("Avg. error after epoch \$t : \$(ϵ/size(x))") : ""
end
fnn.trained = true
end

# ==================================
# Specific implementation - FNN definition
# ==================================

# Defining the functions we fill use as activation function as well their derivatives
# (yes, we could have used instead an automatic differentiation - AD - library..)
relu(x)     = max(0,x)
drelu(x)    = x <= 0 ? 0 : 1
linearf(x)  = x
dlinearf(x) = 1
cost(ŷ,y)   = (1/2)*(y-ŷ)^2
dcost(ŷ,y)  = [- (y-ŷ)]

l1 = buildLayer(relu,drelu,3,2,w=[1 1 1;1 1 1],wb=[0,0,0])
l2 = buildLayer(linearf,dlinearf,1,3,w=[1,1,1],wb=0)
myfnn = buildNetwork([l1,l2],cost,dcost)

# ==================================
# Usage of the FNN
# ==================================

xtrain = [2 1; 3 3; 4 5; 6 6]
ytrain = [10,21,32,42]
ytrain = [14,21,28,42]
xtest  = [1 1; 2 2; 3 3; 5 5; 10 10]
ytest  = [7,14,21,35,70]

train!(myfnn,xtrain,ytrain,epochs=10,η=0.001,rshuffle=false) # 1.86
errors(myfnn,xtest,ytest) # 0.108

dtanh(x)    = 1-tanh(x)^2
l1 = buildLayer(tanh,dtanh,3,2)
l2 = buildLayer(linearf,dlinearf,1,3)
myfnn2 = buildNetwork([l1,l2],cost,dcost)

train!(myfnn2,xtrain,ytrain,epochs=10000,η=0.001,rshuffle=false) # 0.011
errors(myfnn2,xtest,ytest) # 76.9
``````
8 Likes

Cool! Now toss it into a package, write some tests, and get it registered 7 Likes

There is an error there… I am gonna correct it… I am clearly random shufling X and Y… in a different step… (CORRECTED)

Hi @sylvaticus What a nice NN example!. I have a question regarding updateWeights! function, as I could not find where it is applyed. It seems that no other function (train!, or any other) uses it. Am I correct? What is it for? Thanks in advance and I hope you are doing Ok.

Hi there.
You are right, I used that function just to get some debugging information while developing `train!()`, it can be removed.

Indeed I would like to refactor it such that the get/set weigths and backpropagation are all done at the layer level, so that I can rewrite the layer as AbstractLayer and what I have here is just a fullyConnected layer. It will then be easy to add other layer types, e.g. convolutional, pooling layers or just normal layers without the bias parameter…

By the way I am following the MITx 6.86x - Machine Learning with Python: from Linear Models to Deep Learning course, and this was one of the subject.

I reimplemented everything in Julia and created two repositories:

• Learning Machine Learning. In Julia where I am hosting the code: neural networks (this code), K-Means,K-Menoids,Expectation-Maximisation algorithm for Gaussian Mixture Models and Collaborative Filtering (plus stuff I have done earlier and need just to move in the repository, like Perceptron and Supporting Vector Machine)
• full notes of the MITx 6.86x course, the companion repository where to get the theoretical explanation of the algorithms

Again, the objective is not to create yet an other machine learning package, there are excellent ones already in Julia (and I surely not have the competence to create one), but just to have some easy to understand examples of the basic algorithms employed.

2 Likes

Hi @sylvaticus Thanks for the quick response. It was very helpful. Based on what you mentioned, I continued inspecting your program (as I am currently trying to understand the NN basics) and some details were brought my attention:

1. What is the reason you multiply xW on the line `z = dropdims((reshape(x,1,length(x))*l.w)' + l.wb,dims=2)` of function getDW?. It is my understanding that literature defines the linear step as z=Wx+b (right multiplication instead of left). Despite in this case, dimensions are set to match, derivatives for (Wx) are different than those for (xW), which are important for backpropagation. I am probably not gettingthis right…Do you have any insights on this matter?

2. On the example, you do:

``````l1 = buildLayer(tanh,dtanh,3,2)
l2 = buildLayer(linearf,dlinearf,1,3)

myfnn2 = buildNetwork([l1,l2],cost,dcost)
``````

Here, you made an array [l1,l2] of type `Array{Layer,1}`. Is there any way I can create an empty array of type `Layer` and pushes some elements into it?. For instace, something like this…

``````l=Array{Layer,1}[];
for j=1:Nₗ-1;
curr_lay = buildLayer(relu,drelu,lay_dim[j+1],lay_dim[j]);
push!(l,curr_lay)
end
``````

I have all sort of issues putting elements curr_lay (of type Layer) in an array…can’t figure out. How can I do this?

1. It just all depends on how you define the matrix W, if `(nFrom,nTo)` or `(nTo,nFrom)` where `nFrom` and `nTo` are the number of nodes in the previous and in the current layer.

If you use the first definition (as I do, I don’t know if the literature takes the other way as a standard) the input `x` has dimensions `nFrom` and you want your output to have dimensions `nTo`, so you use `(x' W)'` … maybe this is why in literature it is used the other definition… I’ll change it to be more clear (and save a bit of computations in reshaping the matrices) but it is obviously the same…

1. You have to consider the dimensions of your X for the first layer, and then you can go:
``````Nₗ      = 5
l       = Layer[];
lay_dim = [2,3,4,5,2]
dimX    = 5
for j=1:Nₗ;
prevDim  = (j == 1 ? dimX : lay_dim[j-1])
curr_lay = buildLayer(relu,drelu,lay_dim[j],prevDim);
push!(l,curr_lay)
end
``````

@sylvaticus . Thanks for the answer. Now its getting clearer. Btw, it would be great to arrange the program to do `(Wx)` instead of `(xW)`, but thats just matter of taste…

Again, I am very grateful for the response!. Cheers!

I updated the code to:

• changed weigths matrices from (from,to) to (to,from) as to allow Wx instead of (x’W)’
• generalised the code so that different kind of layers (each with different activaton functions) can be implemented. By default we have FullyConnectedLayer and NoBiasLayer, but one can introduce its own layer type implementing a few methods
• moved utility functions in a separate file
1 Like

Sylvaticus (agaricus)- Copied and pasted your original program (from link in ‘Medium’ - and lo and behold it ran right after precompilation. Impressive! And thank you - it is a very good framework. After downloading the updated version from github - it cannot pass ‘abstract type Layer end’ @ line 56 w/o “ERROR: LoadError: invalid redefinition of constant Layer.” I’ve Julia 1.4.1 - I realize you’re busy … and I may continue to use your version that worked for me … but have you any idea why this happening? I saw your new book - it looks like a lot of detail. Keep up the good work - Mike Williams

That is just because you must already have an “object” with that name in memory.
Just restart Julia, you should not have that error any more.

1 Like

By the way, if you liked the book, you could write a review in Amazon… still I don’t have any :-/

Thank you - I’m using Juno and had another pkg open that conflicted. Not sure if you use an IDE or stick to the REPL but in Juno there were a few errors in # Individual components debugging stuff:
l1 = FullyConnectedLayer(relu,drelu,2,3,w=[1 2; -1 -2; 3 -3],wb=[1,-1,0]) - threw:
"ERROR: LoadError: MethodError: no method matching FullyConnectedLayer(::typeof(relu), ::typeof(drelu), ::Int64, ::Int64; w=[1 2; -1 -2; 3 -3], wb=[1, -1, 0])
Closest candidates are:
FullyConnectedLayer(::Any, ::Any, ::Any; w, wb, df) at /home/lazer/.julia/dev/lmlj.jl-master/src/nn.jl:92
And a similar error with:
l2 = NoBiasLayer(linearf,dlinearf,3,2,w=[1 2 3; -1 -2 -3]) — And

ERROR: LoadError: DimensionMismatch(“dimensions must match: a has dims (Base.OneTo(2),), b has dims (Base.OneTo(1),), mismatch at 1”)
Stacktrace:
 squaredCost(::Array{Float64,1}, ::Array{Int64,1}) at /home/lazer/.julia/dev/lmlj.jl-master/src/utilities.jl:34
And:
€_do1 = backward(l2,d€_do2,o1) @ line 575 — which threw:
“ERROR: LoadError: DimensionMismatch(“second dimension of A, 1, does not match length of x, 2”)”
Stacktrace:"

“ERROR: LoadError: MethodError: no method matching buildNetwork(::Array{FullyConnectedLayer,1}, ::typeof(squaredCost), ::typeof(dSquaredCost))
Closest candidates are:
buildNetwork(::Any, ::Any; dcf, name) at /home/lazer/.julia/dev/lmlj.jl-master/src/nn.jl:279”

“ERROR: LoadError: DimensionMismatch(“dimensions must match: a has dims (Base.OneTo(2),), b has dims (Base.OneTo(1),), mismatch at 1”)
Stacktrace:
 promote_shape at ./indices.jl:178 [inlined]
 promote_shape at ./indices.jl:169 [inlined]
 -(::Array{Int64,1}, ::Array{Float64,1}) at ./arraymath.jl:38
 squaredCost(::Array{Float64,1}, ::Array{Int64,1}) at /home/lazer/.julia/dev/lmlj.jl-master/src/utilities.jl:34
 error(::NN, ::Array{Int64,1}, ::Array{Int64,1}) at /home/lazer/.julia/dev/lmlj.jl-master/src/nn.jl:312”

Line numbers may be tad different because I added a few comments. These is just an FYI - I’m still getting used to Julia and need to pay my dues - and I’ll probably used similar test methods.

As for your book: I plan on purchasing it in the next few days and will be happy to leave a review - though perhaps from a simpler perspective. All the best - Mike

Thank you, that code under “debugging stuff” was not updated with the latest API. If you notice the definition of the layer constructor it is now `FullyConnectedLayer(f,n,nₗ;w,wb,df)` as I introduced automatic differentiation using the Zygote package, so the old positional parameter of the derivative of the activation function is now an optional parameter (in julia all function parameters before the semicolon `;` are positional arguments, while those after it are keywork arguments.

The correct call for that example is then `l1 = FullyConnectedLayer(relu,2,3,w=[1 2; -1 -2; 3 -3],wb=[1,-1,0],df=drelu)` if you want to provide manual derivative or just `l1 = FullyConnectedLayer(relu,2,3,w=[1 2; -1 -2; 3 -3],wb=[1,-1,0])` if you are happy to use AD.

I updated the GIT repository with the correct call. I am still heavily refactoring the code, this is why I didn’t yet created a Julia package (I’ll do when things will be a bit more stable, but again the scope is not those to create yet an other machine learning library - flux and knet are great, just to learn myself the most common algorithms).

Thanks much for the clarification - I hadn’t researched the Zygote pkg yet - but saw a comment to you about AD. I had planned on rerunning with mods to the methods today. You seem to be understanding many concepts rapidly - and are much more adept at Julia than myself. It seems a good choice for me coming from C++/Java - and seems to offer alternatives to satisfy my projects embedded CPU/GPU arch and concurrency requirements. Time will tell. I appreciate your help - BTW I purchased your book from Amazon today. Take care - Mike

I think Julia is a good choice for ML… one very nice thing is that, contrary to Python, the mayor ML libraries in Julia (like Flux or Knet) are written in Julia itself… so if you master the language you can not just “use” them but adapt them to your own project (see also my post on Medium on this).

PS: Thank you for the book purchase, but I hope you didn’t feel obliged… we aren’t here to sell books…

1 Like

The code started in this thread advanced a fair amount, and I am almost there to release it as a package.

This stuff most likely has value only didactically, as the approaches are the “vanilla” ones, i.e. the simplest possible ones, and GPU is not supported here.
For “serious” machine learning work in Julia I suggest to use either Flux or Knet.

As the focus is mainly didactic, functions have pretty longer but more explicit names than usual… for example the `Dense` layer is a `DenseLayer`, the `RBF` kernel is `radialKernel`, etc.

That said, Julia is a relatively fast language and most hard job is done in multithreaded functions or using matrix operations whose underlying libraries are multithreaded, so it is reasonably fast for small exploratory tasks. Also it is already very flexible. For example, one can implement its own layer as a subtype of the abstract type `Layer` or its own optimisation algorithm as a subtype of `OptimisationAlgorithm`.

Here is an example from the iris RDataset:

#### Using an Artificial Neural Network for multinomial categorisation

``````# Load Modules
using Bmlt.Nn, DelimitedFiles, Random, StatsPlots # Load the main module and ausiliary modules
Random.seed!(123); # Fix the random seed (to obtain reproducible results)

iris     = iris[shuffle(axes(iris, 1)), :] # Shuffle the records, as they aren't by default
x        = convert(Array{Float64,2}, iris[:,1:4])
y        = map(x->Dict("setosa" => 1, "versicolor" => 2, "virginica" =>3)[x],iris[:, 5]) # Convert the target column to numbers
y_oh     = oneHotEncoder(y) # Convert to One-hot representation (e.g. 2 => [0 1 0], 3 => [0 0 1])

# Split the data in training/testing sets
ntrain    = Int64(round(size(x,1)*0.8))
xtrain    = x[1:ntrain,:]
ytrain    = y[1:ntrain]
ytrain_oh = y_oh[1:ntrain,:]
xtest     = x[ntrain+1:end,:]
ytest     = y[ntrain+1:end]

# Define the Artificial Neural Network model
l1   = DenseLayer(4,10,f=relu) # Activation function is ReLU
l2   = DenseLayer(10,3)        # Activation function is identity by default
l3   = VectorFunctionLayer(3,3,f=softMax) # Add a (parameterless) layer whose activation function (softMax in this case) is defined to all its nodes at once
mynn = buildNetwork([l1,l2,l3],squaredCost,name="Multinomial logistic regression Model Sepal") # Build the NN and use the squared cost (aka MSE) as error function

# Training it (default to SGD)
res = train!(mynn,scale(xtrain),ytrain_oh,epochs=100,batchSize=6) # Use optAlg=SGD (Stochastic Gradient Descent) by default

# Test it
ŷtrain        = predict(mynn,scale(xtrain))   # Note the scaling function
ŷtest         = predict(mynn,scale(xtest))
trainAccuracy = accuracy(ŷtrain,ytrain,tol=1) # 0.983
testAccuracy  = accuracy(ŷtest,ytest,tol=1)   # 1.0

# Visualise results
testSize = size(ŷtest,1)
ŷtestChosen =  [argmax(ŷtest[i,:]) for i in 1:testSize]
groupedbar([ytest ŷtestChosen], label=["ytest" "ŷtest (est)"], title="True vs estimated categories") # All records correctly labelled !
plot(0:res.epochs,res.ϵ_epochs, ylabel="epochs",xlabel="error",legend=nothing,title="Avg. error per epoch on the Sepal dataset")
``````  1 Like

@sylvaticus Its me again!.
I have installed the under-development (in fact, the “stable package”) package BetaML. The example runs smoothly!. Now, I have been trying to use the old version and I was struggling to get a good prediction over a simple training data set of my own invention. I generated.

``````xtrain = pi*rand(10000,5);
ytrain = sin.(xtrain) + 0.5 * cos.(xtrain);
``````

So, actually, it represents a combination of basic trigonometric functions. I was then trying to test it with a subset like the one below…

``````xtest = pi*rand(1,5);
ytest = sin.(xtest) + 0.5 * cos.(xtest);
``````

and I get in green, the training data.
in red, the testing data,
in blue: the prediction. (obtained with `Predict(myfnn,xtest)`)

As you can see, its not good!. I am probably doing something wrong

My question is, can yo run this example using the new version, so I can see how it is used?.