Test accuracy decreases during training iterations - machine-learning

In my neural network model, the test accuracy decreases over the iterations. I have checked the learning rate and tuned it smaller, but the test accuracy keeps decreasing rather than oscillating, so I don't think the learning rate is the cause of the problem.
I use the tempotron learning rule and work on the Iris dataset, with 100 training samples and 50 testing samples.
I have checked my code; the test accuracy does increase at the beginning, so I think the learning rule does update the weights.
But I can't figure out why the performance decreases after that.
Does anyone have any ideas?
Thanks.
[Plot: testing accuracy per iteration]
for Iterate = 1:iteration %% Run 100 times
%% Test the correct rate each time
correct = 0;
for test_sample = 1:length(test)
% In each iteration, T = 100ms
for t = 1:T
for neuron = 1:neurons %% Response function for 48 neurons at time t
Response(neuron) = K(t,test(test_sample,neuron));
end
% Calculate PSP
for j = 1:3
V(j,t) = Response*weight(:,j) + V_rest;
end
end
%% find t_max: first index that V cross threshold
for j = 1:3
for timing = 1:T
if V(j,timing) >= threshold
t_max(j) = timing;
Max_state(j) = V(j,timing);
break;
end
end
V(j,t_max(j):end) = V(j,t_max(j)).*exp(-(Time(t_max(j):end)-Time(t_max(j)))/Tou_m);
end
[~,output_class] = min(t_max);
if output_class == test_target(test_sample)
correct = correct + 1;
end
end
correct_rate(Iterate) = correct/(length(test));
if Iterate > 1
if correct_rate(Iterate) < correct_rate(Iterate-1)
fprintf('Correct rate decrease\n');
%break;
end
end
%% Training
for samples = 1:size(InputSpike,1) %% Training samples for each iteration
% In each iteration, T = 100ms
for t = 1:T
for neuron = 1:neurons %% Response function for 48 neurons at time t
Response(neuron) = K(t,InputSpike(samples,neuron));
end
% Calculate PSP
for j = 1:3
V(j,t) = Response*weight(:,j) + V_rest;
end
end
%% find t_max: first index that V cross threshold
for j = 1:3
for timing = 1:T
if V(j,timing) >= threshold
t_max(j) = timing;
Max_state(j) = V(j,timing);
break;
end
end
V(j,t_max(j):end) = V(j,t_max(j)).*exp(-(Time(t_max(j):end)-Time(t_max(j)))/Tou_m);
end
[~,output_class] = min(t_max);
%% weight modify when error occurs
if train_target(samples) ~= output_class
for j = 1:3
if j == train_target(samples) %% error in target neuron
if Max_state(j) < threshold %% if P+ error occurs
for i = 1:neurons
%% for all t_i < t_max
if InputSpike(samples,i) < t_max(j)
%% weight modified
weight(i,j) = weight(i,j) + ...
lr*K(t_max(j),InputSpike(samples,i));
end
end
end
elseif j ~= train_target(samples) %% error on other 2 output neurons
if Max_state(j) >= threshold %% if P- error occurs
for i = 1:neurons
%% for all t_i < t_max
if InputSpike(samples,i) < t_max(j)
%% weight modified
weight(i,j) = weight(i,j) - ...
lr*K(t_max(j),InputSpike(samples,i));
end
end
end
end
end
%% for neurons that fired but weaker than target neuron
elseif train_target(samples) == output_class
for j = 1:3
if j ~= train_target(samples) %% other 2 output neurons
if Max_state(j) >= threshold
for i = 1:neurons %% P- error occurs
%% for all t_i < t_max
if InputSpike(samples,i) < t_max(j)
%% weight modified
weight(i,j) = weight(i,j) - ...
lr*K(t_max(j),InputSpike(samples,i));
end
end
end
end
end
end
end
end

You should enlarge your training dataset to avoid overfitting. You can also try increasing the number of training epochs.
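The commented-out break in the test loop above is already halfway to another common remedy: early stopping, i.e. keeping the weights from the best-scoring iteration rather than the last one. A minimal sketch of that bookkeeping (in Python just for brevity; train_one_iteration and test_accuracy are hypothetical stand-ins for the training and testing loops above):

import random

# Hypothetical placeholders for the tempotron training pass and the test-accuracy loop above.
def train_one_iteration(weights):
    return [w + random.gauss(0, 0.01) for w in weights]

def test_accuracy(weights):
    return random.random()

weights = [0.0] * 48                               # 48 input neurons, as in the question
best_accuracy, best_weights = 0.0, list(weights)
for iterate in range(100):                         # 100 iterations, as in the question
    weights = train_one_iteration(weights)
    accuracy = test_accuracy(weights)
    if accuracy > best_accuracy:
        # keep the best-performing weights rather than the most recent ones
        best_accuracy, best_weights = accuracy, list(weights)
print(best_accuracy)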

Related

Flux.jl : Customizing optimizer

I'm trying to implement a gradient-free optimizer function to train convolutional neural networks with Julia using Flux.jl. The reference paper is this: https://arxiv.org/abs/2005.05955. This paper proposes RSO, a gradient-free optimization algorithm that updates a single weight at a time on a sampling basis. The pseudocode of this algorithm is depicted in the picture below.
[Image: RSO optimizer pseudocode from the paper]
I'm using MNIST dataset.
function train(; kws...)
args = Args(; kws...) # collect options in a struct for convenience
if CUDA.functional() && args.use_cuda
@info "Training on CUDA GPU"
CUDA.allowscalar(false)
device = gpu
else
@info "Training on CPU"
device = cpu
end
# Prepare datasets
x_train, x_test, y_train, y_test = getdata(args, device)
# Create DataLoaders (mini-batch iterators)
train_loader = DataLoader((x_train, y_train), batchsize=args.batchsize, shuffle=true)
test_loader = DataLoader((x_test, y_test), batchsize=args.batchsize)
# Construct model
model = build_model() |> device
ps = Flux.params(model) # model's trainable parameters
best_param = ps
if args.optimiser == "SGD"
# Regular training step with SGD
elseif args.optimiser == "RSO"
# Run RSO function and update ps
best_param .= RSO(x_train, y_train, args.RSOupdate, model, args.batchsize, device)
end
And the corresponding RSO function:
function RSO(X,L,C,model, batch_size, device)
"""
model = convolutional model structure
X = Input data
L = labels
C = Number of rounds to update parameters
W = Weight set of layers
Wd = Weight tensors of layer d that generates an activation
wid = weight tensor that generates an activation aᵢ
wj = a weight in wid
"""
# Normalize input data to have zero mean and unit standard deviation
X .= (X .- sum(X))./std(X)
train_loader = DataLoader((X, L), batchsize=batch_size, shuffle=true)
#println("model = $(typeof(model))")
std_prep = []
σ_d = Float64[]
D = 1
for layer in model
D += 1
Wd = Flux.params(layer)
# Initialize the weights of the network with Gaussian distribution
for id in Wd
wj = convert(Array{Float32, 4}, rand(Normal(0, sqrt(2/length(id))), (3,3,4,4)))
id = wj
append!(std_prep, vec(wj))
end
# Compute std of all elements in the weight tensor Wd
push!(σ_d, std(std_prep))
end
W = Flux.params(model)
# Weight update
for _ in 1:C
d = D
while d > 0
for id in 1:length(W[d])
# Randomly sample change in weights from Gaussian distribution
for j in 1:length(w[d][id])
# Randomly sample mini-batch
(x, l) = train_loader[rand(1:length(train_loader))]
# Sample a weight from normal distribution
ΔWj[d][id][j] = rand(Normal(0, σ_d[d]), 1)
loss, acc = loss_and_accuracy(data_loader, model, device)
W = argmin(F(x,l, W+ΔWj), F(x,l,W), F(x,l, W-ΔWj))
end
end
d -= 1
end
end
return W
end
The problem here is the second block of the RSO function. I'm trying to evaluate the loss after changing a single weight in three scenarios, namely F(w, l, W+gW), F(w, l, W), F(w, l, W-gW), and choose the weight set with the minimum loss. But how do I do that using Flux.jl? The loss function I'm trying to use is logitcrossentropy(ŷ, y, agg=sum). In order to generate ŷ, we should use model(W), but changing a single weight parameter in Zygote.Params() form is already challenging....
Based on the paper you shared, it looks like you need to change the weight arrays for each output neuron in each layer. Unfortunately, this means that the implementation of your optimization routine is going to depend on the layer type, since an "output neuron" for a convolution layer is quite different from one in a fully-connected layer. In other words, just looping over Flux.params(model) is not going to be sufficient, since this is just a set of all the weight arrays in the model, and each weight array is treated differently depending on which layer it comes from.
Fortunately, Julia's multiple dispatch does make this easier to write if you use separate functions instead of a giant loop. I'll summarize the algorithm using the pseudo-code below:
for layer in model
for output_neuron in layer
for weight_element in parameters(output_neuron)
weight_element = sample(N(0, sqrt(2 / num_outputs(layer))))
end
end
sigmas[layer] = stddev(parameters(layer))
end
for c in 1 to C
for layer in reverse(model)
for output_neuron in layer
for weight_element in parameters(output_neuron)
x, y = sample(batches)
dw = N(0, sigmas[layer])
# optimize weights
end
end
end
end
It's the for output_neuron ... portions that we need to isolate into separate functions.
In the first block, we don't actually do anything different for each weight_element; they are all sampled from the same normal distribution. So we don't actually need to iterate over the output neurons, but we do need to know how many there are.
using Statistics: std
# this function will set the weights according to the
# normal distribution and the number of output neurons
# it also returns the standard deviation of the weights
function sample_weight!(layer::Dense)
sample = randn(eltype(layer.weight), size(layer.weight))
num_outputs = size(layer.weight, 1)
# notice the "." notation which is used to mutate the array
layer.weight .= sample .* sqrt(2 / num_outputs)
return std(layer.weight)
end
function sample_weight!(layer::Conv)
sample = randn(eltype(layer.weight), size(layer.weight))
num_outputs = size(layer.weight, 4)
# notice the "." notation which is used to mutate the array
layer.weight .= sample .* sqrt(2 / num_outputs)
return std(layer.weight)
end
sigmas = map(sample_weight!, model)
Now, for the second block, we will do a similar trick by defining different functions for each layer.
function optimize_layer!(loss, layer::Dense, data, sigma)
for i in 1:size(layer.weight, 1)
for j in 1:size(layer.weight, 2)
wj = layer.weight[i, j]
x, y = data[rand(1:length(data))]
dw = randn() * sigma
ws = [wj + dw, wj, wj - dw]
losses = zeros(Float32, length(ws))
for (k, w) in enumerate(ws)
layer.weight[i, j] = w
losses[k] = loss(x, y)
end
layer.weight[i, j] = ws[argmin(losses)]
end
end
end
function optimize_layer!(loss, layer::Conv, data, sigma)
for i in 1:size(layer.weight, 4)
# we use a view to reference the full kernel
# for this output channel
wid = view(layer.weight, :, :, :, i)
# each index lets us treat wid like a vector
for j in eachindex(wid)
wj = wid[j]
x, y = data[rand(1:length(data))]
dw = randn() * sigma
ws = [wj + dw, wj, wj - dw]
losses = zeros(Float32, length(ws))
for (k, w) in enumerate(ws)
wid[j] = w
losses[k] = loss(x, y)
end
wid[j] = ws[argmin(losses)]
end
end
end
for c in 1:C
for (layer, sigma) in reverse(zip(model, sigmas))
optimize_layer!(layer, data, sigma) do x, y
logitcrossentropy(model(x), y; agg = sum)
end
end
end
Notice that nowhere did I use Flux.params, which does not help us here. Also, Flux.params would include both the weight and the bias, and the paper doesn't look like it bothers with the bias at all. If you had an optimization method that generically optimized every parameter the same way regardless of layer type (i.e. like gradient descent), then you could use for p in Flux.params(model) ....
Thanks @darsnack :)
I found your answer a bit late, so in the meantime I had figured out my own script that works. It is admittedly a bit hardcoded, but could you also give feedback on it?
function RSO(train_loader, test_loader, C,model, batch_size, device, args)
"""
model = convolutional model structure
C = Number of rounds to update parameters (epochs)
batch_size = size of the mini batch that will be used to calculate loss
device = CPU or GPU
"""
# Evaluate initial weight
test_loss, test_acc = loss_and_accuracy(test_loader, model, device)
println("Initial Weight:")
println(" test_loss = $test_loss, test_accuracy = $test_acc")
random_batch = []
for (x, l) in train_loader
push!(random_batch, (x,l))
end
# Initialize weights
std_prep = []
σ_d = Float64[]
D = 0
for layer in model
D += 1
Wd = Flux.params(layer)
# Initialize the weights of the network with Gaussian distribution
for id in Wd
if typeof(id) == Array{Float32, 4}
wj = convert(Array{Float32, 4}, rand(Normal(0, sqrt(2/length(id))), size(id)))
elseif typeof(id) == Vector{Float32}
wj = convert(Vector{Float32}, rand(Normal(0, sqrt(2/length(id))), length(id)))
elseif typeof(id) == Matrix{Float32}
wj = convert(Matrix{Float32}, rand(Normal(0, sqrt(2/length(id))), size(id)))
end
id = wj
append!(std_prep, vec(wj))
end
# Compute std of all elements in the weight tensor Wd
push!(σ_d, std(std_prep))
end
# Weight update
for c in 1:C
d = D
# First update the weights of the layer closest to the labels
# and then sequentially move closer to the input
while d > 0
Wd = Flux.params(model[d])
for id in Wd
# Randomly sample change in weights from Gaussian distribution
for j in 1:length(id)
# Randomly sample mini-batch
(x, y) = rand(random_batch, 1)[1]
x, y = device(x), device(y)
# Sample a weight from normal distribution
ΔWj = rand(Normal(0, σ_d[d]), 1)[1]
# Weight update with three scenario
## F(x,l, W+ΔWj)
id[j] = id[j]+ΔWj
ŷ = model(x)
ls_pos = logitcrossentropy(ŷ, y, agg=sum) / size(x)[end]
## F(x,l,W)
id[j] = id[j]-ΔWj
ŷ = model(x)
ls_org = logitcrossentropy(ŷ, y, agg=sum) / size(x)[end]
## F(x,l, W-ΔWj)
id[j] = id[j]-ΔWj
ŷ = model(x)
ls_neg = logitcrossentropy(ŷ, y, agg=sum) / size(x)[end]
# Check weight update that gives minimum loss
min_loss = argmin([ls_org, ls_pos, ls_neg])
# Save weight update with minimum loss
if min_loss == 1
id[j] = id[j] + ΔWj
elseif min_loss == 2
id[j] = id[j] + 2*ΔWj
elseif min_loss == 3
id[j] = id[j]
end
end
end
d -= 1
end
train_loss, train_acc = loss_and_accuracy(train_loader, model, device)
test_loss, test_acc = loss_and_accuracy(test_loader, model, device)
track!(args.tracker, test_acc)
println("RSO Round=$c")
println(" train_loss = $train_loss, train_accuracy = $train_acc")
println(" test_loss = $test_loss, test_accuracy = $test_acc")
end
return Flux.params(model)
end

ReLU activation function outputs HUGE numbers

I have FINALLY been able to implement backpropagation, but there are still some bugs I need to fix. The main issue is the following: my ReLU activation function produces really big dJdW values (the derivative of the error function with respect to the weights). When these get subtracted from the weights, my output becomes a matrix of -inf or inf values. How do I stop this? As of now, the only solution I have is to make my learning rate scalar variable REALLY small.
import numpy as np
class Neural_Network(object):
def __init__(self, input_, hidden_, output_, numHiddenLayer_, numExamples_):
# Define Hyperparameters
self.inputLayerSize = input_
self.outputLayerSize = output_
self.hiddenLayerSize = hidden_
self.numHiddenLayer = numHiddenLayer_
self.numExamples = numExamples_
self.learningRate = 0.000000001 # LEARNING RATE: Why does ReLU produce such large dJdW values?
self.weightDecay = 0.5
# in -> out
self.weights = [] # stores matrices of each layer of weights
self.z = [] # stores matrices of each layer of weighted sums
self.a = [] # stores matrices of each layer of activity
self.biases = [] # stores all biases
# Biases are matrices that are added to activity matrix
# Dimensions -> numExamples_*hiddenLayerSize or numExamples_*outputLayerSize
for i in range(self.numHiddenLayer):
# Biases for hidden layer
b = [np.random.random() for x in range(self.hiddenLayerSize)];
B = [b for x in range(self.numExamples)];
self.biases.append(np.mat(B))
# Biases for output layer
b = [np.random.random() for x in range(self.outputLayerSize)]
B = [b for x in range(self.numExamples)];
self.biases.append(np.mat(B))
# Weights (Parameters)
# Weight matrix between input and first layer
W = np.random.rand(self.inputLayerSize, self.hiddenLayerSize)
self.weights.append(W)
for i in range(self.numHiddenLayer-1):
# Weight matrices between hidden layers
W = np.random.rand(self.hiddenLayerSize, self.hiddenLayerSize)
self.weights.append(W)
# Weight matric between hiddenlayer and outputlayer
self.weights.append(np.random.rand(self.hiddenLayerSize, self.outputLayerSize))
def setBatchSize(self, numExamples):
# Changes the number of rows (examples) for biases
if (self.numExamples > numExamples):
self.biases = [b[:numExamples] for b in self.biases]
def sigmoid(self, z):
# Apply sigmoid activation function
return 1/(1+np.exp(-z))
def sigmoidPrime(self, z):
# Derivative of sigmoid function
return self.sigmoid(x)*(1-self.sigmoid(z))
def ReLU(self, z):
# Apply activation function
'''
for (i, j), item in np.ndenumerate(z):
if (item < 0):
item *= 0.01
else:
item = item
return z'''
return np.multiply((z < 0), z * 0.01) + np.multiply((z >= 0), z)
def ReLUPrime(self, z):
# Derivative of ReLU activation function
'''
for (i, j), item in np.ndenumerate(z):
if (item < 0):
item = 0.01
else:
item = 1
return z'''
return (z < 0) * 0.01 + (z >= 0) * 1
def forward(self, X):
# Propagate outputs through network
self.z = []
self.a = []
self.z.append(np.dot(X, self.weights[0]) + self.biases[0])
self.a.append(self.ReLU(self.z[0]))
#viewZ = self.z
#viewA = self.a
for i in range(1, self.numHiddenLayer):
self.z.append(np.dot(self.a[-1], self.weights[i]) + self.biases[i])
self.a.append(self.ReLU(self.z[-1]))
self.z.append(np.dot(self.z[-1], self.weights[-1]) + self.biases[-1])
self.a.append(self.ReLU(self.z[-1]))
yHat = self.ReLU(self.z[-1])
return yHat
def backProp(self, X, y):
# Compute derivative wrt W
# out -> in
dJdW = [] # stores matrices of each dJdW (equal in size to self.weights[])
delta = [] # stores matrices of each backpropagating error
self.yHat = self.forward(X)
# Quantifying Error
J = np.multiply((y-self.yHat),(y-self.yHat)) * 0.5
Javrg = np.dot(J.T, np.mat([1 for x in range(self.numExamples)]).reshape(self.numExamples, 1))
print(Javrg.item(0))
delta.insert(0,np.multiply(-(y-self.yHat), self.ReLUPrime(self.z[-1]))) # delta = (y-yHat)(sigmoidPrime(final layer unactivated))
dJdW.insert(0, np.dot(self.a[-2].T, delta[0]) + (self.weightDecay*self.weights[-1])) # dJdW
for i in range(len(self.weights)-1, 1, -1):
# Iterate from self.weights[-1] -> self.weights[1]
delta.insert(0, np.multiply(np.dot(delta[0], self.weights[i].T), self.ReLUPrime(self.z[i-1])))
dJdW.insert(0, np.dot(self.a[i-2].T, delta[0]) + (self.weightDecay*self.weights[i-1]))
delta.insert(0, np.multiply(np.dot(delta[0], self.weights[1].T), self.ReLUPrime(self.z[0])))
dJdW.insert(0, np.dot(X.T, delta[0]) + (self.weightDecay*self.weights[0]))
return dJdW
def train(self, X, y):
for t in range(60000):
dJdW = self.backProp(X, y)
for i in range(len(dJdW)):
self.weights[i] -= self.learningRate*dJdW[i]
# Instantiating Neural Network
inputs = [int(np.random.randint(0,1000)) for x in range(1000)]
x = np.mat([x for x in inputs]).reshape(1000,1)
y = np.mat([x+1 for x in inputs]).reshape(1000,1)
NN = Neural_Network(1,3,1,1,1000)
# Training
print("INPUT: ", end = '\n')
print(x, end = '\n\n')
print("BEFORE TRAINING", NN.forward(x), sep = '\n', end = '\n\n')
print("ERROR: ")
NN.train(x,y)
print("\nAFTER TRAINING", NN.forward(x), sep = '\n', end = '\n\n')
# Testing
test = np.mat([int(np.random.randint(0,10080)) for x in range(1000)]).reshape(1000,1)
print("TEST INPUT:", test, sep = '\n', end = '\n\n')
print(NN.forward(test), end = '\n\n')
NN.setBatchSize(1) # changing settings to receive one input at a time
while True:
# Give numbers between 0-100 (I need to fix overfitting) and it will get next value
inputs = input()
x = np.mat([int(i) for i in inputs.split(" ")])
print(NN.forward(x))
I first made the ANN using sigmoid but Leaky ReLU is faster.
The code is a bit much so here is a summary:
1. Neural Network class
- define hyperparameters and such (including the really small learning rate scalar)
- activation functions and their derivatives (ReLU and sigmoid)
- member functions: forward propagation, backpropagation, setBatchSize, etc.
2. Instantiating the ANN
- setting hyperparameters (topology of the ANN)
- creating data (one array has values x and the output array has values x+1)
3. Training
- using the inputs generated in step 2 to train the ANN
4. Testing
- testing using randomly generated inputs
- the user can give inputs
Hope that helps you help me. Thanks!
Your ReLU and ReLUPrime are wrong. When you iterate over a collection and mutate the items, it doesn't change the collection. Also: try not to explicitly iterate over arrays in numpy; use vectorized operations instead, because they are way faster. It should be a good exercise to rewrite ReLU and its derivative in vectorized form. If you aren't sure what I mean, check out this answer.
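For reference, a minimal vectorized leaky ReLU and its derivative (essentially what the one-line expressions already in the question compute, written with np.where):

import numpy as np

def leaky_relu(z, alpha=0.01):
    # pass positive values through, scale negative values by alpha
    return np.where(z >= 0, z, alpha * z)

def leaky_relu_prime(z, alpha=0.01):
    # slope is 1 for positive inputs and alpha for negative inputs
    return np.where(z >= 0, 1.0, alpha)

z = np.array([[-2.0, 0.5], [3.0, -0.1]])
print(leaky_relu(z))
print(leaky_relu_prime(z))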
Apart from that sigmoidPrime is wrong, it should be
self.sigmoid(z) * (1-self.sigmoid(z))
PS
This problem isn't really well suited for a neural network, at least not with this encoding - I've tried it with the exact same hyperparameters using scikit-learn's MLPRegressor and its output doesn't make much sense.

Neural Network MNIST: Backpropagation is correct, but training/test accuracy very low

I am building a neural network to learn to recognize handwritten digits from MNIST. I have confirmed that backpropagation calculates the gradients perfectly (gradient checking gives error < 10 ^ -10).
It appears that no matter how I train the weights, the cost function always tends towards around 3.24-3.25 (never below that, just approaching from above) and the training/test set accuracy is very low (around 11% for the test set). It appears that the h values in the end are all very close to 0.1 and to each other.
I cannot find why my program cannot produce better results. I was wondering if anyone could maybe take a look at my code and please tell me any reasons for this occurring. Thank you so much for all your help, I really appreciate it!
Here is my Python code:
import numpy as np
import math
from tensorflow.examples.tutorials.mnist import input_data
# Neural network has four layers
# The input layer has 784 nodes
# The two hidden layers each have 5 nodes
# The output layer has 10 nodes
num_layer = 4
num_node = [784,5,5,10]
num_output_node = 10
# 30000 training sets are used
# 10000 test sets are used
# Can be adjusted
Ntrain = 30000
Ntest = 10000
# Sigmoid Function
def g(X):
return 1/(1 + np.exp(-X))
# Forwardpropagation
def h(W,X):
a = X
for l in range(num_layer - 1):
a = np.insert(a,0,1)
z = np.dot(a,W[l])
a = g(z)
return a
# Cost Function
def J(y, W, X, Lambda):
cost = 0
for i in range(Ntrain):
H = h(W,X[i])
for k in range(num_output_node):
cost = cost + y[i][k] * math.log(H[k]) + (1-y[i][k]) * math.log(1-H[k])
regularization = 0
for l in range(num_layer - 1):
for i in range(num_node[l]):
for j in range(num_node[l+1]):
regularization = regularization + W[l][i+1][j] ** 2
return (-1/Ntrain * cost + Lambda / (2*Ntrain) * regularization)
# Backpropagation - confirmed to be correct
# Algorithm based on https://www.coursera.org/learn/machine-learning/lecture/1z9WW/backpropagation-algorithm
# Returns D, the value of the gradient
def BackPropagation(y, W, X, Lambda):
delta = np.empty(num_layer-1, dtype = object)
for l in range(num_layer - 1):
delta[l] = np.zeros((num_node[l]+1,num_node[l+1]))
for i in range(Ntrain):
A = np.empty(num_layer-1, dtype = object)
a = X[i]
for l in range(num_layer - 1):
A[l] = a
a = np.insert(a,0,1)
z = np.dot(a,W[l])
a = g(z)
diff = a - y[i]
delta[num_layer-2] = delta[num_layer-2] + np.outer(np.insert(A[num_layer-2],0,1),diff)
for l in range(num_layer-2):
index = num_layer-2-l
diff = np.multiply(np.dot(np.array([W[index][k+1] for k in range(num_node[index])]), diff), np.multiply(A[index], 1-A[index]))
delta[index-1] = delta[index-1] + np.outer(np.insert(A[index-1],0,1),diff)
D = np.empty(num_layer-1, dtype = object)
for l in range(num_layer - 1):
D[l] = np.zeros((num_node[l]+1,num_node[l+1]))
for l in range(num_layer-1):
for i in range(num_node[l]+1):
if i == 0:
for j in range(num_node[l+1]):
D[l][i][j] = 1/Ntrain * delta[l][i][j]
else:
for j in range(num_node[l+1]):
D[l][i][j] = 1/Ntrain * (delta[l][i][j] + Lambda * W[l][i][j])
return D
# Neural network - this is where the learning/adjusting of weights occur
# W is the weights
# learn is the learning rate
# iterations is the number of iterations we pass over the training set
# Lambda is the regularization parameter
def NeuralNetwork(y, X, learn, iterations, Lambda):
W = np.empty(num_layer-1, dtype = object)
for l in range(num_layer - 1):
W[l] = np.random.rand(num_node[l]+1,num_node[l+1])/100
for k in range(iterations):
print(J(y, W, X, Lambda))
D = BackPropagation(y, W, X, Lambda)
for l in range(num_layer-1):
W[l] = W[l] - learn * D[l]
print(J(y, W, X, Lambda))
return W
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# Training data, read from MNIST
inputpix = []
output = []
for i in range(Ntrain):
inputpix.append(2 * np.array(mnist.train.images[i]) - 1)
output.append(np.array(mnist.train.labels[i]))
np.savetxt('input.txt', inputpix, delimiter=' ')
np.savetxt('output.txt', output, delimiter=' ')
# Train the weights
finalweights = NeuralNetwork(output, inputpix, 2, 5, 1)
# Test data
inputtestpix = []
outputtest = []
for i in range(Ntest):
inputtestpix.append(2 * np.array(mnist.test.images[i]) - 1)
outputtest.append(np.array(mnist.test.labels[i]))
np.savetxt('inputtest.txt', inputtestpix, delimiter=' ')
np.savetxt('outputtest.txt', outputtest, delimiter=' ')
# Determine the accuracy of the training data
count = 0
for i in range(Ntrain):
H = h(finalweights,inputpix[i])
print(H)
for j in range(num_output_node):
if H[j] == np.amax(H) and output[i][j] == 1:
count = count + 1
print(count/Ntrain)
# Determine the accuracy of the test data
count = 0
for i in range(Ntest):
H = h(finalweights,inputtestpix[i])
print(H)
for j in range(num_output_node):
if H[j] == np.amax(H) and outputtest[i][j] == 1:
count = count + 1
print(count/Ntest)
Your network is tiny; 5 neurons per hidden layer make it basically a linear model. Increase it to 256 per layer.
Notice that a trivial linear model has 784 * 10 + 10 (biases) parameters, adding up to 7850 floats. Your neural network, on the other hand, has 784 * 5 + 5 + 5 * 5 + 5 + 5 * 10 + 10 = 3925 + 30 + 60 = 4015. In other words, despite being a nonlinear neural network, it is actually a simpler model than a trivial logistic regression applied to this problem. And logistic regression obtains around 11% error on its own, thus you cannot really expect to beat it. Of course this is not a strict argument, but it should give you some intuition for why it should not work.
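If it helps, this capacity comparison is easy to check in code; a small sketch reusing the layer sizes from the question's num_node list:

# Parameter counts: the question's 784-5-5-10 network versus plain logistic regression on MNIST.
layer_sizes = [784, 5, 5, 10]
nn_params = sum((n_in + 1) * n_out                 # +1 accounts for the bias row
                for n_in, n_out in zip(layer_sizes, layer_sizes[1:]))
logreg_params = (784 + 1) * 10
print(nn_params, logreg_params)                    # 4015 vs 7850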
The second issue is related to the other hyperparameters; you seem to be using:
a huge learning rate (is it 2?) - it should be more on the order of 0.0001
very few training iterations (are you just executing 5 epochs?)
a huge regularization parameter (it is set to 1), so your network is heavily penalised for learning anything; again, change it to something an order of magnitude smaller
The NN architecture is most likely under-fitting. Maybe the learning rate is too high or too low, or there are issues with the regularization parameter.

Back-propagation algorithm converging too quickly to poor results

I'm trying to implement the back-propagation algorithm for a multi-layer feedforward neural network, but I'm having issues getting it to converge to good results. The reason is that the gradient descent gets stuck on a plateau of the root mean squared error.
As you can see in the graph, there is very little change in the rms value for the first 70 epochs or so. Therefore the gradient descent thinks it has found a minimum and stops. To fix this I set a requirement that the rms error must be below 0.3 in addition to the rate of change being below a given value. However, I don't think this is good, as I believe there is something wrong with my implementation.
Below is the Ruby code:
def train eta, criteria
rms = 1
old_rms = 0
rms_window = Array.new 20, 0
new_avg = 10
old_avg = 0
diff = 100
epoch = 0
@data[:training].shuffle!
while (diff > criteria || rms > 0.3) do
#while (diff > criteria) do
rms = 0
old_avg = new_avg
new_avg = 0
classification_error = 0
sample_num = 0
@data[:training].each_with_index do |s, s_i|
# Forward Propagation
inputs = [1, s[1], s[2]]
@hidden_layers.each_with_index do |hl, hl_i|
outputs = Array.new
# Bias Term
outputs << 1
# Compute the output for each neuron
hl.each do |p|
outputs << p.compute_output(inputs)
end
inputs = outputs
end
# Compute System Outputs
outputs = Array.new
@outputs.each do |p|
outputs << p.compute_output(inputs)
end
# Compute Errors
errors = Array.new
desired = @desired_values[s[0]-1]
@outputs.length.times do |x|
errors[x] = desired[x] - outputs[x]
rms += errors[x]**2
end
decision = outputs.each_with_index.max[1]
if decision+1 != s[0]
classification_error += 1
end
# Back Propagation
gradients = Array.new
local_gradient = Array.new
next_layer = Array.new
@outputs.each_with_index do |o, i|
local_gradient << errors[i] * o.activation_prime(o.output)
o.weights.length.times do |x|
o.weights[x] += eta * local_gradient[i] * o.inputs[x]
end
end
gradients << local_gradient
next_layer = @outputs
@hidden_layers.reverse_each do |hl|
local_gradient = Array.new
hl.each do |p|
gradient = 0
gradients.last.each_with_index do |g, i|
gradient += g * next_layer[i].weights[p.index+1]
end
gradient *= p.activation_prime(p.output)
local_gradient << gradient
p.weights.each_index do |x|
p.weights[x] += eta * gradient * p.inputs[x]
end
end
gradients << local_gradient
next_layer = hl
end
if s_i == 0
#puts "Epoch: #{epoch}\nOutputs: #{outputs}\nGradients:\n#{gradients[0]}\n#{gradients[1]}\n#{gradients[2]}\n\n"
#puts "Epoch #{epoch}\nError: #{errors}\nSE: #{rms}"
end
end
rms = Math::sqrt(rms / (@data[:training].length * 4))
rms_window[0] = rms
rms_window.rotate!
rms_window.each do |x|
new_avg += x
end
new_avg /= 20
diff = (new_avg - old_avg).abs
@rms << rms
epoch += 1
if classification_error == 0
break
end
#puts "RMS: #{rms}\tDiff: \t#{diff}\tClassification: #{classification_error}\n\n"
end
self.rms_plot "Plot"
self.grid_eval "Test", 250
end
The graph shown is for a 2-hidden layer network with 5 neurons in each hidden layer. There are 2 inputs and 4 outputs. Perhaps this is normal behavior, but something just seems off to me. Any help would be greatly appreciated.
There are many parameters that need to be tuned to get a multi-layer neural net to work. Based on my experience, my first suggestions are:
1- give it a small set of synthesized data and run a baby project to see if the framework works.
2- Use a more convex cost function. There is no function that guarantees convexity, but there are many functions that are more convex than RMS.
3- Try scaling your input data into (-1,1) and your output data into (0,1); see the sketch after this list.
4- Try different values for learning rate.
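For point 3, a minimal min-max scaling sketch (Python/NumPy here only for brevity, with made-up inputs and targets; the same arithmetic is easy to port to Ruby):

import numpy as np

def rescale(a, lo, hi):
    # linearly map each column of a into the interval [lo, hi]
    a_min, a_max = a.min(axis=0), a.max(axis=0)
    return lo + (a - a_min) * (hi - lo) / (a_max - a_min)

X = np.array([[0.0, 10.0], [5.0, 20.0], [10.0, 40.0]])   # made-up inputs
y = np.array([[1.0], [2.0], [4.0]])                      # made-up targets
print(rescale(X, -1.0, 1.0))   # inputs scaled into (-1, 1)
print(rescale(y, 0.0, 1.0))    # targets scaled into (0, 1)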
In addition to what's already been said:
vary the range of the initial weights a little more (0 - 1 for example)
make sure your input data are properly normalised - I feel this can't be said often enough
vary the learning rate, start with something like 0.05 and keep increasing/decreasing it in small steps (if you find that changing your learning rate has too extreme an effect on the network's performance, then you may not have normalised your input data appropriately)
shuffle the input data before every epoch
try using momentum (keep a decaying running sum of past weight updates and add it to each new update; this speeds up learning along consistently steep directions and often helps to jump over local optima and plateaus) - see the sketch after this list
try using regularisation
experiment with the structure (add another hidden layer, increase the number of units in the hidden layer)
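A minimal sketch of the momentum update mentioned above (plain Python with a made-up gradient sequence, just to show the rule):

# Gradient descent with momentum: each step blends the new gradient with the previous step.
eta, mu = 0.05, 0.9                        # learning rate and momentum coefficient
weight, velocity = 0.0, 0.0
gradients = [1.0, 0.9, 0.8, 0.1, 0.05]     # made-up gradients, for illustration only
for g in gradients:
    velocity = mu * velocity - eta * g     # decaying running sum of past updates
    weight += velocity                     # consistent gradient directions build up speed
    print(weight)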

How to derive an objective function for a multi-class logistic regression classifier using 1-of-k encoding?

I get what this wiki page says (http://en.wikipedia.org/wiki/Multinomial_logistic_regression), but I don't know how to derive the update rules for stochastic gradient descent. Sorry to ask this here (this is really just about machine learning theory rather than an actual implementation). Could someone provide a solution with an explanation? Thanks in advance!
I happened to write code to implement softmax; I referred mostly to the page http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression
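In case it helps to see the formulas the code below is based on (standard softmax regression, as on that page): with 1-of-K targets the unregularized objective and its per-class gradient are, in LaTeX notation,
J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K} \mathbf{1}\{y^{(i)}=k\}\,\log\frac{e^{\theta_k^\top x^{(i)}}}{\sum_{j=1}^{K} e^{\theta_j^\top x^{(i)}}}
\nabla_{\theta_k} J(\theta) = -\frac{1}{m}\sum_{i=1}^{m} x^{(i)}\left(\mathbf{1}\{y^{(i)}=k\} - P(y^{(i)}=k \mid x^{(i)};\theta)\right)
so a stochastic gradient descent update on a single example (x^{(i)}, y^{(i)}) with learning rate \alpha is \theta_k \leftarrow \theta_k + \alpha\, x^{(i)}\left(\mathbf{1}\{y^{(i)}=k\} - P(y^{(i)}=k \mid x^{(i)};\theta)\right) for each class k.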
This is the code I wrote in MATLAB; hope it helps.
function y = sigmoid_multi(weight,x,class_index)
%% weight feature_dim * class_num
%% x feature_dim * 1
%% class_index scalar
sum = eps;
class_num = size(weight,2);
for i = 1:class_num
sum = sum + exp(weight(:,i)'*x);
end
y = exp(weight(:,class_index)'*x)/sum;
end
function g = gradient(train_patterns,train_labels,weight)
m = size(train_patterns,2);
class_num = size(weight,2);
g = zeros(size(weight));
for j = 1:class_num
for i = 1:m
%% indicator (1 if sample i belongs to class j) minus the predicted probability
indicator = double(train_labels(i) == j);
p = sigmoid_multi(weight,train_patterns(:,i),j);
g(:,j) = g(:,j) + (indicator - p)*train_patterns(:,i);
end
end
g = -(g/m);
end
function J = object_function(train_patterns,train_labels,weight)
m = size(train_patterns,2);
J = 0;
for i = 1:m
J = J + log( sigmoid_multi(weight,train_patterns(:,i),train_labels(i)) + eps);
end
J = -(J/m);
end
function weight = multi_logistic_train(train_patterns,train_labels,alpha)
%% weight feature_dim * class_num
%% train_patterns featur_dim * sample_num
%% train_labels 1 * sample_num
%% alpha scalar
class_num = length(unique(train_labels));
m = size(train_patterns,2); %% sample_number;
n = size(train_patterns,1); % feature_dim;
weight = rand(n,class_num);
for i = 1:40
J = object_function(train_patterns,train_labels,weight);
fprintf('objec function value : %f\n',J);
weight = weight - alpha*gradient(train_patterns,train_labels,weight);
end
end
