I've been trying to replicate the [2,2,1] neural network that learns the XOR gate, and I can't get my model to converge. I'm not sure where i'm going wrong, and I would really appreciate some feedback.
Here is my code for the class originally run in a Jupyter notebook:
# Define activation function and derivative
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def dsigmoid(x):
return sigmoid(x) * (1 - sigmoid(x))
# Define loss function and derivative
def mse(targets, predictions):
return (1 / (2 * len(targets))) * (targets - predictions) ** 2
def dmse(targets, predictions):
return targets - predictions
# Feedforward function
def feedforward(X):
z1 = np.dot(l1_weights, X.T) + l1_biases
a1 = sigmoid(z1)
z2 = np.dot(l2_weights, a1) + l2_biases
a2 = sigmoid(z2)
return z1, a1, z2, a2
# Backpropogation function
def backprop(x, y):
z1, a1, z2, a2 = feedforward(x)
delta_l2 = dmse(y.T, a2) * dsigmoid(z2)
delta_l1 = np.dot(l2_weights.T, delta_l2) * dsigmoid(z1)
l2_dw = np.dot(delta_l2, a1.T)
l2_db = delta_l2
l1_dw = np.dot(delta_l1, x)
l1_db = delta_l1
return l1_dw, l1_db, l2_dw, l2_db
# Input data and labels
X = np.array([[1,0],[0,1],[1,1],[0,0]])
Y = np.array([[1],[1],[0],[0]])
# Create the data set
data = [(np.array(x), np.array(y)) for x, y in zip(X, Y)]
# Randomly initialize weights and biases
np.random.seed(1)
l1_weights = np.random.randn(2,2)
l2_weights = np.random.randn(1,2)
l1_biases = np.random.randn(2,1)
l2_biases = np.random.randn(1,1)
# Train the model
epochs = 100000
eta = 0.05
batch_size = 4
# Batch the data
batches = [data[i:i + batch_size] for i in range(0, len(data), batch_size)]
for batch in batches:
feature_batch = np.array([x[0] for x in batch])
label_batch = np.array([x[1] for x in batch])
# Update network weights with stochastic gradient descent
l1_dw, l1_db, l2_dw, l2_db = backprop(feature_batch, label_batch)
l1_weights = l1_weights - (eta / batch_size) * l1_dw
l2_weights = l2_weights - (eta / batch_size) * l2_dw
l1_biases = l1_biases - (eta / batch_size) * l1_db
l2_biases = l2_biases - (eta / batch_size) * l2_db
Here is a sample output after training with:
training epochs = 100000
learning rate = 0.5
logging periods = 10
batch size = 4
Error for period 1: 0.135619
Error for period 2: 0.249879
Error for period 3: 0.249941
Error for period 4: 0.249961
Error for period 5: 0.249956
Error for period 6: 0.249963
Error for period 7: 0.249972
Error for period 8: 0.249983
Error for period 9: 0.249986
Error for period 10: 0.249981
# OUTPUT
print(feedforward(X)[-1])
>>> array([[0.99997653, 0.99995257, 0.99997791, 0.99995534]])
Please help!
Related
I have created a very simple neural network to solve the following ODE:
My code doesn't seem to work well, which is pretty weird since the ODE is simple. Below is my code with comments. I believe the problem is coming from torch.gradient(preds) but I can't figure it out. Any help would be greatly appreciated!
inputs = np.linspace(0,2,50,dtype='float32')
inputs = torch.from_numpy(inputs)
inputs.requires_grad = True
inputs.float()
# Initial Condition
A = 0.0
# Known trial solution for general ODE
trial_solution = lambda I: A + I * NN(I)
ODE = lambda x: x
# Numbers of neurons from layers 1 to 3
neuron_1 = 9
neuron_2 = 12
neuron_3 = 8
# Initializing the weights and biases
W1 = torch.randn(len(inputs), neuron_1, requires_grad=True).float()
B1 = torch.randn(neuron_1, requires_grad=True).float()
W2 = torch.randn(neuron_1, neuron_2, requires_grad=True)
B2 = torch.randn(neuron_2, requires_grad=True)
W3 = torch.randn(neuron_2, neuron_3, requires_grad=True)
B3 = torch.randn(neuron_3, requires_grad=True)
W4 = torch.randn(neuron_3, len(inputs), requires_grad=True)
B4 = torch.randn(len(inputs), requires_grad=True)
# General sigmoid function used for neurons inside hidden layers
def Sigmoid(z):
return 1/(1+torch.exp(-z))
# Defining the structure of my Neural Network
def NN(x):
z1 = W1.t() # x + B1
z2 = Sigmoid(z1)
z3 = W2.t() # z2 + B2
z4 = Sigmoid(z3)
z5 = W3.t() # z4 + B3
z6 = Sigmoid(z5)
return W4.t() # z6 + B4
# Defining the mean squared error to be my loss function
def mse(t1,t2):
diff = t1-t2
return torch.sum(diff*diff)/diff.numel()
# Iterating over number of epochs for backprop.
for i in range(10000):
preds = trial_solution(inputs)
preds_gradient = torch.gradient(preds) # Possible problem!!
preds_gradient =preds_gradient[0]
target_gradient = ODE(inputs)
loss = mse(preds_gradient,target_gradient)
loss.backward()
#print(loss)
with torch.no_grad():
W1 -= W1.grad*1e-2
B1 -= B1.grad*1e-2
W1.grad.zero_()
B1.grad.zero_()
W2 -= W2.grad*1e-2
B2 -= B2.grad*1e-2
W2.grad.zero_()
B2.grad.zero_()
W3 -= W3.grad*1e-2
B3 -= B3.grad*1e-2
W3.grad.zero_()
B3.grad.zero_()
W4 -= W4.grad*1e-2
B4 -= B4.grad*1e-2
W4.grad.zero_()
B4.grad.zero_()
numpy_tensors = np.linspace(0,2,50,dtype='float32')
numpy_tensors = torch.from_numpy(numpy_tensors)
final_pred = trial_solution(inputs).detach().numpy()
xx = np.linspace(0,2,50)
yt = xx*xx * 0.5
plt.plot(numpy_tensors,final_pred, label='Prediction from NN')
plt.plot(xx,yt,label = 'True Solution')
plt.legend()
I don't understand. When I hardcode my script, it converges excellent, but in the softcode version, given the same structure and learning rate, it converges very slowly and then simply stops converging from some point on.
Here is the softcode version:
def BCE_loss(Y_hat, Y):
m = Y_hat.shape[1]
cost = (-1 / m) * (np.dot(Y, np.log(Y_hat+1e-5).T) + np.dot(1-Y, np.log(1-Y_hat+1e-5).T))
cost = np.squeeze(cost)
return cost
def BCE_loss_backward(Y_hat, Y):
dA_prev = - (np.divide(Y, Y_hat) - np.divide(1-Y, 1-Y_hat))
return dA_prev
def gradient(dZ, A_prev):
dW = np.dot(dZ, A_prev.T) * (1 / A_prev.shape[1])
db = np.sum(dZ, axis=1, keepdims=True) * (1 / A_prev.shape[1])
return dW, db
def update(W, b, dW, db, learning_rate):
W -= np.dot(learning_rate, dW)
b -= np.dot(learning_rate, db)
return W, b
for i in range(epochs+1):
## Forward pass
for l in range(1, L):
if l==L-1:
if out_dim==1:
grads_GD['Z'+str(l)] = linear(params_GD['W'+str(l)], grads_GD['A'+str(l-1)], params_GD['b'+str(l)])
grads_GD['A'+str(l)] = sigmoid(grads_GD['Z'+str(l)])
else:
grads_GD['Z'+str(l)] = linear(params_GD['W'+str(l)], grads_GD['A'+str(l-1)], params_GD['b'+str(l)])
grads_GD['A'+str(l)] = softmax(grads_GD['Z'+str(l)])
else:
grads_GD['Z'+str(l)] = linear(params_GD['W'+str(l)], grads_GD['A'+str(l-1)], params_GD['b'+str(l)])
grads_GD['A'+str(l)] = relu(grads_GD['Z'+str(l)])
## Compute cost
if out_dim==1:
cost_GD = BCE_loss(grads_GD['A'+str(L-1)], Y)
cost_list_GD.append(cost_GD)
else:
cost_GD = CE_loss(grads_GD['A'+str(L-1)], Y)
cost_list_GD.append(cost_GD)
## Print cost
if i % print_num == 0:
print(f"Cost for gradient descent optimizer after epoch {i}: {cost_GD: .4f}")
elif cost_GD < cost_lim or i == epochs:
last_epoch_GD = i
print(f"Cost for gradient descent optimizer after epoch {i}: {cost_GD: .4f}")
break
else:
continue
## Backward pass
if out_dim==1:
grads_GD['dA'+str(L-1)] = BCE_loss_backward(grads_GD['A'+str(L-1)], Y)
grads_GD['dZ'+str(L-1)] = sigmoid_backward(grads_GD['dA'+str(L-1)], grads_GD['Z'+str(L-1)])
else:
grads_GD['dA'+str(L-1)] = CE_loss_backward(grads_GD['A'+str(L-1)], Y)
grads_GD['dZ'+str(L-1)] = softmax_backward(grads_GD['dA'+str(L-1)], grads_GD['Z'+str(L-1)])
grads_GD['dW'+str(L-1)], grads_GD['db'+str(L-1)] = gradient(grads_GD['dZ'+str(L-1)], grads_GD['A'+str(L-2)])
for l in reversed(range(1, L-1)):
grads_GD['dA'+str(l)] = linear_backward(params_GD['W'+str(l+1)], grads_GD['dZ'+str(l+1)])
grads_GD['dZ'+str(l)] = relu_backward(grads_GD['dA'+str(l)], grads_GD['Z'+str(l)])
grads_GD['dW'+str(l)], grads_GD['db'+str(l)] = gradient(grads_GD['dZ'+str(l)], grads_GD['A'+str(l-1)])
## Update parameters
for l in range(1, L):
params_GD['W'+str(l)], params_GD['b'+str(l)] = update(params_GD['W'+str(l)], params_GD['b'+str(l)], grads_GD['dW'+str(l)], grads_GD['db'+str(l)], learning_rate)
and here is the hardcode version:
def cost_function(Y, A4, N, epsilon):
cost = (-1 / N) * np.sum(np.multiply(Y, np.log(A4 + epsilon)) + np.multiply(1 - Y, np.log(1 - A4 + epsilon)))
return cost
for i in range(epochs):
Z1_GD = np.dot(W1_GD, X) + b1_GD
A1_GD = np.maximum(0, Z1_GD)
Z2_GD = np.dot(W2_GD, A1_GD) + b2_GD
A2_GD = np.maximum(0, Z2_GD)
Z3_GD = np.dot(W3_GD, A2_GD) + b3_GD
A3_GD = np.maximum(0, Z3_GD)
Z4_GD = np.dot(W4_GD, A3_GD) + b4_GD
A4_GD = class_layer(Z4_GD)
dZ4_GD = A4_GD - Y
dW4_GD = np.dot(dZ4_GD, A3_GD.T) * (1. / A3_GD.shape[1])
db4_GD = np.sum(dZ4_GD, axis=1, keepdims=True) * (1. / A3_GD.shape[1])
dA3_GD = np.dot(W4_GD.T, dZ4_GD)
dZ3_GD = np.array(dA3_GD, copy=True)
dZ3_GD[Z3_GD <= 0] = 0
dW3_GD = np.dot(dZ3_GD, A2_GD.T) * (1. / A2_GD.shape[1])
db3_GD = np.sum(dZ3_GD, axis=1, keepdims=True) * (1. / A2_GD.shape[1])
dA2_GD = np.dot(W3_GD.T, dZ3_GD)
dZ2_GD = np.array(dA2_GD, copy=True)
dZ2_GD[Z2_GD <= 0] = 0
dW2_GD = np.dot(dZ2_GD, A1_GD.T) * (1. / A1_GD.shape[1])
db2_GD = np.sum(dZ2_GD, axis=1, keepdims=True) * (1. / A1_GD.shape[1])
dA1_GD = np.dot(W2_GD.T, dZ2_GD)
dZ1_GD = np.array(dA1_GD, copy=True)
dZ1_GD[Z1_GD <= 0] = 0
dW1_GD = np.dot(dZ1_GD, X.T) * (1. / X.shape[1])
db1_GD = np.sum(dZ1_GD, axis=1, keepdims=True) * (1. / X.shape[1])
W1_GD = W1_GD - learning_rate * dW1_GD
b1_GD = b1_GD - learning_rate * db1_GD
W2_GD = W2_GD - learning_rate * dW2_GD
b2_GD = b2_GD - learning_rate * db2_GD
W3_GD = W3_GD - learning_rate * dW3_GD
b3_GD = b3_GD - learning_rate * db3_GD
W4_GD = W4_GD - learning_rate * dW4_GD
b4_GD = b4_GD - learning_rate * db4_GD
cost_GD = cost_function(Y, A4_GD, N, epsilon)
cost_GD = np.squeeze(cost_GD)
cost_list_GD.append(cost_GD)
I suppose something went wrong during softcoding.
I solved it myself. Apparently, the "else: continue" line in the print cost section caused the algorithm to do a backward pass only once. After that, it was just looping through the forward pass. Can anyone please explain the reason for such behavior?
I wrote a RNN with LSTM cell with Pycharm. The peculiarity of this network is that the output of the RNN is fed into a integration opeartion, computed with Runge-kutta.
The integration takes some input and propagate that in time one step ahead. In order to do so I need to slice the feature tensor X along the batch dimension, and pass this to the Runge-kutta.
class MyLSTM(torch.nn.Module):
def __init__(self, ni, no, sampling_interval, nh=10, nlayers=1):
super(MyLSTM, self).__init__()
self.device = torch.device("cpu")
self.dtype = torch.float
self.ni = ni
self.no = no
self.nh = nh
self.nlayers = nlayers
self.lstms = torch.nn.ModuleList(
[torch.nn.LSTMCell(self.ni, self.nh)] + [torch.nn.LSTMCell(self.nh, self.nh) for i in range(nlayers - 1)])
self.out = torch.nn.Linear(self.nh, self.no)
self.do = torch.nn.Dropout(p=0.2)
self.actfn = torch.nn.Sigmoid()
self.sampling_interval = sampling_interval
self.scaler_states = None
# Options
# description of the whole block
def forward(self, x, h0, train=False, integrate_ode=True):
x0 = x.clone().requires_grad_(True)
hs = x # initiate hidden state
if h0 is None:
h = torch.zeros(hs.shape[0], self.nh, device=self.device)
c = torch.zeros(hs.shape[0], self.nh, device=self.device)
else:
(h, c) = h0
# LSTM cells
for i in range(self.nlayers):
h, c = self.lstms[i](hs, (h, c))
if train:
hs = self.do(h)
else:
hs = h
# Output layer
# y = self.actfn(self.out(hs))
y = self.out(hs)
if integrate_ode:
p = y
y = self.integrate(x0, p)
return y, (h, c)
def integrate(self, x0, p):
# RK4 steps per interval
M = 4
DT = self.sampling_interval / M
X = x0
# X = self.scaler_features.inverse_transform(x0)
for b in range(X.shape[0]):
xx = X[b, :]
for j in range(M):
k1 = self.ode(xx, p[b, :])
k2 = self.ode(xx + DT / 2 * k1, p[b, :])
k3 = self.ode(xx + DT / 2 * k2, p[b, :])
k4 = self.ode(xx + DT * k3, p[b, :])
xx = xx + DT / 6 * (k1 + 2 * k2 + 2 * k3 + k4)
X_all[b, :] = xx
return X_all
def ode(self, x0, y):
# Here I a dynamic model
I get this error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor []], which is output 0 of SelectBackward, is at version 64; expected version 63 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
the problem is in the operations xx = X[b, :] and p[b,:]. I know that because I choose batch dimension of 1, then I can replace the previous two equations with xx=X and p, and this works. How can split the tensor without loosing the gradient?
I had the same question, and after a lot of searching, I added .detach() function after "h" and "c" in the RNN cell.
I am building a neural network to learn to recognize handwritten digits from MNIST. I have confirmed that backpropagation calculates the gradients perfectly (gradient checking gives error < 10 ^ -10).
It appears that no matter how I train the weights, the cost function always tends towards around 3.24-3.25 (never below that, just approaching from above) and the training/test set accuracy is very low (around 11% for the test set). It appears that the h values in the end are all very close to 0.1 and to each other.
I cannot find why my program cannot produce better results. I was wondering if anyone could maybe take a look at my code and please tell me any reasons for this occurring. Thank you so much for all your help, I really appreciate it!
Here is my Python code:
import numpy as np
import math
from tensorflow.examples.tutorials.mnist import input_data
# Neural network has four layers
# The input layer has 784 nodes
# The two hidden layers each have 5 nodes
# The output layer has 10 nodes
num_layer = 4
num_node = [784,5,5,10]
num_output_node = 10
# 30000 training sets are used
# 10000 test sets are used
# Can be adjusted
Ntrain = 30000
Ntest = 10000
# Sigmoid Function
def g(X):
return 1/(1 + np.exp(-X))
# Forwardpropagation
def h(W,X):
a = X
for l in range(num_layer - 1):
a = np.insert(a,0,1)
z = np.dot(a,W[l])
a = g(z)
return a
# Cost Function
def J(y, W, X, Lambda):
cost = 0
for i in range(Ntrain):
H = h(W,X[i])
for k in range(num_output_node):
cost = cost + y[i][k] * math.log(H[k]) + (1-y[i][k]) * math.log(1-H[k])
regularization = 0
for l in range(num_layer - 1):
for i in range(num_node[l]):
for j in range(num_node[l+1]):
regularization = regularization + W[l][i+1][j] ** 2
return (-1/Ntrain * cost + Lambda / (2*Ntrain) * regularization)
# Backpropagation - confirmed to be correct
# Algorithm based on https://www.coursera.org/learn/machine-learning/lecture/1z9WW/backpropagation-algorithm
# Returns D, the value of the gradient
def BackPropagation(y, W, X, Lambda):
delta = np.empty(num_layer-1, dtype = object)
for l in range(num_layer - 1):
delta[l] = np.zeros((num_node[l]+1,num_node[l+1]))
for i in range(Ntrain):
A = np.empty(num_layer-1, dtype = object)
a = X[i]
for l in range(num_layer - 1):
A[l] = a
a = np.insert(a,0,1)
z = np.dot(a,W[l])
a = g(z)
diff = a - y[i]
delta[num_layer-2] = delta[num_layer-2] + np.outer(np.insert(A[num_layer-2],0,1),diff)
for l in range(num_layer-2):
index = num_layer-2-l
diff = np.multiply(np.dot(np.array([W[index][k+1] for k in range(num_node[index])]), diff), np.multiply(A[index], 1-A[index]))
delta[index-1] = delta[index-1] + np.outer(np.insert(A[index-1],0,1),diff)
D = np.empty(num_layer-1, dtype = object)
for l in range(num_layer - 1):
D[l] = np.zeros((num_node[l]+1,num_node[l+1]))
for l in range(num_layer-1):
for i in range(num_node[l]+1):
if i == 0:
for j in range(num_node[l+1]):
D[l][i][j] = 1/Ntrain * delta[l][i][j]
else:
for j in range(num_node[l+1]):
D[l][i][j] = 1/Ntrain * (delta[l][i][j] + Lambda * W[l][i][j])
return D
# Neural network - this is where the learning/adjusting of weights occur
# W is the weights
# learn is the learning rate
# iterations is the number of iterations we pass over the training set
# Lambda is the regularization parameter
def NeuralNetwork(y, X, learn, iterations, Lambda):
W = np.empty(num_layer-1, dtype = object)
for l in range(num_layer - 1):
W[l] = np.random.rand(num_node[l]+1,num_node[l+1])/100
for k in range(iterations):
print(J(y, W, X, Lambda))
D = BackPropagation(y, W, X, Lambda)
for l in range(num_layer-1):
W[l] = W[l] - learn * D[l]
print(J(y, W, X, Lambda))
return W
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# Training data, read from MNIST
inputpix = []
output = []
for i in range(Ntrain):
inputpix.append(2 * np.array(mnist.train.images[i]) - 1)
output.append(np.array(mnist.train.labels[i]))
np.savetxt('input.txt', inputpix, delimiter=' ')
np.savetxt('output.txt', output, delimiter=' ')
# Train the weights
finalweights = NeuralNetwork(output, inputpix, 2, 5, 1)
# Test data
inputtestpix = []
outputtest = []
for i in range(Ntest):
inputtestpix.append(2 * np.array(mnist.test.images[i]) - 1)
outputtest.append(np.array(mnist.test.labels[i]))
np.savetxt('inputtest.txt', inputtestpix, delimiter=' ')
np.savetxt('outputtest.txt', outputtest, delimiter=' ')
# Determine the accuracy of the training data
count = 0
for i in range(Ntrain):
H = h(finalweights,inputpix[i])
print(H)
for j in range(num_output_node):
if H[j] == np.amax(H) and output[i][j] == 1:
count = count + 1
print(count/Ntrain)
# Determine the accuracy of the test data
count = 0
for i in range(Ntest):
H = h(finalweights,inputtestpix[i])
print(H)
for j in range(num_output_node):
if H[j] == np.amax(H) and outputtest[i][j] == 1:
count = count + 1
print(count/Ntest)
Your network is tiny, 5 neurons make it basically a linear model. Increase it to 256 per layer.
Notice, that trivial linear model has 768 * 10 + 10 (biases) parameters, adding up to 7690 floats. Your neural network on the other hand has 768 * 5 + 5 + 5 * 5 + 5 + 5 * 10 + 10 = 3845 + 30 + 60 = 3935. In other words despite being nonlinear neural network, it is actualy a simpler model than a trivial logistic regression applied to this problem. And logistic regression obtains around 11% error on its own, thus you cannot really expect to beat it. Of course this is not a strict argument, but should give you some intuition for why it should not work.
Second issue is related to other hyperparameters, you seem to be using:
huge learning rate (is it 2?) it should be more of order 0.0001
very little training iterations (are you just executing 5 epochs?)
your regularization parameter is huge (it is set to 1), so your network is heavily penalised for learning anything, again - change it to something order of magnitude smaller
The NN architecture is most likely under-fitting. Maybe, the learning rate is high/low. Or there are most issues with the regularization parameter.
So I am training a CNN and compute the training accuracy for each batch. Most of the it gives out 100% batch training accuracy. which I though was okay because I'm testing my model against the data I trained it with. But at some iterations, I get a 90% or 90% batch training accuracy. And worst, sometimes it goes down to 0% real quick and bounces back to 100% batch training accuracy. And I used the algorithm in https://github.com/Hvass-Labs/TensorFlow-Tutorials/blob/master/04_Save_Restore.ipynb and they also computed the batch training accuracy but they don't get the same results I get. They started out with around 80% batch training accuracy and observed a gradual increase until 98%. Why is this?
I was suspecting that my network is overfitting.
Here is my exact code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
import pyfftw
from scipy import signal
import xlrd
from tensorflow.python.tools import freeze_graph
from tensorflow.python.tools import optimize_for_inference_lib
import time
from datetime import timedelta
import math
import os
from sklearn.metrics import confusion_matrix
##matplotlib inline
plt.style.use('ggplot')
## define funtions
def read_data(file_path):
## column_names = ['user-id','activity','timestamp', 'x-axis', 'y-axis', 'z-axis']
column_names = ['activity','timestamp', 'Ax', 'Ay', 'Az', 'Gx', 'Gy', 'Gz', 'Mx', 'My', 'Mz'] ## 3 sensors
data = pd.read_csv(file_path,header = None, names = column_names)
return data
def feature_normalize(dataset):
mu = np.mean(dataset,axis = 0)
sigma = np.std(dataset,axis = 0)
return (dataset - mu)/sigma
def plot_axis(ax, x, y, title):
ax.plot(x, y)
ax.set_title(title)
ax.xaxis.set_visible(False)
ax.set_ylim([min(y) - np.std(y), max(y) + np.std(y)])
ax.set_xlim([min(x), max(x)])
ax.grid(True)
def plot_activity(activity,data):
fig, (ax0, ax1, ax2) = plt.subplots(nrows = 3, figsize = (15, 10), sharex = True)
plot_axis(ax0, data['timestamp'], data['Ax'], 'x-axis')
plot_axis(ax1, data['timestamp'], data['Ay'], 'y-axis')
plot_axis(ax2, data['timestamp'], data['Az'], 'z-axis')
plt.subplots_adjust(hspace=0.2)
fig.suptitle(activity)
plt.subplots_adjust(top=0.90)
plt.show()
def windows(data, size):
start = 0
while start < data.count():
yield start, start + size
start += (size / 2)
def segment_signal(data, window_size = None, num_channels=None): # edited
segments = np.empty((0,window_size,num_channels)) #change from 3 to 9 channels for AGM fusion #use variable num_channels=9
labels = np.empty((0))
for (n_start, n_end) in windows(data['timestamp'], window_size):
## x = data["x-axis"][start:end]
## y = data["y-axis"][start:end]
## z = data["z-axis"][start:end]
n_start = int(n_start)
n_end = int(n_end)
Ax = data["Ax"][n_start:n_end]
Ay = data["Ay"][n_start:n_end]
Az = data["Az"][n_start:n_end]
Gx = data["Gx"][n_start:n_end]
Gy = data["Gy"][n_start:n_end]
Gz = data["Gz"][n_start:n_end]
Mx = data["Mx"][n_start:n_end]
My = data["My"][n_start:n_end]
Mz = data["Mz"][n_start:n_end]
if(len(dataset['timestamp'][n_start:n_end]) == window_size): # include only windows with size of 90
segments = np.vstack([segments,np.dstack([Ax,Ay,Az,Gx,Gy,Gz,Mx,My,Mz])])
labels = np.append(labels,stats.mode(data["activity"][n_start:n_end])[0][0])
return segments, labels
def weight_variable(shape):
initial = tf.truncated_normal(shape, stddev = 0.1)
return tf.Variable(initial)
def bias_variable(shape):
initial = tf.constant(0.0, shape = shape)
return tf.Variable(initial)
def depthwise_conv2d(x, W):
return tf.nn.depthwise_conv2d(x,W, [1, 1, 1, 1], padding='VALID')
def apply_depthwise_conv(x,weights,biases):
return tf.nn.relu(tf.add(depthwise_conv2d(x, weights),biases))
def apply_max_pool(x,kernel_size,stride_size):
return tf.nn.max_pool(x, ksize=[1, 1, kernel_size, 1],
strides=[1, 1, stride_size, 1], padding='VALID')
#------------------------get dataset----------------------#
## run shoaib_dataset.py to generate dataset_shoaib_total.txt
## get data from dataset_shoaib_total.txt
dataset = read_data('dataset_shoaib_total.txt')
#--------------------preprocessing------------------------#
dataset['Ax'] = feature_normalize(dataset['Ax'])
dataset['Ay'] = feature_normalize(dataset['Ay'])
dataset['Az'] = feature_normalize(dataset['Az'])
dataset['Gx'] = feature_normalize(dataset['Gx'])
dataset['Gy'] = feature_normalize(dataset['Gy'])
dataset['Gz'] = feature_normalize(dataset['Gz'])
dataset['Mx'] = feature_normalize(dataset['Mx'])
dataset['My'] = feature_normalize(dataset['My'])
dataset['Mz'] = feature_normalize(dataset['Mz'])
###--------------------plot activity data----------------#
##for activity in np.unique(dataset["activity"]):
## subset = dataset[dataset["activity"] == activity][:180]
## plot_activity(activity,subset)
#------------------fixed hyperparameters--------------------#
window_size = 200 #from 90 #FIXED at 4 seconds
#----------------input hyperparameters------------------#
input_height = 1
input_width = window_size
num_labels = 6
num_channels = 9 #from 3 channels #9 channels for AGM
#-------------------sliding time window----------------#
segments, labels = segment_signal(dataset, window_size=window_size, num_channels=num_channels)
labels = np.asarray(pd.get_dummies(labels), dtype = np.int8)
reshaped_segments = segments.reshape(len(segments), (window_size*num_channels)) #use variable num_channels instead of constant 3 channels
#------------divide data into test and training set-----------#
train_test_split = np.random.rand(len(reshaped_segments)) < 0.80
train_x_init = reshaped_segments[train_test_split]
train_y_init = labels[train_test_split]
test_x = reshaped_segments[~train_test_split]
test_y = labels[~train_test_split]
train_validation_split = np.random.rand(len(train_x_init)) < 0.80
train_x = train_x_init[train_validation_split]
train_y = train_y_init[train_validation_split]
validation_x = train_x_init[~train_validation_split]
validation_y = train_y_init[~train_validation_split]
#---------------training hyperparameters----------------#
batch_size = 10
kernel_size = 60 #from 60 #optimal 2
depth = 15 #from 60 #optimal 15
num_hidden = 1000 #from 1000 #optimal 80
learning_rate = 0.0001
training_epochs = 8
total_batches = train_x.shape[0] ##// batch_size
#---------define placeholders for input----------#
X = tf.placeholder(tf.float32, shape=[None,input_width * num_channels], name="input")
X_reshaped = tf.reshape(X,[-1,input_height,input_width,num_channels])
Y = tf.placeholder(tf.float32, shape=[None,num_labels])
#---------------------perform convolution-----------------#
# first convolutional layer
c_weights = weight_variable([1, kernel_size, num_channels, depth])
c_biases = bias_variable([depth * num_channels])
c = apply_depthwise_conv(X_reshaped,c_weights,c_biases)
p = apply_max_pool(c,20,2)
# second convolutional layer
c2_weights = weight_variable([1, 6,depth*num_channels,depth//10])
c2_biases = bias_variable([(depth*num_channels)*(depth//10)])
c = apply_depthwise_conv(p,c2_weights,c2_biases)
#--------------flatten data for fully connected layers----------#
shape = c.get_shape().as_list()
c_flat = tf.reshape(c, [-1, shape[1] * shape[2] * shape[3]])
#------------fully connected layers----------------#
f_weights_l1 = weight_variable([shape[1] * shape[2] * depth * num_channels * (depth//10), num_hidden])
f_biases_l1 = bias_variable([num_hidden])
f = tf.nn.tanh(tf.add(tf.matmul(c_flat, f_weights_l1),f_biases_l1))
#----------------------dropout------------------#
keep_prob = tf.placeholder(tf.float32)
drop_layer = tf.nn.dropout(f, keep_prob)
#----------------------softmax layer----------------#
out_weights = weight_variable([num_hidden, num_labels])
out_biases = bias_variable([num_labels])
y_ = tf.nn.softmax(tf.add(tf.matmul(drop_layer, out_weights),out_biases), name="y_")
#-----------------loss optimization-------------#
loss = -tf.reduce_sum(Y * tf.log(y_))
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(loss)
#-----------------compute accuracy---------------#
correct_prediction = tf.equal(tf.argmax(y_,1), tf.argmax(Y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
cost_history = np.empty(shape=[1],dtype=float)
saver = tf.train.Saver()
session = tf.Session()
session.run(tf.global_variables_initializer())
#-------------early stopping-----------------#
# Best validation accuracy seen so far.
best_validation_accuracy = 0.0
# Iteration-number for last improvement to validation accuracy.
last_improvement = 0
# Stop optimization if no improvement found in this many iterations.
require_improvement = 1000
# Counter for total number of iterations performed so far.
total_iterations = 0
def validation_accuracy():
return session.run(accuracy, feed_dict={X: validation_x, Y: validation_y, keep_prob: 1.0})
def next_batch(b, batch_size, train_x, train_y):
##for b in range(total_batches):
offset = (b * batch_size) % (train_y.shape[0] - batch_size)
batch_x = train_x[offset:(offset + batch_size), :]
batch_y = train_y[offset:(offset + batch_size), :]
return batch_x, batch_y
def optimize(num_iterations):
# Ensure we update the global variables rather than local copies.
global total_iterations
global best_validation_accuracy
global last_improvement
# Start-time used for printing time-usage below.
start_time = time.time()
for i in range(num_iterations):
# Increase the total number of iterations performed.
# It is easier to update it in each iteration because
# we need this number several times in the following.
total_iterations += 1
# Get a batch of training examples.
# x_batch now holds a batch of images and
# y_true_batch are the true labels for those images.
##x_batch, y_true_batch = data.train.next_batch(train_batch_size)
x_batch, y_true_batch = next_batch(i, batch_size, train_x, train_y)
# Put the batch into a dict with the proper names
# for placeholder variables in the TensorFlow graph.
feed_dict_train = {X: x_batch,
Y: y_true_batch, keep_prob: 0.5}
# Run the optimizer using this batch of training data.
# TensorFlow assigns the variables in feed_dict_train
# to the placeholder variables and then runs the optimizer.
session.run(optimizer, feed_dict=feed_dict_train)
# Print status every 100 iterations and after last iteration.
if (total_iterations % 100 == 0) or (i == (num_iterations - 1)):
# Calculate the accuracy on the training-batch.
acc_train = session.run(accuracy, feed_dict={X: x_batch,
Y: y_true_batch, keep_prob: 1.0})
# Calculate the accuracy on the validation-set.
# The function returns 2 values but we only need the first.
##acc_validation, _ = validation_accuracy()
acc_validation = validation_accuracy()
# If validation accuracy is an improvement over best-known.
if acc_validation > best_validation_accuracy:
# Update the best-known validation accuracy.
best_validation_accuracy = acc_validation
# Set the iteration for the last improvement to current.
last_improvement = total_iterations
# Save all variables of the TensorFlow graph to file.
saver.save(sess=session, save_path="../shoaib-har_agm_es.ckpt")
# A string to be printed below, shows improvement found.
improved_str = '*'
else:
# An empty string to be printed below.
# Shows that no improvement was found.
improved_str = ''
# Status-message for printing.
msg = "Iter: {0:>6}, Train-Batch Accuracy: {1:>6.1%}, Validation Acc: {2:>6.1%} {3}"
# Print it.
print(msg.format(i + 1, acc_train, acc_validation, improved_str))
# If no improvement found in the required number of iterations.
if total_iterations - last_improvement > require_improvement:
print("No improvement found in a while, stopping optimization.")
# Break out from the for-loop.
break
# Ending time.
end_time = time.time()
# Difference between start and end-times.
time_dif = end_time - start_time
# Print the time-usage.
print("Time usage: " + str(timedelta(seconds=int(round(time_dif)))))
optimize(10000)
With the output:
What exactly is training accuracy? Is it even computed? Or do you compute the training accuracy on the entire training data and not just the batch you trained your network with?
Here I printed the results such that it prints out the batch training accuracy and the training accuracy on the entire dataset set for every multiples of 20 iterations.
The data is divided to 3 sets: train, validation and test.
Batch training accuracy is computed on the train set (the difference between the label and the prediction).
Validation accuracy is the accuracy on the validation set.
The batch accuracy can be computed just after a forward pass in the network. The number of samples in one forward pass is the batch size. It is just a way to train models faster (mini-batch gradient descent)
Overfitting is when the model works really good on known data (training set) but performs poorly on new data.
As to the 10% multiples, it is just the printing format you are using.