How can I avoid underfitting in a PyTorch neural network?
I am trying to predict the power consumption of a plant based on seven features. I have built two simple neural network models.
The first one is a linear model, and the second is an RNN model. However, both models perform badly on the test set; their forecast is a straight line.
About the data
There are about 360 samples in the CSV file. I take the first 300 samples for training and the rest for testing. The first 7 columns of the raw data are features of daily operation. The last column is the electricity consumption of each day.
Setup of the training set
In the linear model, the training data is the first 7 columns for a given day, and the corresponding target is the power consumption of that day.
In the RNN model, the training data is all 8 columns of three days (seven features plus power consumption), and the corresponding target is the power consumption of the next three days.
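For illustration, here is a minimal sketch (separate from the code below) of how one such window/target pair can be built; the array name daily is hypothetical and just stands for one row per day with 8 columns (7 features + consumption):
import numpy as np
# hypothetical data: 300 days, 7 features + power consumption per day
daily = np.random.rand(300, 8)
seq_len = 3
inputs, targets = [], []
for i in range(len(daily) - 2 * seq_len):
    inputs.append(daily[i:i + seq_len])                      # 3 days, all 8 columns
    targets.append(daily[i + seq_len:i + 2 * seq_len, -1])   # next 3 days' consumption
inputs = np.stack(inputs)    # shape (N, 3, 8)
targets = np.stack(targets)  # shape (N, 3)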
Code
Code of RNN model
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as f
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from matplotlib import pyplot as plt
'''
build simple RNN
'''
batchSize = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
netPath = ''
'''Data processing'''
# read raw data
filePath = 'F:/.csv'
initialData = pd.read_csv(filePath)
print(initialData.head(10))
print('hello world')
# Separate features and power consumption.
trainDatas = initialData.iloc[0:7, 1:301]
trainPowerConsum = pd.DataFrame(initialData.iloc[-1, 1:301]).T
trainDatas = pd.concat([trainDatas, trainPowerConsum], 0)
trainPowerConsum = initialData.iloc[-1, 2:302]
# Plot
powerConsumPlot = trainDatas.iloc[-1, :]
xData = np.linspace(1, powerConsumPlot.shape[0], 300)
plt.plot(xData, powerConsumPlot)
plt.show()
testDatas = initialData.iloc[0:7, 302:-1]
testPowerConsum = pd.DataFrame(initialData.iloc[-1, 302:-1]).T
testDatas = pd.concat([testDatas, testPowerConsum], 0)
testPowerConsum = initialData.iloc[-1, 303:]
# convert to dataframe
trainDatas = pd.DataFrame(trainDatas)
trainDatas = trainDatas.T
trainPowerConsum = pd.DataFrame(trainPowerConsum)
testDatas = pd.DataFrame(testDatas)
testDatas = testDatas.T
testPowerConsum = pd.DataFrame(testPowerConsum)
# change the unit of PowerConsumption
trainDatas.iloc[:, -1] = trainDatas.iloc[:, -1] * 1000
testDatas.iloc[:, -1] = testDatas.iloc[:, -1] * 1000
trainPowerConsum.iloc[:, 0] = trainPowerConsum.iloc[:, 0] * 1000
testPowerConsum.iloc[:, 0] = testPowerConsum.iloc[:, 0] * 1000
assert testPowerConsum.shape[0] == testDatas.shape[0]
assert trainDatas.shape[0] == trainPowerConsum.shape[0]
# convert dataframe to tensor
trainDatas = torch.tensor(trainDatas.values.astype(float), device=device)
trainPowerConsum = torch.tensor(trainPowerConsum.values.astype(float), device=device)
testDatas = torch.tensor(testDatas.values.astype(float), device=device)
testPowerConsum = torch.tensor(testPowerConsum.values.astype(float), device=device)
trainDatasList = list()
trainPowerConsumList = list()
for i in range(298):
trainDatasList.append(trainDatas[i:i + 3])
trainPowerConsumList.append(trainPowerConsum[i:i + 3])
from torch.nn.utils.rnn import pad_sequence
trainPowerConsum = pad_sequence(trainPowerConsumList, batch_first=True)
trainDatas = pad_sequence(trainDatasList, batch_first=True)
print(trainDatas.shape)
# ensure the batch_size of test data is 1
testDatas = torch.unsqueeze(testDatas, dim=0)
testPowerConsum = torch.unsqueeze(testPowerConsum, dim=0)
'''build dataloader'''
trainDataLoader = DataLoader(
TensorDataset(
trainDatas, trainPowerConsum
),
shuffle=True, batch_size=batchSize, drop_last=True)
print('Data is ready')
seqLen = 2
inputDim = 8
hiddenSize = 3
numLayer = 2
learningRate = 0.01
class RNNModel(torch.nn.Module):
def __init__(self, inputsize, hiddensize, batchsize, numLayer):
super(RNNModel, self).__init__()
self.batchsize = batchsize
self.inputsize = inputsize
self.hiddensize = hiddensize
self.numlayers = numLayer
self.rnn = torch.nn.RNN(input_size=self.inputsize, hidden_size=self.hiddensize, num_layers=self.numlayers,
batch_first=True)
self.l1 = torch.nn.Linear(hiddenSize, hiddensize)
self.l2 = torch.nn.Linear(hiddenSize, 1)
def forward(self, input, hidden):
out, hidden = self.rnn(input.float(), hidden.float())
batch_size, seq_len, input_dim = out.shape
out = out.reshape(-1, input_dim)
# out = f.sigmoid(self.l1(out))
out = f.relu(self.l1(out))
out = self.l2(out)
out = out.reshape(batch_size, seq_len, -1)
return out, hidden
def initHidden(self):
hidden = torch.zeros(self.numlayers, self.batchsize, self.hiddensize, device=device, dtype=torch.float64)
return hidden
net = RNNModel(inputDim, hiddenSize, batchSize, numLayer).to(device)
criterion = torch.nn.L1Loss()
optimizer = optim.Adam(net.parameters(), lr=learningRate)  # Adam takes no momentum argument
def train(epoch):
runLoss = 0.
optimizer.zero_grad()
hidden = net.initHidden()
for batchIndex, data in enumerate(trainDataLoader, 0):
inputs, target = data
optimizer.zero_grad()
outputs, hidden = net(inputs, hidden)
hidden = hidden.detach()
loss = criterion(outputs.float(), target.float())
loss = loss.mean()
loss.backward()
optimizer.step()
print(f'{epoch + 1},\t Loss={loss.item()}')
# torch.save(net.state_dict(), netPath)
def test():
testDatasVice = torch.clone(testDatas)
input = testDatasVice[:, 0, :]
input = input.view(1, 1, -1)
assert input.shape[2] == 8
predictPowConsum = list()
# the first hidden tensor in test set is zero
hidden = torch.zeros(2, 1, 3, device=device, dtype=torch.float64)
with torch.no_grad():
for i in range(testDatas.shape[1]):
output, hidden = net(input, hidden)
if i < 51:
testDatasVice[:, i + 1, -1] = output[0]
input = torch.unsqueeze(testDatasVice[:, i + 1, :], dim=0)
predictPowConsum.append(output.data.cpu().numpy().ravel()[0])
elif i == 51:
predictPowConsum.append(output.data.cpu().numpy().ravel()[0])
else:
print('\tindexError') # Exclude potential Errors
return predictPowConsum
if __name__ == '__main__':
epochNum = 300
for epoch in range(epochNum):
train(epoch)
predictPowConsum = test()
# plotting
xData = np.arange(303, 303 + testPowerConsum.size(1))
plt.plot(xData, testPowerConsum.cpu().numpy()[0, :, 0])
plt.plot(xData, predictPowConsum)
plt.show()
Code of Linear model
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as f
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from matplotlib import pyplot as plt
filePath = 'F:.csv'
initialData = pd.read_csv(filePath)
print(initialData.head(10))
print('hello world')
trainDatas = initialData.iloc[0:7, 1:300]
trainPowerConsum = initialData.iloc[-1, 1:300]
testDatas = initialData.iloc[0:7, 300:-1]
testPowerConsum = initialData.iloc[-1, 300:-1]
trainDatas = pd.DataFrame(trainDatas)
trainDatas = trainDatas.T
trainPowerConsum = pd.DataFrame(trainPowerConsum)
testDatas = pd.DataFrame(testDatas)
testDatas = testDatas.T
testPowerConsum = pd.DataFrame(testPowerConsum)
trainPowerConsum.iloc[:, 0] = trainPowerConsum.iloc[:, 0] * 1000
testPowerConsum.iloc[:, 0] = testPowerConsum.iloc[:, 0] * 1000
# build dataloader
trainData = DataLoader(
TensorDataset(
torch.tensor(trainDatas.values).float(),
torch.tensor(trainPowerConsum.values.astype(float)).float()
),
shuffle=True, batch_size=15)
testData = DataLoader(
TensorDataset(
torch.tensor(testDatas.values.astype(float)).float(),
torch.tensor(testPowerConsum.values.astype(float)).float()
),
shuffle=False, batch_size=15)
print('data is ready')
class SimpleNet(torch.nn.Module):
def __init__(self):
super(SimpleNet, self).__init__()
self.l1 = torch.nn.Linear(7, 15)
self.l2 = torch.nn.Linear(15, 30)
self.l3 = torch.nn.Linear(30, 15)
self.l4 = torch.nn.Linear(15, 5)
self.l5 = torch.nn.Linear(5, 1)
def forward(self, x):
x = f.relu(self.l1(x))
x = f.relu(self.l2(x))
x = f.relu(self.l3(x))
x = f.relu(self.l4(x))
return self.l5(x)
model = SimpleNet()
criterion = torch.nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001)
def train(epoch):
runLoss = 0.
for batch_index, data in enumerate(trainData, 0):
inputs, target = data
optimizer.zero_grad()
outputs = model(inputs)
loss = criterion(outputs, target)
loss.backward()
optimizer.step()
runLoss += loss
print(f'{epoch + 1},{batch_index + 1},\tLoss={runLoss / 5}')
runLoss = 0
def test(epoch):
totalError = 0.
print('Start to test the model')
with torch.no_grad():
for data in testData:
# test ---------data for test
# testlab ---------corresponding power consumption
test, testlab = data
outputs = model(test)
predicted = outputs.data
testError = testlab - predicted
# plotting
if epoch % 50 == 2:
xData = np.linspace(1, 15, 15)
if predicted.size(0) != 15:
pass
else:
plt.plot(xData, predicted[:, 0].numpy(), label='predicted', color='red')
plt.plot(xData, testlab[:, 0].numpy(), label='origData', color='blue')
plt.show()
totalError += (torch.abs(testError).sum().item())
print(f'Average Error on test set is {totalError / 54}')
if __name__ == '__main__':
for epoch in range(1000):
train(epoch)
test(epoch)
Image of Output
output of RNN
The blue line is the actual data, and the orange line is the output of the RNN model.
Solutions and Their Effects
I have looked around and apparently I've got the choice between these solutions:
Add new domain-specific features
Decrease the amount of regularization used
Increase the duration of training
Increase the complexity or type of the model
Decrease the learning rate
Try other activation functions
I have tried some of these solutions:
The training data isn't normalized; I only changed the unit of electricity from kWh to Wh (a standardization sketch follows this list).
I switched the activation function from Sigmoid to ReLU, but it didn't help.
I adjusted the learning rate from 0.01 to 0.001, but there was no improvement.
I tried different optimizers such as SGD and Adam (even with momentum) on both models, but it didn't get better.
The sequence length of the RNN model was 60 at first, then was set to 3. The loss dropped more rapidly in the latter case, but the forecast is still a straight line.
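Since the first item above notes that the data is not standardized, here is a minimal sketch of z-score normalization fitted on the training split only and applied before the DataFrames are converted to tensors. It reuses the trainDatas/testDatas names from the code above, but the exact column layout is an assumption:
# assumes the last column of the (transposed) frames is the consumption target
featureCols = trainDatas.columns[:-1]
mu = trainDatas[featureCols].mean()      # statistics from the training set only
sigma = trainDatas[featureCols].std()
trainDatas[featureCols] = (trainDatas[featureCols] - mu) / sigma
testDatas[featureCols] = (testDatas[featureCols] - mu) / sigma   # reuse the train statistics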
In short, none of the solutions I found has worked.
Besides, if shuffle is True when building the DataLoader, the loss jumps around violently between epochs, but when shuffle is False it drops slowly and eventually approaches a constant.
What could be the best way to avoid the problem?
Thanks in advance!
So I am training a CNN and compute the training accuracy for each batch. Most of the time it gives 100% batch training accuracy, which I thought was okay because I'm testing my model against the data I trained it with. But at some iterations the batch training accuracy drops to around 90%, and worse, sometimes it goes down to 0% very quickly and then bounces back to 100%. I used the algorithm in https://github.com/Hvass-Labs/TensorFlow-Tutorials/blob/master/04_Save_Restore.ipynb and they also compute the batch training accuracy, but they don't get the results I get: they started out with around 80% batch training accuracy and observed a gradual increase up to 98%. Why is this?
I was suspecting that my network is overfitting.
Here is my exact code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
import pyfftw
from scipy import signal
import xlrd
from tensorflow.python.tools import freeze_graph
from tensorflow.python.tools import optimize_for_inference_lib
import time
from datetime import timedelta
import math
import os
from sklearn.metrics import confusion_matrix
##matplotlib inline
plt.style.use('ggplot')
## define funtions
def read_data(file_path):
## column_names = ['user-id','activity','timestamp', 'x-axis', 'y-axis', 'z-axis']
column_names = ['activity','timestamp', 'Ax', 'Ay', 'Az', 'Gx', 'Gy', 'Gz', 'Mx', 'My', 'Mz'] ## 3 sensors
data = pd.read_csv(file_path,header = None, names = column_names)
return data
def feature_normalize(dataset):
mu = np.mean(dataset,axis = 0)
sigma = np.std(dataset,axis = 0)
return (dataset - mu)/sigma
def plot_axis(ax, x, y, title):
ax.plot(x, y)
ax.set_title(title)
ax.xaxis.set_visible(False)
ax.set_ylim([min(y) - np.std(y), max(y) + np.std(y)])
ax.set_xlim([min(x), max(x)])
ax.grid(True)
def plot_activity(activity,data):
fig, (ax0, ax1, ax2) = plt.subplots(nrows = 3, figsize = (15, 10), sharex = True)
plot_axis(ax0, data['timestamp'], data['Ax'], 'x-axis')
plot_axis(ax1, data['timestamp'], data['Ay'], 'y-axis')
plot_axis(ax2, data['timestamp'], data['Az'], 'z-axis')
plt.subplots_adjust(hspace=0.2)
fig.suptitle(activity)
plt.subplots_adjust(top=0.90)
plt.show()
def windows(data, size):
start = 0
while start < data.count():
yield start, start + size
start += (size / 2)
def segment_signal(data, window_size = None, num_channels=None): # edited
segments = np.empty((0,window_size,num_channels)) #change from 3 to 9 channels for AGM fusion #use variable num_channels=9
labels = np.empty((0))
for (n_start, n_end) in windows(data['timestamp'], window_size):
## x = data["x-axis"][start:end]
## y = data["y-axis"][start:end]
## z = data["z-axis"][start:end]
n_start = int(n_start)
n_end = int(n_end)
Ax = data["Ax"][n_start:n_end]
Ay = data["Ay"][n_start:n_end]
Az = data["Az"][n_start:n_end]
Gx = data["Gx"][n_start:n_end]
Gy = data["Gy"][n_start:n_end]
Gz = data["Gz"][n_start:n_end]
Mx = data["Mx"][n_start:n_end]
My = data["My"][n_start:n_end]
Mz = data["Mz"][n_start:n_end]
if(len(dataset['timestamp'][n_start:n_end]) == window_size): # include only windows with size of 90
segments = np.vstack([segments,np.dstack([Ax,Ay,Az,Gx,Gy,Gz,Mx,My,Mz])])
labels = np.append(labels,stats.mode(data["activity"][n_start:n_end])[0][0])
return segments, labels
def weight_variable(shape):
initial = tf.truncated_normal(shape, stddev = 0.1)
return tf.Variable(initial)
def bias_variable(shape):
initial = tf.constant(0.0, shape = shape)
return tf.Variable(initial)
def depthwise_conv2d(x, W):
return tf.nn.depthwise_conv2d(x,W, [1, 1, 1, 1], padding='VALID')
def apply_depthwise_conv(x,weights,biases):
return tf.nn.relu(tf.add(depthwise_conv2d(x, weights),biases))
def apply_max_pool(x,kernel_size,stride_size):
return tf.nn.max_pool(x, ksize=[1, 1, kernel_size, 1],
strides=[1, 1, stride_size, 1], padding='VALID')
#------------------------get dataset----------------------#
## run shoaib_dataset.py to generate dataset_shoaib_total.txt
## get data from dataset_shoaib_total.txt
dataset = read_data('dataset_shoaib_total.txt')
#--------------------preprocessing------------------------#
dataset['Ax'] = feature_normalize(dataset['Ax'])
dataset['Ay'] = feature_normalize(dataset['Ay'])
dataset['Az'] = feature_normalize(dataset['Az'])
dataset['Gx'] = feature_normalize(dataset['Gx'])
dataset['Gy'] = feature_normalize(dataset['Gy'])
dataset['Gz'] = feature_normalize(dataset['Gz'])
dataset['Mx'] = feature_normalize(dataset['Mx'])
dataset['My'] = feature_normalize(dataset['My'])
dataset['Mz'] = feature_normalize(dataset['Mz'])
###--------------------plot activity data----------------#
##for activity in np.unique(dataset["activity"]):
## subset = dataset[dataset["activity"] == activity][:180]
## plot_activity(activity,subset)
#------------------fixed hyperparameters--------------------#
window_size = 200 #from 90 #FIXED at 4 seconds
#----------------input hyperparameters------------------#
input_height = 1
input_width = window_size
num_labels = 6
num_channels = 9 #from 3 channels #9 channels for AGM
#-------------------sliding time window----------------#
segments, labels = segment_signal(dataset, window_size=window_size, num_channels=num_channels)
labels = np.asarray(pd.get_dummies(labels), dtype = np.int8)
reshaped_segments = segments.reshape(len(segments), (window_size*num_channels)) #use variable num_channels instead of constant 3 channels
#------------divide data into test and training set-----------#
train_test_split = np.random.rand(len(reshaped_segments)) < 0.80
train_x_init = reshaped_segments[train_test_split]
train_y_init = labels[train_test_split]
test_x = reshaped_segments[~train_test_split]
test_y = labels[~train_test_split]
train_validation_split = np.random.rand(len(train_x_init)) < 0.80
train_x = train_x_init[train_validation_split]
train_y = train_y_init[train_validation_split]
validation_x = train_x_init[~train_validation_split]
validation_y = train_y_init[~train_validation_split]
#---------------training hyperparameters----------------#
batch_size = 10
kernel_size = 60 #from 60 #optimal 2
depth = 15 #from 60 #optimal 15
num_hidden = 1000 #from 1000 #optimal 80
learning_rate = 0.0001
training_epochs = 8
total_batches = train_x.shape[0] ##// batch_size
#---------define placeholders for input----------#
X = tf.placeholder(tf.float32, shape=[None,input_width * num_channels], name="input")
X_reshaped = tf.reshape(X,[-1,input_height,input_width,num_channels])
Y = tf.placeholder(tf.float32, shape=[None,num_labels])
#---------------------perform convolution-----------------#
# first convolutional layer
c_weights = weight_variable([1, kernel_size, num_channels, depth])
c_biases = bias_variable([depth * num_channels])
c = apply_depthwise_conv(X_reshaped,c_weights,c_biases)
p = apply_max_pool(c,20,2)
# second convolutional layer
c2_weights = weight_variable([1, 6,depth*num_channels,depth//10])
c2_biases = bias_variable([(depth*num_channels)*(depth//10)])
c = apply_depthwise_conv(p,c2_weights,c2_biases)
#--------------flatten data for fully connected layers----------#
shape = c.get_shape().as_list()
c_flat = tf.reshape(c, [-1, shape[1] * shape[2] * shape[3]])
#------------fully connected layers----------------#
f_weights_l1 = weight_variable([shape[1] * shape[2] * depth * num_channels * (depth//10), num_hidden])
f_biases_l1 = bias_variable([num_hidden])
f = tf.nn.tanh(tf.add(tf.matmul(c_flat, f_weights_l1),f_biases_l1))
#----------------------dropout------------------#
keep_prob = tf.placeholder(tf.float32)
drop_layer = tf.nn.dropout(f, keep_prob)
#----------------------softmax layer----------------#
out_weights = weight_variable([num_hidden, num_labels])
out_biases = bias_variable([num_labels])
y_ = tf.nn.softmax(tf.add(tf.matmul(drop_layer, out_weights),out_biases), name="y_")
#-----------------loss optimization-------------#
loss = -tf.reduce_sum(Y * tf.log(y_))
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(loss)
#-----------------compute accuracy---------------#
correct_prediction = tf.equal(tf.argmax(y_,1), tf.argmax(Y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
cost_history = np.empty(shape=[1],dtype=float)
saver = tf.train.Saver()
session = tf.Session()
session.run(tf.global_variables_initializer())
#-------------early stopping-----------------#
# Best validation accuracy seen so far.
best_validation_accuracy = 0.0
# Iteration-number for last improvement to validation accuracy.
last_improvement = 0
# Stop optimization if no improvement found in this many iterations.
require_improvement = 1000
# Counter for total number of iterations performed so far.
total_iterations = 0
def validation_accuracy():
return session.run(accuracy, feed_dict={X: validation_x, Y: validation_y, keep_prob: 1.0})
def next_batch(b, batch_size, train_x, train_y):
##for b in range(total_batches):
offset = (b * batch_size) % (train_y.shape[0] - batch_size)
batch_x = train_x[offset:(offset + batch_size), :]
batch_y = train_y[offset:(offset + batch_size), :]
return batch_x, batch_y
def optimize(num_iterations):
# Ensure we update the global variables rather than local copies.
global total_iterations
global best_validation_accuracy
global last_improvement
# Start-time used for printing time-usage below.
start_time = time.time()
for i in range(num_iterations):
# Increase the total number of iterations performed.
# It is easier to update it in each iteration because
# we need this number several times in the following.
total_iterations += 1
# Get a batch of training examples.
# x_batch now holds a batch of images and
# y_true_batch are the true labels for those images.
##x_batch, y_true_batch = data.train.next_batch(train_batch_size)
x_batch, y_true_batch = next_batch(i, batch_size, train_x, train_y)
# Put the batch into a dict with the proper names
# for placeholder variables in the TensorFlow graph.
feed_dict_train = {X: x_batch,
Y: y_true_batch, keep_prob: 0.5}
# Run the optimizer using this batch of training data.
# TensorFlow assigns the variables in feed_dict_train
# to the placeholder variables and then runs the optimizer.
session.run(optimizer, feed_dict=feed_dict_train)
# Print status every 100 iterations and after last iteration.
if (total_iterations % 100 == 0) or (i == (num_iterations - 1)):
# Calculate the accuracy on the training-batch.
acc_train = session.run(accuracy, feed_dict={X: x_batch,
Y: y_true_batch, keep_prob: 1.0})
# Calculate the accuracy on the validation-set.
# The function returns 2 values but we only need the first.
##acc_validation, _ = validation_accuracy()
acc_validation = validation_accuracy()
# If validation accuracy is an improvement over best-known.
if acc_validation > best_validation_accuracy:
# Update the best-known validation accuracy.
best_validation_accuracy = acc_validation
# Set the iteration for the last improvement to current.
last_improvement = total_iterations
# Save all variables of the TensorFlow graph to file.
saver.save(sess=session, save_path="../shoaib-har_agm_es.ckpt")
# A string to be printed below, shows improvement found.
improved_str = '*'
else:
# An empty string to be printed below.
# Shows that no improvement was found.
improved_str = ''
# Status-message for printing.
msg = "Iter: {0:>6}, Train-Batch Accuracy: {1:>6.1%}, Validation Acc: {2:>6.1%} {3}"
# Print it.
print(msg.format(i + 1, acc_train, acc_validation, improved_str))
# If no improvement found in the required number of iterations.
if total_iterations - last_improvement > require_improvement:
print("No improvement found in a while, stopping optimization.")
# Break out from the for-loop.
break
# Ending time.
end_time = time.time()
# Difference between start and end-times.
time_dif = end_time - start_time
# Print the time-usage.
print("Time usage: " + str(timedelta(seconds=int(round(time_dif)))))
optimize(10000)
With the output:
What exactly is training accuracy? Is it even computed? Or do you compute the training accuracy on the entire training data and not just the batch you trained your network with?
Here I printed the results so that they show the batch training accuracy and the training accuracy on the entire training set at every multiple of 20 iterations.
The data is divided into 3 sets: train, validation and test.
Batch training accuracy is computed on the current batch drawn from the train set (by comparing the labels with the predictions).
Validation accuracy is the accuracy on the validation set.
The batch accuracy can be computed right after a forward pass through the network. The number of samples in one forward pass is the batch size; it is just a way to train models faster (mini-batch gradient descent).
Overfitting is when the model works really well on known data (the training set) but performs poorly on new data.
As for the multiples of 10%, that is just the printing format you are using.
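To make the distinction concrete, here is a hedged sketch, reusing the placeholder and array names from the question's code, of how the accuracy on the current batch and the accuracy over the whole training set could both be evaluated:
# accuracy on the current mini-batch (what the training loop already prints)
acc_batch = session.run(accuracy, feed_dict={X: x_batch, Y: y_true_batch, keep_prob: 1.0})
# accuracy over the entire training set, usually a much smoother estimate
acc_full_train = session.run(accuracy, feed_dict={X: train_x, Y: train_y, keep_prob: 1.0})
print("batch acc: {:.1%}, full-train acc: {:.1%}".format(acc_batch, acc_full_train))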
I have a data set which contains a list of stock prices. I need to use TensorFlow and Python to predict the close price.
Q1: I have the following code, which takes the first 2000 records for training and records 2001 to 20000 for testing, but I don't know how to change the code to predict the close price of today and one day later. Please advise!
#!/usr/bin/env python2
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
def feature_scaling(input_pd, scaling_meathod):
if scaling_meathod == 'z-score':
scaled_pd = (input_pd - input_pd.mean()) / input_pd.std()
elif scaling_meathod == 'min-max':
scaled_pd = (input_pd - input_pd.min()) / (input_pd.max() -
input_pd.min())
return scaled_pd
def input_reshape(input_pd, start, end, batch_size, batch_shift, n_features):
temp_pd = input_pd[start-1: end+batch_size-1]
output_pd = map(lambda y : temp_pd[y:y+batch_size], xrange(0, end-start+1, batch_shift))
output_temp = map(lambda x : np.array(output_pd[x]).reshape([-1]), xrange(len(output_pd)))
output = np.reshape(output_temp, [-1, batch_size, n_features])
return output
def target_reshape(input_pd, start, end, batch_size, batch_shift, n_step_ahead, m_steps_pred):
temp_pd = input_pd[start+batch_size+n_step_ahead-2: end+batch_size+n_step_ahead+m_steps_pred-2]
print temp_pd
output_pd = map(lambda y : temp_pd[y:y+m_steps_pred], xrange(0, end-start+1, batch_shift))
output_temp = map(lambda x : np.array(output_pd[x]).reshape([-1]), xrange(len(output_pd)))
output = np.reshape(output_temp, [-1,1])
return output
def lstm(input, n_inputs, n_steps, n_of_layers, scope_name):
num_layers = n_of_layers
input = tf.transpose(input,[1, 0, 2])
input = tf.reshape(input,[-1, n_inputs])
input = tf.split(0, n_steps, input)
with tf.variable_scope(scope_name):
cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=n_inputs)
cell = tf.nn.rnn_cell.MultiRNNCell([cell]*num_layers)
output, state = tf.nn.rnn(cell, input, dtype=tf.float32)
output = output[-1]
return output
feature_to_input = ['open price', 'highest price', 'lowest price', 'close price','turnover', 'volume','mean price']
feature_to_predict = ['close price']
feature_to_scale = ['volume']
sacling_meathod = 'min-max'
train_start = 1
train_end = 1000
test_start = 1001
test_end = 20000
batch_size = 100
batch_shift = 1
n_step_ahead = 1
m_steps_pred = 1
n_features = len(feature_to_input)
lstm_scope_name = 'lstm_prediction'
n_lstm_layers = 1
n_pred_class = 1
learning_rate = 0.1
EPOCHS = 1000
PRINT_STEP = 100
input_pd = pd.read_csv('./stock_price.csv')  # read into input_pd, which the code below uses
temp_pd = feature_scaling(input_pd[feature_to_scale], sacling_meathod)
input_pd[feature_to_scale] = temp_pd
train_input_temp_pd = input_pd[feature_to_input]
train_input_nparr = input_reshape(train_input_temp_pd,
train_start, train_end, batch_size, batch_shift, n_features)
train_target_temp_pd = input_pd[feature_to_predict]
train_target_nparr = target_reshape(train_target_temp_pd, train_start, train_end, batch_size, batch_shift, n_step_ahead, m_steps_pred)
test_input_temp_pd = input_pd[feature_to_input]
test_input_nparr = input_reshape(test_input_temp_pd, test_start, test_end, batch_size, batch_shift, n_features)
test_target_temp_pd = input_pd[feature_to_predict]
test_target_nparr = target_reshape(test_target_temp_pd, test_start, test_end, batch_size, batch_shift, n_step_ahead, m_steps_pred)
tf.reset_default_graph()
x_ = tf.placeholder(tf.float32, [None, batch_size, n_features])
y_ = tf.placeholder(tf.float32, [None, 1])
lstm_output = lstm(x_, n_features, batch_size, n_lstm_layers, lstm_scope_name)
W = tf.Variable(tf.random_normal([n_features, n_pred_class]))
b = tf.Variable(tf.random_normal([n_pred_class]))
y = tf.matmul(lstm_output, W) + b
cost_func = tf.reduce_mean(tf.square(y - y_))
train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_func)
init = tf.initialize_all_variables()
with tf.Session() as sess:
sess.run(init)
for ii in range(EPOCHS):
sess.run(train_op, feed_dict={x_:train_input_nparr, y_:train_target_nparr})
if ii % PRINT_STEP == 0:
cost = sess.run(cost_func, feed_dict={x_:train_input_nparr, y_:train_target_nparr})
print 'iteration =', ii, 'training cost:', cost
Very simply, prediction (a.k.a. scoring or inference) comes from running the input through only the forward pass and collecting the score for each input vector. It's the same process flow as testing. The difference lies in the four stages of model use:
Train: learn from the training data set; adjust weights as needed.
Test: evaluate the model's performance; if accuracy has converged, stop training.
Validate: evaluate the accuracy of the trained model. If it doesn't meet acceptance criteria, change something and start over with the training.
Predict: you've passed validation -- release the model for use by the intended application.
All four steps follow the same forward logic flow; training also includes back-propagation, while the others do not. Simply follow the forward-only process, and you'll get the result in the form you need.
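Concretely, a forward-only prediction pass with the graph from your code just runs the output node without the optimizer; a minimal sketch reusing your tensor names (the input array here is only a placeholder for whatever new window you want to score):
# forward pass only: no optimizer, no back-propagation
new_input = test_input_nparr   # or any array shaped [None, batch_size, n_features]
predicted_close = sess.run(y, feed_dict={x_: new_input})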
I worry about your data partition: only 10% for training, 90% for testing, and none for validation. A more typical split is 50-30-20, or something in that general area.
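As a rough sketch of such a partition (the exact ratio is a judgment call, and a contiguous split keeps the time ordering of the price series), the indices could be derived like this; input_pd and the approximate record count come from the question's code:
n_records = 20000                      # approximate length of the price series
train_end = int(0.5 * n_records)       # first 50% for training
valid_end = int(0.8 * n_records)       # next 30% for validation, last 20% for testing
train_pd = input_pd.iloc[:train_end]
valid_pd = input_pd.iloc[train_end:valid_end]
test_pd = input_pd.iloc[valid_end:]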
Q-1: You should change your LSTM parameters to return a sequence of size two, which will be the prediction for that day and the day after (see the sketch below).
Q-2: It is clear that your model is underfitting the data, which is hardly surprising with a 10% train / 90% test split! You should use a more balanced ratio, as suggested in the previous answer.
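One possible way to realize Q-1 with the helper functions from the question is to predict two steps at once, e.g. by setting m_steps_pred to 2 and widening the output layer accordingly. This is only a sketch under that assumption, not a tested change; note that target_reshape would then also need to reshape its output to [-1, m_steps_pred] instead of [-1, 1]:
m_steps_pred = 2                      # predict today's close and the next day's close
n_pred_class = 2                      # output layer now produces two values per window
y_ = tf.placeholder(tf.float32, [None, m_steps_pred])
W = tf.Variable(tf.random_normal([n_features, n_pred_class]))
b = tf.Variable(tf.random_normal([n_pred_class]))
y = tf.matmul(lstm_output, W) + b     # shape (batch, 2): [today, one day later]
cost_func = tf.reduce_mean(tf.square(y - y_))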
I'm having a difficult time visualizing what this TensorFlow class creates. I want to implement an LSTM RNN that handles 3D data.
class Grid3LSTMCell(GridRNNCell):
"""3D BasicLSTM cell
This creates a 2D cell which receives input and gives output in the first dimension.
The first dimension can optionally be non-recurrent if `non_recurrent_fn` is specified.
The second and third dimensions are LSTM.
"""
def __init__(self, num_units, tied=False, non_recurrent_fn=None,
use_peepholes=False, forget_bias=1.0):
super(Grid3LSTMCell, self).__init__(num_units=num_units, num_dims=3,
input_dims=0, output_dims=0, priority_dims=0, tied=tied,
non_recurrent_dims=None if non_recurrent_fn is None else 0,
cell_fn=lambda n, i: rnn_cell.LSTMCell(
num_units=n, input_size=i, forget_bias=forget_bias,
use_peepholes=use_peepholes),
non_recurrent_fn=non_recurrent_fn)
The class is found in `from tensorflow.contrib.grid_rnn.python.ops import grid_rnn_cell`.
This is difficult to explain, so I've provided a drawing. Here is what I want it to do...
However, the comment sounds like it isn't doing this. The comment makes it sound like the RNN is still a flat RNN, where the first dimension outputs to what is commonly called the outputs variable (see below), the second dimension outputs to the next step in the RNN, and the third dimension outputs to the next hidden layer.
outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)
If this is the case, what is the point of having the first and second dimensions? Aren't they essentially the same thing? The BasicLSTMCell sends its output both to the next step and into outputs -- in other words, they are one and the same.
Clarity?
For reference, here is my example code...
import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
from tensorflow.contrib.grid_rnn.python.ops import grid_rnn_cell
import numpy as np
#define parameters
learning_rate = 0.01
batch_size = 2
n_input_x = 10
n_input_y = 10
n_input_z = 10
n_hidden = 128
n_classes = 2
n_output = n_input_x * n_classes
x = tf.placeholder("float", [n_input_x, n_input_y, n_input_z])
y = tf.placeholder("float", [n_input_x, n_input_y, n_input_z, n_classes])
weights = {}
biases = {}
for i in xrange(n_input_y * n_input_z):
weights[i] = tf.Variable(tf.random_normal([n_hidden, n_output]))
biases[i] = tf.Variable(tf.random_normal([n_output]))
#generate random data
input_data = np.random.rand(n_input_x, n_input_y, n_input_z)
ground_truth = np.random.rand(n_input_x, n_input_y, n_input_z, n_classes)
#build GridLSTM
def GridLSTM_network(x):
x = tf.reshape(x, [-1,n_input_x])
x = tf.split(0, n_input_y * n_input_z, x)
lstm_cell = grid_rnn_cell.Grid3LSTMCell(n_hidden)
outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)
output = []
for i in xrange(n_input_y * n_input_z):
output.append(tf.matmul(outputs[i], weights[i]) + biases[i])
return output
#initialize network, cost, optimizer and all variables
pred = GridLSTM_network(x)
# import pdb
# pdb.set_trace()
pred = tf.pack(pred)
pred = tf.transpose(pred,[1,0,2])
pred= tf.reshape(pred, [-1, n_input_x, n_input_y, n_input_z, n_classes])
temp_pred = tf.reshape(pred, [-1,n_classes])
temp_y = tf.reshape(y,[-1, n_classes])
cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(temp_pred, temp_y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Evaluate model
correct_pred = tf.equal(0,tf.cast(tf.sub(tf.nn.sigmoid(temp_pred),temp_y), tf.int32))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# Initializing the variables
init = tf.initialize_all_variables()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
step = 0
while 1:
print step
step = step + 1
# pdb.set_trace
sess.run(optimizer, feed_dict={x: input_data, y: ground_truth})
I was reading the original paper on BN and the Stack Overflow question How could I use Batch Normalization in TensorFlow?, which provides a very useful piece of code for inserting a batch normalization block into a neural network but does not provide enough guidance on how to actually use it during training, inference, and when evaluating models.
For example, I would like to track the train error during training and the test error to make sure I don't overfit. It's clear that the batch normalization block should be off during testing, but when evaluating the error on the training set, should the batch normalization block be turned off too? My main questions are:
During inference and error evaluation, should the batch normalization block be turned off regardless of the data set?
Does that mean that the batch normalization block should only be on during the training step?
To make it very clear, I will provide an extract of the (simplified) code I have been using to run batch normalization with TensorFlow, according to my understanding of the right thing to do:
## TRAIN
if phase_train is not None:
#DO BN
feed_dict_train = {x:X_train, y_:Y_train, phase_train: False}
feed_dict_cv = {x:X_cv, y_:Y_cv, phase_train: False}
feed_dict_test = {x:X_test, y_:Y_test, phase_train: False}
else:
#Don't do BN
feed_dict_train = {x:X_train, y_:Y_train}
feed_dict_cv = {x:X_cv, y_:Y_cv}
feed_dict_test = {x:X_test, y_:Y_test}
def get_batch_feed(X, Y, M, phase_train):
mini_batch_indices = np.random.randint(M,size=M)
Xminibatch = X[mini_batch_indices,:] # ( M x D^(0) )
Yminibatch = Y[mini_batch_indices,:] # ( M x D^(L) )
if phase_train is not None:
#DO BN
feed_dict = {x: Xminibatch, y_: Yminibatch, phase_train: True}
else:
#Don't do BN
feed_dict = {x: Xminibatch, y_: Yminibatch}
return feed_dict
with tf.Session() as sess:
sess.run( tf.initialize_all_variables() )
for iter_step in xrange(steps):
feed_dict_batch = get_batch_feed(X_train, Y_train, M, phase_train)
# Collect model statistics
if iter_step%report_error_freq == 0:
train_error = sess.run(fetches=l2_loss, feed_dict=feed_dict_train)
cv_error = sess.run(fetches=l2_loss, feed_dict=feed_dict_cv)
test_error = sess.run(fetches=l2_loss, feed_dict=feed_dict_test)
do_stuff_with_errors(train_error, cv_error, test_error)
# Run Train Step
sess.run(fetches=train_step, feed_dict=feed_dict_batch)
and the code I am using to produce batch normalization blocks is:
def standard_batch_norm(l, x, n_out, phase_train, scope='BN'):
"""
Batch normalization on feedforward maps.
Args:
x: Vector
n_out: integer, depth of input maps
phase_train: boolean tf.Variable, true indicates training phase
scope: string, variable scope
Return:
normed: batch-normalized maps
"""
with tf.variable_scope(scope+l):
#beta = tf.Variable(tf.constant(0.0, shape=[n_out], dtype=tf.float64 ), name='beta', trainable=True, dtype=tf.float64 )
#gamma = tf.Variable(tf.constant(1.0, shape=[n_out],dtype=tf.float64 ), name='gamma', trainable=True, dtype=tf.float64 )
init_beta = tf.constant(0.0, shape=[n_out], dtype=tf.float64)
init_gamma = tf.constant(1.0, shape=[n_out],dtype=tf.float64)
beta = tf.get_variable(name='beta'+l, dtype=tf.float64, initializer=init_beta, regularizer=None, trainable=True)
gamma = tf.get_variable(name='gamma'+l, dtype=tf.float64, initializer=init_gamma, regularizer=None, trainable=True)
batch_mean, batch_var = tf.nn.moments(x, [0], name='moments')
ema = tf.train.ExponentialMovingAverage(decay=0.5)
def mean_var_with_update():
ema_apply_op = ema.apply([batch_mean, batch_var])
with tf.control_dependencies([ema_apply_op]):
return tf.identity(batch_mean), tf.identity(batch_var)
mean, var = tf.cond(phase_train, mean_var_with_update, lambda: (ema.average(batch_mean), ema.average(batch_var)))
normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-3)
return normed
I found that there is an 'official' batch_norm layer in TensorFlow. Try it out:
https://github.com/tensorflow/tensorflow/blob/b826b79718e3e93148c3545e7aa3f90891744cc0/tensorflow/contrib/layers/python/layers/layers.py#L100
Most likely it is not mentioned in the docs since it is only included in some RC or 'beta' version.
I haven't looked into this matter deeply yet, but as far as I can see from the documentation, you just use the boolean parameter is_training of this batch_norm layer and set it to True only for the training phase (a short sketch follows). Try it out.
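Here is a minimal sketch of how that contrib layer could be wired up with an is_training placeholder; the hidden_layer tensor is only a placeholder name, and the exact keyword arguments should be checked against your TensorFlow version:
is_training = tf.placeholder(tf.bool, name='is_training')
h_bn = tf.contrib.layers.batch_norm(hidden_layer,
                                    is_training=is_training,
                                    center=True, scale=True,
                                    updates_collections=None)  # update moving stats in place
# feed is_training=True for training steps and False for evaluation/inference, e.g.
# sess.run(train_step, feed_dict={..., is_training: True})
# sess.run(loss,       feed_dict={..., is_training: False})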
UPDATE: Below is the code to load the data, build a network with one hidden ReLU layer and L2 regularization, and introduce batch normalization for both the hidden and output layers. This runs fine and trains fine.
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
pickle_file = '/home/maxkhk/Documents/Udacity/DeepLearningCourse/SourceCode/tensorflow/examples/udacity/notMNIST.pickle'
with open(pickle_file, 'rb') as f:
save = pickle.load(f)
train_dataset = save['train_dataset']
train_labels = save['train_labels']
valid_dataset = save['valid_dataset']
valid_labels = save['valid_labels']
test_dataset = save['test_dataset']
test_labels = save['test_labels']
del save # hint to help gc free up memory
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
image_size = 28
num_labels = 10
def reformat(dataset, labels):
dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
# Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
/ predictions.shape[0])
#for NeuralNetwork model code is below
#We will use SGD for training to save our time. Code is from Assignment 2
#beta is the new parameter - controls level of regularization.
#Feel free to play with it - the best one I found is 0.001
#notice, we introduce L2 for both biases and weights of all layers
batch_size = 128
beta = 0.001
#building tensorflow graph
graph = tf.Graph()
with graph.as_default():
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32,
shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
#introduce batchnorm
tf_train_dataset_bn = tf.contrib.layers.batch_norm(tf_train_dataset)
#now let's build our new hidden layer
#that's how many hidden neurons we want
num_hidden_neurons = 1024
#its weights
hidden_weights = tf.Variable(
tf.truncated_normal([image_size * image_size, num_hidden_neurons]))
hidden_biases = tf.Variable(tf.zeros([num_hidden_neurons]))
#now the layer itself. It multiplies data by weights, adds biases
#and takes ReLU over result
hidden_layer = tf.nn.relu(tf.matmul(tf_train_dataset_bn, hidden_weights) + hidden_biases)
#adding the batch normalization layerhi()
hidden_layer_bn = tf.contrib.layers.batch_norm(hidden_layer)
#time to go for output linear layer
#out weights connect hidden neurons to output labels
#biases are added to output labels
out_weights = tf.Variable(
tf.truncated_normal([num_hidden_neurons, num_labels]))
out_biases = tf.Variable(tf.zeros([num_labels]))
#compute output
out_layer = tf.matmul(hidden_layer_bn,out_weights) + out_biases
#our real output is a softmax of prior result
#and we also compute its cross-entropy to get our loss
#Notice - we introduce our L2 here
loss = (tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
out_layer, tf_train_labels) +
beta*tf.nn.l2_loss(hidden_weights) +
beta*tf.nn.l2_loss(hidden_biases) +
beta*tf.nn.l2_loss(out_weights) +
beta*tf.nn.l2_loss(out_biases)))
#now we just minimize this loss to actually train the network
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
#nice, now let's calculate the predictions on each dataset for evaluating the
#performance so far
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(out_layer)
valid_relu = tf.nn.relu( tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)
valid_prediction = tf.nn.softmax( tf.matmul(valid_relu, out_weights) + out_biases)
test_relu = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights) + hidden_biases)
test_prediction = tf.nn.softmax(tf.matmul(test_relu, out_weights) + out_biases)
#now is the actual training on the ANN we built
#we will run it for some number of steps and evaluate the progress after
#every 500 steps
#number of steps we will train our ANN
num_steps = 3001
#actual training
with tf.Session(graph=graph) as session:
tf.initialize_all_variables().run()
print("Initialized")
for step in range(num_steps):
# Pick an offset within the training data, which has been randomized.
# Note: we could use better randomization across epochs.
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
# Generate a minibatch.
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
# Prepare a dictionary telling the session where to feed the minibatch.
# The key of the dictionary is the placeholder node of the graph to be fed,
# and the value is the numpy array to feed to it.
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))