LSTM history length vs prediction error - machine-learning

I use LSTM to predict next step voltage value in voltage time series signal. I have a question:
Why using longer sequences (5 or 10 steps back in time) to train LSTM does not improve prediction and reduce prediction error ? (it actually degrades it - see the figures e.g. results for sequence_length=5 is better than sequence_length=10)
testplot('epochs: 10', 'ratio: 1', 'sequence_length: 10', 'mean error: ', '0.00116802704509')
testplot('epochs: 10', 'ratio: 1', 'sequence_length: 5', 'mean error: ', '0.000495359163296'
(predicted signal in green, real in red)
import os
import matplotlib.pyplot as plt
import numpy as np
import time
import csv
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
np.random.seed(1234)
def data_power_consumption(path_to_dataset,
sequence_length=50,
ratio=1.0):
max_values = ratio * 2049280
with open(path_to_dataset) as f:
data = csv.reader(f, delimiter=",")
power = []
nb_of_values = 0
for line in data:
try:
power.append(float(line[4]))
nb_of_values += 1
except ValueError:
pass
# 2049280.0 is the total number of valid values, i.e. ratio = 1.0
if nb_of_values >= max_values:
print "max value", nb_of_values
break
print "Data loaded from csv. Formatting..."
result = []
for index in range(len(power) - sequence_length):
result.append(power[index: index + sequence_length])
result = np.array(result) # shape (2049230, 50)
result_mean = result.mean()
result -= result_mean
print "Shift : ", result_mean
print "Data : ", result.shape
row = round(0.9 * result.shape[0])
train = result[:row, :]
np.random.shuffle(train)
X_train = train[:, :-1]
y_train = train[:, -1]
X_test = result[row:, :-1]
y_test = result[row:, -1]
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
return [X_train, y_train, X_test, y_test]
def build_model():
model = Sequential()
layers = [1, 50, 100, 1]
model.add(LSTM(
input_dim=layers[0],
output_dim=layers[1],
return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(
layers[2],
return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(
output_dim=layers[3]))
model.add(Activation("linear"))
start = time.time()
model.compile(loss="mse", optimizer="adam") # consider adam
print "Compilation Time : ", time.time() - start
return model
def run_network(model=None, data=None):
global_start_time = time.time()
epochs = 10
ratio = 1
sequence_length = 3
path_to_dataset = 'TIMBER_DATA_1.csv'
if data is None:
print 'Loading data... '
X_train, y_train, X_test, y_test = data_power_consumption(
path_to_dataset, sequence_length, ratio)
else:
X_train, y_train, X_test, y_test = data
print '\nData Loaded. Compiling...\n'
if model is None:
model = build_model()
try:
model.fit(
X_train, y_train,
batch_size=512, nb_epoch=epochs, validation_split=0.05)
predicted = model.predict(X_test)
predicted = np.reshape(predicted, (predicted.size,))
print "done"
except KeyboardInterrupt:
print 'Training duration (s) : ', time.time() - global_start_time
return model, y_test, 0
try:
fig, ax = plt.subplots()
txt = "epochs: " + str(epochs), "ratio: " + str(ratio), "sequence_length: " + str(sequence_length)
# calculate error (shift predicted by "sequence_length - 1 and apply mean with abs)
y_test_mean = y_test - np.mean(y_test)
y_test_mean_shifted = y_test_mean[:-1*(sequence_length - 1)]
predicted_mean = predicted - np.mean(predicted)
predicted_mean_shifted = predicted_mean[(sequence_length - 1):]
prediction_error = np.mean(abs(y_test_mean_shifted - predicted_mean_shifted))
text_mean = "mean error: ", str(prediction_error)
txt = txt + text_mean
# Now add the legend with some customizations.
legend = ax.legend(loc='upper center', shadow=True)
ax.plot(y_test_mean_shifted[900:1000], 'r--', label='Real data')
ax.plot(predicted_mean_shifted[900:1000], 'g:', label='Predicted')
fig.text(0.4, 0.2, txt, horizontalalignment='center', verticalalignment='center', transform = ax.transAxes)
plt.savefig(os.path.join('cern_figures', 'testplot' + str(txt) + '.png'))
plt.show()
except Exception as e:
print str(e)
print 'Training duration (s) : ', time.time() - global_start_time
return model, y_test, predicted
# main
if __name__ == "__main__":
_, y_test_out, predicted_out = run_network()
#y_test_out_mean = y_test_out - np.mean(y_test_out)
#predicted_out_mean = predicted_out - np.mean(predicted_out)

maybe because your time series at time t does not depend on your time series at time t-10. If you have a time series (x1,...,xn) and there is no link between xn and xn-p, there is no reason to use a step back of p.
For example if you want to predict the weather one hour ahead, you will not use a step back of 2 weeks. Why ? Because the weather of 2 weeks in the past has no influence on the weather right now. You will use instead the weather of the last hour (or last day).
Ps : I use this example of weather forcasting because there is to me no link between weather two weeks in the past and now. But maybe an expert in weather forcast would prove me wrong !
Cheers !

Related

How to fix this loss is NaN problem in PyTorch of this RNN with GRU?

I'm completely new to PyTorch and tried out some models. I wanted to make an easy prediction rnn of stock market prices and found the following code:
I load the data set with pandas then split it into training and test data and load it into a pytorch DataLoader for later usage in training process. The model is defined in the GRU class. But the actual problem seems to be the optimisation. I think the problem could be gradient explosion. I thought about adding gradient clipping but the GRU design should actually prevent gradient explosion or am I wrong? What could cause the loss to be instantly NaN (already in the first epoch)
from sklearn.preprocessing import MinMaxScaler
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
batch_size = 200
input_dim = 1
hidden_dim = 32
num_layers = 2
output_dim = 1
num_epochs = 10
nvda = pd.read_csv('dataset/stocks/NVDA.csv')
price = nvda[['Close']]
scaler = MinMaxScaler(feature_range=(-1, 1))
price['Close'] = scaler.fit_transform(price['Close'].values.reshape(-1, 1))
def split_data(stock, lookback):
data_raw = stock.to_numpy() # convert to numpy array
data = []
# create all possible sequences of length seq_len
for index in range(len(data_raw) - lookback):
data.append(data_raw[index: index + lookback])
data = np.array(data)
test_set_size = int(np.round(0.2 * data.shape[0]))
train_set_size = data.shape[0] - (test_set_size)
x_train = data[:train_set_size, :-1, :]
y_train = data[:train_set_size, -1, :]
x_test = data[train_set_size:, :-1]
y_test = data[train_set_size:, -1, :]
return [x_train, y_train, x_test, y_test]
lookback = 20 # choose sequence length
x_train, y_train, x_test, y_test = split_data(price, lookback)
train_data = TensorDataset(torch.from_numpy(x_train).float(), torch.from_numpy(y_train).float())
train_data = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
test_data = TensorDataset(torch.from_numpy(x_test).float(), torch.from_numpy(y_test).float())
test_data = DataLoader(test_data, shuffle=True, batch_size=batch_size, drop_last=True)
class GRU(nn.Module):
def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
super(GRU, self).__init__()
self.hidden_dim = hidden_dim
self.num_layers = num_layers
self.gru = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True, dropout=0.2)
self.fc = nn.Linear(hidden_dim, output_dim)
self.relu = nn.ReLU()
def forward(self, x, h):
out, h = self.gru(x, h)
out = self.fc(self.relu(out[:, -1]))
return out, h
def init_hidden(self, batch_size):
weight = next(self.parameters()).data
hidden = weight.new(self.num_layers, batch_size, self.hidden_dim).zero_()
return hidden
model = GRU(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0000000001)
model.train()
start_time = time.time()
h = model.init_hidden(batch_size)
for epoch in range(1, num_epochs+1):
for x, y in train_data:
h = h.data
model.zero_grad()
y_train_pred, h = model(x, h)
loss = criterion(y_train_pred, y)
print("Epoch ", epoch, "MSE: ", loss.item())
loss.backward()
optimizer.step()
training_time = time.time() - start_time
print("Training time: {}".format(training_time))
This is the dataset which I used.
Not sure if it is the case, but did you preprocess and cleaned the data? I do not know it but maybe there are some values missing or it's something strange about it. I checked it here
https://ca.finance.yahoo.com/quote/NVDA/history?p=NVDA and it seems that every couple of rows there is some inconsistency. Like I said, I do not know if it's the case but it may be.

How can I improve my Neural Network accucary ( Speaker Recognition - MFCC )

Im working on a speaker recognition Neural Network.
What I am doing is taking wav files [ of the Bing Bang Theory first espiode :-) ], than convert it to MFCC coeffs than I make it as an input to an open source api of Neural Network (MLPClassifier) and as output I define a unique vector to each speaker ( Let's say : [1,0,0,0] - sheldon; [0,1,0,0] - Penny; and ect... ), I take 50 random values for testing and the others for fitting ( training )
This is my code, At the begining I got about random accucary for the NN but after some help of amazing guy I improved it to ~42% but I want more :) about 70% :
from sklearn.neural_network import MLPClassifier
import python_speech_features
import scipy.io.wavfile as wav
import numpy as np
from os import listdir
from os.path import isfile, join
from random import shuffle
import matplotlib.pyplot as plt
from tqdm import tqdm
from random import randint
import random
winner = [] # this array count how much Bingo we had when we test the NN
random_winner = []
win_len = 0.04 # in seconds
step = win_len / 2
nfft = 2048
for TestNum in tqdm(range(20)): # in every round we build NN with X,Y that out of them we check 50 after we build the NN
X = []
Y = []
onlyfiles = [f for f in listdir("FinalAudios/") if isfile(join("FinalAudios/", f))] # Files in dir
names = [] # names of the speakers
for file in onlyfiles: # for each wav sound
# UNESSECERY TO UNDERSTAND THE CODE
if " " not in file.split("_")[0]:
names.append(file.split("_")[0])
else:
names.append(file.split("_")[0].split(" ")[0])
only_speakers = [] + names
#print only_speakers
names = list(dict.fromkeys(names)) # names of speakers
print names
vector_names = [] # vector for each name
i = 0
vector_for_each_name = [0] * len(names)
for name in names:
vector_for_each_name[i] += 1
vector_names.append(np.array(vector_for_each_name))
vector_for_each_name[i] -= 1
i += 1
for f in onlyfiles:
if " " not in f.split("_")[0]:
f_speaker = f.split("_")[0]
else:
f_speaker = f.split("_")[0].split(" ")[0]
fs, audio = wav.read("FinalAudios/" + f) # read the file
try:
mfcc_feat = python_speech_features.mfcc(audio, samplerate=fs, winlen=win_len,
winstep=step, nfft=nfft, appendEnergy=False)
flat_list = [item for sublist in mfcc_feat for item in sublist]
X.append(np.array(flat_list))
Y.append(np.array(vector_names[names.index(f_speaker)]))
except IndexError:
pass
Z = list(zip(X, Y))
shuffle(Z) # WE SHUFFLE X,Y TO PERFORM RANDOM ON THE TEST LEVEL
X, Y = zip(*Z)
X = list(X)
Y = list(Y)
X = np.asarray(X)
Y = np.asarray(Y)
Y_test = Y[:50] # CHOOSE 50 FOR TEST, OTHERS FOR TRAIN
X_test = X[:50]
X = X[50:]
Y = Y[50:]
print len(X)
clf = MLPClassifier(solver='lbfgs', alpha=3e-2, hidden_layer_sizes=(50, 20), random_state=2) # create the NN
clf.fit(X, Y) # Train it
print list(clf.predict_proba([X[0]])[0])
print list(Y_test[0])
for sample in range(len(X_test)): # add 1 to winner array if we correct and 0 if not, than in the end it plot it
arr = list(clf.predict([X_test[sample]])[0])
if arr.index(max(arr)) == list(Y_test[sample]).index(1):
winner.append(1)
else:
winner.append(0)
if only_speakers[randint(0, len(only_speakers) - 1)] == only_speakers[randint(0, len(only_speakers) - 1)]:
random_winner.append(1)
else:
random_winner.append(0)
# plot winner
plot_x = []
plot_y = []
for i in range(1, len(winner)):
plot_y.append(sum(winner[0:i])*1.0/len(winner[0:i]))
plot_x.append(i)
plot_random_x = []
plot_random_y = []
for i in range(1, len(random_winner)):
plot_random_y.append(sum(random_winner[0:i])*1.0/len(random_winner[0:i]))
plot_random_x.append(i)
plt.plot(plot_x, plot_y, 'r', label='machine learning')
plt.plot(plot_random_x, plot_random_y, 'b', label='random')
plt.xlabel('Number Of Samples')
# naming the y axis
plt.ylabel('Success Rate')
# giving a title to my graph
plt.title('Success Rate : Random Vs ML!')
# function to show the plot
plt.show()
This is my zip file that contains the code and the audio file : https://ufile.io/eggjm1gw
Somebody have an idea how can I improve my accucary?
Edit :
I improved my data set and put convolution model and got 60% accucarry, which is ok but also not good enoguh
import python_speech_features
import scipy.io.wavfile as wav
import numpy as np
from os import listdir
import os
import shutil
from os.path import isfile, join
from random import shuffle
from matplotlib import pyplot
from tqdm import tqdm
from random import randint
import tensorflow as tf
from ast import literal_eval as str2arr
from tempfile import TemporaryFile
#win_len = 0.04 # in seconds
#step = win_len / 2
#nfft = 2048
win_len = 0.05 # in seconds
step = win_len
nfft = 16384
results = []
outfile_x = None
outfile_y = None
winner = []
for TestNum in tqdm(range(40)): # We check it several times
if not outfile_x: # if path not exist we create it
X = [] # inputs
Y = [] # outputs
onlyfiles = [f for f in listdir("FinalAudios") if isfile(join("FinalAudios", f))] # Files in dir
names = [] # names of the speakers
for file in onlyfiles: # for each wav sound
# UNESSECERY TO UNDERSTAND THE CODE
if " " not in file.split("_")[0]:
names.append(file.split("_")[0])
else:
names.append(file.split("_")[0].split(" ")[0])
only_speakers = [] + names
namesWithoutDuplicate = list(dict.fromkeys(names))
namesWithoutDuplicateCopy = namesWithoutDuplicate[:]
for name in namesWithoutDuplicateCopy: # we remove low samples files
if names.count(name) < 107:
namesWithoutDuplicate.remove(name)
names = namesWithoutDuplicate
print(names) # print it
vector_names = [] # output for each name
i = 0
for name in names:
vector_for_each_name = i
vector_names.append(np.array(vector_for_each_name))
i += 1
for f in onlyfiles: # for all the files
if " " not in f.split("_")[0]:
f_speaker = f.split("_")[0]
else:
f_speaker = f.split("_")[0].split(" ")[0]
if f_speaker in namesWithoutDuplicate:
fs, audio = wav.read("FinalAudios\\" + f) # read the file
try:
# compute MFCC
mfcc_feat = python_speech_features.mfcc(audio, samplerate=fs, winlen=win_len, winstep=step, nfft=nfft, appendEnergy=False)
#flat_list = [item for sublist in mfcc_feat for item in sublist]
# Create output + inputs
for i in mfcc_feat:
X.append(np.array(i))
Y.append(np.array(vector_names[names.index(f_speaker)]))
except IndexError:
pass
else:
if not os.path.exists("TooLowSamples"): # if path not exist we create it
os.makedirs("TooLowSamples")
shutil.move("FinalAudios\\" + f, "TooLowSamples\\" + f)
outfile_x = TemporaryFile()
np.save(outfile_x, X)
outfile_y = TemporaryFile()
np.save(outfile_y, Y)
# ------------------- RANDOMIZATION, UNNECESSARY TO UNDERSTAND THE CODE ------------------- #
else:
outfile_x.seek(0)
X = np.load(outfile_x)
outfile_y.seek(0)
Y = np.load(outfile_y)
Z = list(zip(X, Y))
shuffle(Z) # WE SHUFFLE X,Y TO PERFORM RANDOM ON THE TEST LEVEL
X, Y = zip(*Z)
X = list(X)
Y = list(Y)
lenX = len(X)
# ------------------- RANDOMIZATION, UNNECESSARY TO UNDERSTAND THE CODE ------------------- #
y_test = np.asarray(Y[:4000]) # CHOOSE 100 FOR TEST, OTHERS FOR TRAIN
x_test = np.asarray(X[:4000]) # CHOOSE 100 FOR TEST, OTHERS FOR TRAIN
x_train = np.asarray(X[4000:]) # CHOOSE 100 FOR TEST, OTHERS FOR TRAIN
y_train = np.asarray(Y[4000:]) # CHOOSE 100 FOR TEST, OTHERS FOR TRAIN
x_val = x_train[-4000:] # FROM THE TRAIN CHOOSE 100 FOR VALIDATION
y_val = y_train[-4000:] # FROM THE TRAIN CHOOSE 100 FOR VALIDATION
x_train = x_train[:-4000] # FROM THE TRAIN CHOOSE 100 FOR VALIDATION
y_train = y_train[:-4000] # FROM THE TRAIN CHOOSE 100 FOR VALIDATION
x_train = x_train.reshape(np.append(x_train.shape, (1, 1))) # RESHAPE FOR INPUT
x_test = x_test.reshape(np.append(x_test.shape, (1, 1))) # RESHAPE FOR INPUT
x_val = x_val.reshape(np.append(x_val.shape, (1, 1))) # RESHAPE FOR INPUT
features_shape = x_val.shape
# -------------- OUR TENSOR FLOW NEURAL NETWORK MODEL -------------- #
model = tf.keras.models.Sequential([
tf.keras.layers.Input(name='inputs', shape=(13, 1, 1), dtype='float32'),
tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same', strides=1, name='block1_conv', input_shape=(13, 1, 1)),
tf.keras.layers.MaxPooling2D((3, 3), strides=(2,2), padding='same', name='block1_pool'),
tf.keras.layers.BatchNormalization(name='block1_norm'),
tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same', strides=1, name='block2_conv',
input_shape=(13, 1, 1)),
tf.keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same', name='block2_pool'),
tf.keras.layers.BatchNormalization(name='block2_norm'),
tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same', strides=1, name='block3_conv',
input_shape=(13, 1, 1)),
tf.keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same', name='block3_pool'),
tf.keras.layers.BatchNormalization(name='block3_norm'),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(64, activation='relu', name='dense'),
tf.keras.layers.BatchNormalization(name='dense_norm'),
tf.keras.layers.Dropout(0.2, name='dropout'),
tf.keras.layers.Dense(10, activation='softmax', name='pred')
])
model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
# -------------- OUR TENSOR FLOW NEURAL NETWORK MODEL -------------- #
print("fitting")
history = model.fit(x_train, y_train, epochs=15, validation_data=(x_val, y_val))
print("testing")
results.append(model.evaluate(x_test, y_test)[1])
print(results)
print(sum(results)/len(results))
for i in range(10000):
f_1 = only_speakers[randint(0, len(only_speakers) - 1)]
f_2 = only_speakers[randint(0, len(only_speakers) - 1)]
if " " not in f_1.split("_")[0]:
f_speaker_1 = f_1.split("_")[0]
else:
f_speaker_1 =f_1.split("_")[0].split(" ")[0]
if " " not in f_2.split("_")[0]:
f_speaker_2 = f_2.split("_")[0]
else:
f_speaker_2 =f_2.split("_")[0].split(" ")[0]
if f_speaker_2 == f_speaker_1:
winner.append(1)
else:
winner.append(0)
print(sum(winner)/len(winner))
#]
# if onlyfiles[randint(len(onlyfiles) - 1)] == onlyfiles[randint(len(onlyfiles) - 1)]
#pyplot.plot(history.history['loss'], label='train')
#pyplot.plot(history.history['val_loss'], label='test') Q
#pyplot.legend()
#pyplot.show()
Readin your post these are the following things I could suggest you fix/explore
42% is not that impressive of an accuracy for the task you have at hand, consider the way you are cross-validating e.g. how do you split between a validation, test and training dataset
Your dataset seems very limited. Your task is to identify the speaker. A single episode might not be enough data for this task.
You might want to consider Deep Neural Network libraries such as Keras and Tensorflow. Convolutions is something you can apply directly to the MFC Graph.
If you decide using Tensorflow or Keras consider Triplet-Loss, where you preset a positive and negative example.
Consider reading the current state of the art for your task: https://github.com/grausof/keras-sincnet
Consider reading https://arxiv.org/abs/1503.03832 and adopting it for speech recognition.
The easiest thing you can do to improve your results is adding CNN layers to extract features from the MFCC

Issue training RNN model with pytorch with trivial goal

I'm trying to train a simple RNN model with a trivial goal where the output matches a fixed vector regardless of the input
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
class RNN(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(RNN, self).__init__()
self.hidden_size = hidden_size
self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
print "i2h WEIGHT size ", list(self.i2h.weight.size())
print "i2h bias size ", list(self.i2h.bias.size())
self.i2o = nn.Linear(hidden_size, output_size)
print "i2o WEIGHT size ", list(self.i2o.weight.size())
print "i2o bias size ", list(self.i2o.bias.size())
self.softmax = nn.LogSoftmax(dim=1)
def forward(self, input, hidden):
combined = torch.cat((input, hidden), 1)
hidden = self.i2h(combined)
output = self.i2o(hidden)
output = self.softmax(output)
return output, hidden
def initHidden(self):
return Variable(torch.zeros(1, self.hidden_size))
n_hidden = 20
rnn = RNN(10, n_hidden, 3)
learning_rate = 1e-3
loss_fn = torch.nn.MSELoss(size_average=False)
out_target = Variable( torch.FloatTensor([[0.0 , 1.0, 0.0]] ) , requires_grad=False)
print "target output::: ", out_target
def train(category_tensor, line_tensor):
hidden = rnn.initHidden()
rnn.zero_grad()
for i in range(line_tensor.size()[0]):
#print "train iteration ", i, ": input data: ", line_tensor[i]
output, hidden = rnn(line_tensor[i], hidden)
loss = loss_fn(output, out_target)
loss.backward()
# Add parameters' gradients to their values, multiplied by learning rate
for p in rnn.parameters():
#print "parameter: ", p, " gradient: ", p.grad.data
p.data.add_(-learning_rate, p.grad.data)
return output, loss.data[0]
current_loss = 0
n_iters = 500
for iter in range(1, n_iters + 1):
inp = Variable(torch.randn(100,1,10) + 5)
output, loss = train(out_target, inp)
current_loss += loss
if iter % 1 == 0:
print "weights: ",rnn.i2h.weight
print "LOSS: ", loss
print output
As it shows, the loss stays above 6 and never goes down. Notice also that I am biasing all the random inputs normal distributions by 5, so they are mostly positive numbers, so there should exist a weight distribution that approaches the goal output
What am I doing wrong in this example that is failing to output to approach the goal?
Your fixed output is:
torch.FloatTensor([[0.0, 1.0, 0.0]])
But you are using the following as the final layer in your RNN:
self.softmax = nn.LogSoftmax(dim=1)
Does LogSoftmax returns value in [0, 1]? Althouhgh, you can use the Softmax but I would recommend you to use the sign function and transform -1 to 0.

Keras LSTM RNN forecast - Shifting fitted forecast backward

I am trying to use LSTM Recurrent Neural Net using Keras to forecast future purchase. My input variables are time-window of purchases for previous 5 days, and a categorical variable which I encoded as dummy variables A, B, ...,I. My input data looks like following:
>>> dataframe.head()
day price A B C D E F G H I TS_bigHolidays
0 2015-06-16 7.031160 1 0 0 0 0 0 0 0 0 0
1 2015-06-17 10.732429 1 0 0 0 0 0 0 0 0 0
2 2015-06-18 21.312692 1 0 0 0 0 0 0 0 0 0
My problem is my forecasts/fitted values (both for trained and test data) seem to be shifted forward. Here is a plot:
My question is what parameter in LSTM Keras should I change to correct this issue? Or do I need to change anything in my input data?
Here is my code:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas
import math
import time
import csv
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from sklearn.preprocessing import MinMaxScaler
np.random.seed(1234)
exo_feature = ["A","B","C","D","E","F","G","H","I", "TS_bigHolidays"]
look_back = 5 #this is number of days we are looking back for sliding window of time series
forecast_period_length = 40
# load the dataset
dataframe = pandas.read_csv('processedDataframeGameSphere.csv', header = 0, engine='python', skipfooter=6)
dataframe["price"] = dataframe['price'].astype('float32')
scaler = MinMaxScaler(feature_range=(0, 100))
dataframe["price"] = scaler.fit_transform(dataframe['price'])
# this function is used to make sliding window for time series data
def create_dataframe(dataframe, look_back=1):
dataX, dataY = [], []
for i in range(dataframe.shape[0]-look_back-1):
price_lookback = dataframe['price'][i: (i + look_back)] #i+look_back is exclusive here
exog_feature = dataframe[exo_feature].ix[i + look_back - 1] #Y is i+ look_back ,that's why
row_i = price_lookback.append(exog_feature)
dataX.append(row_i)
dataY.append(dataframe["price"][i + look_back])
return np.array(dataX), np.array(dataY)
window_dataframe, Y = create_dataframe(dataframe, look_back)
# split into train and test sets
train_size = int(dataframe.shape[0] - forecast_period_length) #28 is the number of days we want to forecast , 4 weeks
test_size = dataframe.shape[0] - train_size
test_size_start_point_with_lookback = train_size - look_back
trainX, trainY = window_dataframe[0:train_size,:], Y[0:train_size]
print(trainX.shape)
print(trainY.shape)
#below changed datawindowY indexing, since it's just array.
testX, testY = window_dataframe[train_size:dataframe.shape[0],:], Y[train_size:dataframe.shape[0]]
# reshape input to be [samples, time steps, features]
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
print(trainX.shape)
print(testX.shape)
# create and fit the LSTM network
dimension_input = testX.shape[2]
model = Sequential()
layers = [dimension_input, 50, 100, 1]
epochs = 100
model.add(LSTM(
input_dim=layers[0],
output_dim=layers[1],
return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(
layers[2],
return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(
output_dim=layers[3]))
model.add(Activation("linear"))
start = time.time()
model.compile(loss="mse", optimizer="rmsprop")
print "Compilation Time : ", time.time() - start
model.fit(
trainX, trainY,
batch_size= 10, nb_epoch=epochs, validation_split=0.05,verbose =2)
# Estimate model performance
trainScore = model.evaluate(trainX, trainY, verbose=0)
trainScore = math.sqrt(trainScore)
trainScore = scaler.inverse_transform(np.array([[trainScore]]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = model.evaluate(testX, testY, verbose=0)
testScore = math.sqrt(testScore)
testScore = scaler.inverse_transform(np.array([[testScore]]))
print('Test Score: %.2f RMSE' % (testScore))
# generate predictions for training
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)
# shift train predictions for plotting
np_price = np.array(dataframe["price"])
print(np_price.shape)
np_price = np_price.reshape(np_price.shape[0],1)
trainPredictPlot = np.empty_like(np_price)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
testPredictPlot = np.empty_like(np_price)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(trainPredict)+look_back+1:dataframe.shape[0], :] = testPredict
# plot baseline and predictions
plt.plot(dataframe["price"])
plt.plot(trainPredictPlot)
plt.plot(testPredictPlot)
plt.show()
It's not a problem of LSTM, if you use just simple feed-forward network, the effect will be the same.
the problem is the network tend to mimic yesterday value instead of 'forecasting' you expect.
(it is nice strategy in term of reducing MSE loss)
you need more 'care' to avoid this issue and it's not a simple issue.

Memory error while loading CIFAR10 training data

I was solving assignment 2 (link) of Andrej Karpathy's course on Neural network. The programming environment is ipython notebook. When i am trying to load CIFAR10 data I am repeatedly getting memory error. I tried to google any solution but nothing worked. Please help me here.
from cs231n.data_utils import load_CIFAR10
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000):
"""
Load the CIFAR-10 dataset from disk and perform preprocessing to prepare
it for the two-layer neural net classifier. These are the same steps as
we used for the SVM, but condensed to a single function.
"""
# Load the raw CIFAR-10 data
cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
# Subsample the data
mask = range(num_training, num_training + num_validation)
X_val = X_train[mask]
y_val = y_train[mask]
mask = range(num_training)
X_train = X_train[mask]
y_train = y_train[mask]
mask = range(num_test)
X_test = X_test[mask]
y_test = y_test[mask]
# Normalize the data: subtract the mean image
mean_image = np.mean(X_train, axis=0)
X_train -= mean_image
X_val -= mean_image
X_test -= mean_image
# Reshape data to rows
X_train = X_train.reshape(num_training, -1)
X_val = X_val.reshape(num_validation, -1)
X_test = X_test.reshape(num_test, -1)
return X_train, y_train, X_val, y_val, X_test, y_test
# Invoke the above function to get our data.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()
print 'Train data shape: ', X_train.shape
print 'Train labels shape: ', y_train.shape
print 'Validation data shape: ', X_val.shape
print 'Validation labels shape: ', y_val.shape
print 'Test data shape: ', X_test.shape
print 'Test labels shape: ', y_test.shape
import cPickle as pickle
import numpy as np
import os
def load_CIFAR_batch(filename):
""" load single batch of cifar """
with open(filename, 'rb') as f:
datadict = pickle.load(f)
X = datadict['data']
Y = datadict['labels']
X = X.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("float")
Y = np.array(Y)
return X, Y
def load_CIFAR10(ROOT):
""" load all of cifar """
xs = []
ys = []
for b in range(1,6):
f = os.path.join(ROOT, 'data_batch_%d' % (b, ))
X, Y = load_CIFAR_batch(f)
xs.append(X)
ys.append(Y)
Xtr = np.concatenate(xs)
Ytr = np.concatenate(ys)
del X, Y
Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
return Xtr, Ytr, Xte, Yte
If someone is facing the same issue on windows os please install x64 python distribution. Memory usage of x86 distribution is capped at 2GB.

Resources