Related
How can I avoid underfitting in Pytorch NeuralNetwork?
I try to predict the power consumtion of a plant based on seven features. I have built two simple neural network models.
The first one is a Linear model, and the second is a RNN model. However, both models perform bad in the test set, their forecast result is a straight line.
Something about data
There are about 360 samples in the CSV file. I take the first 300 samples for trainning and the others for test. The first 7 columns of raw data are features of daily operation. The last column is the electricity consumption of every day.
Setting of training set
In the linear model, train data is the first 7 colums of a certain day, and corresponding target is the power consumption of that day.
In the RNN model, train data is all the 8 columns of three days(seven features and power consumption), and corresponding traget is the power consumption of next three days.
Code
Code of RNN model
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as f
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from matplotlib import pyplot as plt
'''
build simple RNN
'''
batchSize = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
netPath = ''
'''Data processing'''
# read raw data
filePath = 'F:/.csv'
initialData = pd.read_csv(filePath)
print(initialData.head(10))
print('hello world')
# Separate features and power consumption.
trainDatas = initialData.iloc[0:7, 1:301]
trainPowerConsum = pd.DataFrame(initialData.iloc[-1, 1:301]).T
trainDatas = pd.concat([trainDatas, trainPowerConsum], 0)
trainPowerConsum = initialData.iloc[-1, 2:302]
# Plot
powerConsumPlot = trainDatas.iloc[-1, :]
xData = np.linspace(1, powerConsumPlot.shape[0], 300)
plt.plot(xData, powerConsumPlot)
plt.show()
testDatas = initialData.iloc[0:7, 302:-1]
testPowerConsum = pd.DataFrame(initialData.iloc[-1, 302:-1]).T
testDatas = pd.concat([testDatas, testPowerConsum], 0)
testPowerConsum = initialData.iloc[-1, 303:]
# convert to dataframe
trainDatas = pd.DataFrame(trainDatas)
trainDatas = trainDatas.T
trainPowerConsum = pd.DataFrame(trainPowerConsum)
testDatas = pd.DataFrame(testDatas)
testDatas = testDatas.T
testPowerConsum = pd.DataFrame(testPowerConsum)
# change the unit of PowerConsumption
trainDatas.iloc[:, -1] = trainDatas.iloc[:, -1] * 1000
testDatas.iloc[:, -1] = testDatas.iloc[:, -1] * 1000
trainPowerConsum.iloc[:, 0] = trainPowerConsum.iloc[:, 0] * 1000
testPowerConsum.iloc[:, 0] = testPowerConsum.iloc[:, 0] * 1000
assert testPowerConsum.shape[0] == testDatas.shape[0]
assert trainDatas.shape[0] == trainPowerConsum.shape[0]
# convert dataframe to tensor
trainDatas = torch.tensor(trainDatas.values.astype(float), device=device)
trainPowerConsum = torch.tensor(trainPowerConsum.values.astype(float), device=device)
testDatas = torch.tensor(testDatas.values.astype(float), device=device)
testPowerConsum = torch.tensor(testPowerConsum.values.astype(float), device=device)
trainDatasList = list()
trainPowerConsumList = list()
for i in range(298):
trainDatasList.append(trainDatas[i:i + 3])
trainPowerConsumList.append(trainPowerConsum[i:i + 3])
from torch.nn.utils.rnn import pad_sequence
trainPowerConsum = pad_sequence(trainPowerConsumList, batch_first=True)
trainDatas = pad_sequence(trainDatasList, batch_first=True)
print(trainDatas.shape)
# ensure the batch_size of test data is 1
testDatas = torch.unsqueeze(testDatas, dim=0)
testPowerConsum = torch.unsqueeze(testPowerConsum, dim=0)
'''build dataloader'''
trainDataLoader = DataLoader(
TensorDataset(
trainDatas, trainPowerConsum
),
shuffle=True, batch_size=batchSize, drop_last=True)
print('Data is ready')
seqLen = 2
inputDim = 8
hiddenSize = 3
numLayer = 2
learningRate = 0.01
class RNNModel(torch.nn.Module):
def __init__(self, inputsize, hiddensize, batchsize, numLayer):
super(RNNModel, self).__init__()
self.batchsize = batchsize
self.inputsize = inputsize
self.hiddensize = hiddensize
self.numlayers = numLayer
self.rnn = torch.nn.RNN(input_size=self.inputsize, hidden_size=self.hiddensize, num_layers=self.numlayers,
batch_first=True)
self.l1 = torch.nn.Linear(hiddenSize, hiddensize)
self.l2 = torch.nn.Linear(hiddenSize, 1)
def forward(self, input, hidden):
out, hidden = self.rnn(input.float(), hidden.float())
batch_size, seq_len, input_dim = out.shape
out = out.reshape(-1, input_dim)
# out = f.sigmoid(self.l1(out))
out = f.relu(self.l1(out))
out = self.l2(out)
out = out.reshape(batch_size, seq_len, -1)
return out, hidden
def initHidden(self):
hidden = torch.zeros(self.numlayers, self.batchsize, self.hiddensize, device=device, dtype=torch.float64)
return hidden
net = RNNModel(inputDim, hiddenSize, batchSize, numLayer).to(device)
criterion = torch.nn.L1Loss()
optimizer = optim.Adam(net.parameters(), lr=learningRate,momentum=0.01)
def train(epoch):
runLoss = 0.
optimizer.zero_grad()
hidden = net.initHidden()
for batchIndex, data in enumerate(trainDataLoader, 0):
inputs, target = data
optimizer.zero_grad()
outputs, hidden = net(inputs, hidden)
hidden = hidden.detach()
loss = criterion(outputs.float(), target.float())
loss = loss.mean()
loss.backward()
optimizer.step()
print(f'{epoch + 1},\t Loss={loss.item()}')
# torch.save(net.state_dict(), netPath)
def test():
testDatasVice = torch.clone(testDatas)
input = testDatasVice[:, 0, :]
input = input.view(1, 1, -1)
assert input.shape[2] == 8
predictPowConsum = list()
# the first hidden tensor in test set is zero
hidden = torch.zeros(2, 1, 3, device=device, dtype=torch.float64)
with torch.no_grad():
for i in range(testDatas.shape[1]):
output, hidden = net(input, hidden)
if i < 51:
testDatasVice[:, i + 1, -1] = output[0]
input = torch.unsqueeze(testDatasVice[:, i + 1, :], dim=0)
predictPowConsum.append(output.data.cpu().numpy().ravel()[0])
elif i == 51:
predictPowConsum.append(output.data.cpu().numpy().ravel()[0])
else:
print('\tindexError') # Exclude potential Errors
return predictPowConsum
if __name__ == '__main__':
epochNum = 300
for epoch in range(epochNum):
train(epoch)
predictPowConsum = test()
# plotting
xData = np.arange(303, 303 + testPowerConsum.size(1))
plt.plot(xData, testPowerConsum.cpu().numpy()[0, :, 0])
plt.plot(xData, predictPowConsum)
plt.show()
Code of Linear model
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as f
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from matplotlib import pyplot as plt
filePath = 'F:.csv'
initialData = pd.read_csv(filePath)
print(initialData.head(10))
print('hello world')
trainDatas = initialData.iloc[0:7, 1:300]
trainPowerConsum = initialData.iloc[-1, 1:300]
testDatas = initialData.iloc[0:7, 300:-1]
testPowerConsum = initialData.iloc[-1, 300:-1]
trainDatas = pd.DataFrame(trainDatas)
trainDatas = trainDatas.T
trainPowerConsum = pd.DataFrame(trainPowerConsum)
testDatas = pd.DataFrame(testDatas)
testDatas = testDatas.T
testPowerConsum = pd.DataFrame(testPowerConsum)
trainPowerConsum.iloc[:, 0] = trainPowerConsum.iloc[:, 0] * 1000
testPowerConsum.iloc[:, 0] = testPowerConsum.iloc[:, 0] * 1000
# build dataloader
trainData = DataLoader(
TensorDataset(
torch.tensor(trainDatas.values).float(),
torch.tensor(trainPowerConsum.values.astype(float)).float()
),
shuffle=True, batch_size=15)
testData = DataLoader(
TensorDataset(
torch.tensor(testDatas.values.astype(float)).float(),
torch.tensor(testPowerConsum.values.astype(float)).float()
),
shuffle=False, batch_size=15)
print('data is ready')
class SimpleNet(torch.nn.Module):
def __init__(self):
super(SimpleNet, self).__init__()
self.l1 = torch.nn.Linear(7, 15)
self.l2 = torch.nn.Linear(15, 30)
self.l3 = torch.nn.Linear(30, 15)
self.l4 = torch.nn.Linear(15, 5)
self.l5 = torch.nn.Linear(5, 1)
def forward(self, x):
x = f.relu(self.l1(x))
x = f.relu(self.l2(x))
x = f.relu(self.l3(x))
x = f.relu(self.l4(x))
return self.l5(x)
model = SimpleNet()
criterion = torch.nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001)
def train(epoch):
runLoss = 0.
for batch_index, data in enumerate(trainData, 0):
inputs, target = data
optimizer.zero_grad()
outputs = model(inputs)
loss = criterion(outputs, target)
loss.backward()
optimizer.step()
runLoss += loss
print(f'{epoch + 1},{batch_index + 1},\tLoss={runLoss / 5}')
runLoss = 0
def test(epoch):
totalError = 0.
print('Start to test the model')
with torch.no_grad():
for data in testData:
# test ---------data for test
# testlab ---------corresponding power consumption
test, testlab = data
outputs = model(test)
predicted = outputs.data
testError = testlab - predicted
# plotting
if epoch % 50 == 2:
xData = np.linspace(1, 15, 15)
if predicted.size(0) != 15:
pass
else:
plt.plot(xData, predicted[:, 0].numpy(), label='predicted', color='red')
plt.plot(xData, testlab[:, 0].numpy(), label='origData', color='blue')
plt.show()
totalError += (torch.abs(testError).sum().item())
print(f'Average Error on test set is {totalError / 54}')
if __name__ == '__main__':
for epoch in range(1000):
train(epoch)
test(epoch)
Image of Output
output of RNN
The blue line is the actual data, and the orange line is the output of RNN model.
Solutions and its Effect
I have looked around and apparently I've got the choice between these solutions:
Add new domain-specific features
Decrease the amount of regularization used
Increase the duration of training
Increase the complexity or type of the model
Decrease the learning rate
Try other activate function
I have tried some solutions:
The data for trainning isn't regularized. I just change the unit of electricity from kWh to Wh
I take ReLu as activate function after using Sigmoid, but it doesn't work
I adjust the learning rate from 0.01 to 0.001, it doesn't improve
I try different optimizer such as SGD and Adam on both model, even use momentum, it doesn't get better
The sequence length of RNN model is 60 firstly, then is set to 3. The loss dropped more rapidly in the latter case, but the forecast result still is a straight line
In a word, all solutions I find doesn't work.
Besides, if shuffle is True when building DataLoader, the loss skips violently between epochs. But it drops slowly and close to an constant eventually when shuffle is False.
What could be the best way to avoid the problem?
Thanks in advance!
I built a two layered LSTM model(keras model) for a movie review dataset from kaggle : Dataset
While training the model, every epoch was giving the same accuracy of 0.5098.
Then I thought it might not be learning the long distance dependencies.Then instead of LSTM I used bidirectional LSTM. But, still model's accuracy while training was 0.5098 for every epoch. I trained the model for 8 hours/35 epochs on CPU. Then I stopped training.
Code:
import pandas as pd
from sentiment_utils import *
import keras
import keras.backend as k
import numpy as np
train_data = pd.read_table('train.tsv')
X_train = train_data.iloc[:,2]
Y_train = train_data.iloc[:,3]
from sklearn.preprocessing import OneHotEncoder
Y_train = Y_train.reshape(Y_train.shape[0],1)
ohe = OneHotEncoder(categorical_features=[0])
Y_train = ohe.fit_transform(Y_train).toarray()
maxLen = len(max(X_train, key=len).split())
words_to_index, index_to_words, word_to_vec_map = read_glove_vectors("glove/glove.6B.50d.txt")
m = X_train.shape[0]
def read_glove_vectors(path):
with open(path, encoding='utf8') as f:
words = set()
word_to_vec_map = {}
for line in f:
line = line.strip().split()
cur_word = line[0]
words.add(cur_word)
word_to_vec_map[cur_word] = np.array(line[1:], dtype=np.float64)
i = 1
words_to_index = {}
index_to_words = {}
for w in sorted(words):
words_to_index[w] = i
index_to_words[i] = w
i = i + 1
return words_to_index, index_to_words, word_to_vec_map
def sentance_to_indices(X_train, words_to_index, maxLen, dash_index_list, keys):
m = X_train.shape[0]
X_indices = np.zeros((m, maxLen))
for i in range(m):
if i in dash_index_list:
continue
sentance_words = X_train[i].lower().strip().split()
j = 0
for word in sentance_words:
if word in keys:
X_indices[i, j] = words_to_index[word]
j += 1
return X_indices
def pretrained_embedding_layer(word_to_vec_map, words_to_index):
emb_dim = word_to_vec_map['pen'].shape[0]
vocab_size = len(words_to_index) + 1
emb_matrix = np.zeros((vocab_size, emb_dim))
for word, index in words_to_index.items():
emb_matrix[index, :] = word_to_vec_map[word]
emb_layer= keras.layers.embeddings.Embedding(vocab_size, emb_dim, trainable= False)
emb_layer.build((None,))
emb_layer.set_weights([emb_matrix])
return emb_layer
def get_model(input_shape, word_to_vec_map, words_to_index):
sentance_indices = keras.layers.Input(shape = input_shape, dtype='int32')
embedding_layer = pretrained_embedding_layer(word_to_vec_map, words_to_index)
embeddings = embedding_layer(sentance_indices)
X = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True))(embeddings)
X = keras.layers.Dropout(0.5)(X)
X = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True))(X)
X = keras.layers.Dropout(0.5)(X)
X = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=False))(X)
X = keras.layers.Dropout(0.5)(X)
X = keras.layers.Dense(5)(X)
X = keras.layers.Activation('softmax')(X)
model = keras.models.Model(sentance_indices, X)
return model
model = get_model((maxLen,), word_to_vec_map,words_to_index)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
dash_index_list = []
for i in range(m):
if '-' in X_train[i]:
dash_index_list.append(i)
keys = []
for key in word_to_vec_map.keys():
keys.append(key)
X_train_indices = sentance_to_indices(X_train, words_to_index, maxLen, dash_index_list, keys)
model.fit(X_train_indices, Y_train, epochs = 50, batch_size = 32, shuffle=True)
I think the way you defined the model architecture doesn't make sense! Try looking at this example on IMDB movie reviews with LSTM on Keras github repo: Trains an LSTM model on the IMDB sentiment classification task.
I'm using Keras with the TensorFlow backend. I've just figured out how to train and classify sequences of different lengths without masking, because I can't get masking to work. In the toy example I'm working with, I'm trying to train an LSTM to detect whether a sequence of arbitrary length starts with a 1 or not.
from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np
def gen_sig(num_samples, seq_len):
one_indices = np.random.choice(a=num_samples, size=num_samples // 2, replace=False)
x_val = np.zeros((num_samples, seq_len), dtype=np.bool)
x_val[one_indices, 0] = 1
y_val = np.zeros(num_samples, dtype=np.bool)
y_val[one_indices] = 1
return x_val, y_val
N_train = 100
N_test = 10
recall_len = 20
X_train, y_train = gen_sig(N_train, recall_len)
X_test, y_test = gen_sig(N_train, recall_len)
print('Build STATEFUL model...')
model = Sequential()
model.add(LSTM(10, batch_input_shape=(1, 1, 1), return_sequences=False, stateful=True))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
for epoch in range(15):
mean_tr_acc = []
mean_tr_loss = []
for seq_idx in range(X_train.shape[0]):
start_val = X_train[seq_idx, 0]
assert y_train[seq_idx] == start_val
assert tuple(np.nonzero(X_train[seq_idx, :]))[0].shape[0] == start_val
y_in = np.array([y_train[seq_idx]], dtype=np.bool)
for j in range(np.random.choice(a=np.arange(5, recall_len+1))):
x_in = np.array([[[X_train[seq_idx][j]]]])
tr_loss, tr_acc = model.train_on_batch(x_in, y_in)
mean_tr_acc.append(tr_acc)
mean_tr_loss.append(tr_loss)
model.reset_states()
print('accuracy training = {}'.format(np.mean(mean_tr_acc)))
print('loss training = {}'.format(np.mean(mean_tr_loss)))
print('___________________________________')
mean_te_acc = []
mean_te_loss = []
for seq_idx in range(X_test.shape[0]):
start_val = X_test[seq_idx, 0]
assert y_test[seq_idx] == start_val
assert tuple(np.nonzero(X_test[seq_idx, :]))[0].shape[0] == start_val
y_in = np.array([y_test[seq_idx]], dtype=np.bool)
for j in range(np.random.choice(a=np.arange(5, recall_len+1))):
te_loss, te_acc = model.test_on_batch(np.array([[[X_test[seq_idx][j]]]], dtype=np.bool), y_in)
mean_te_acc.append(te_acc)
mean_te_loss.append(te_loss)
model.reset_states()
print('accuracy testing = {}'.format(np.mean(mean_te_acc)))
print('loss testing = {}'.format(np.mean(mean_te_loss)))
print('___________________________________')
As seen in the code, my error is being batched over each time-step. This is bad for multiple reasons. How do I train the network in two steps? For example:
Run a bunch of values through the network to accumulate the error
Adjust the weights of the network given this accumulated error
To do what is described in the original question, the easiest way is to train the original network with masking, but then test with a stateful network so any length input can be classified:
import numpy as np
np.random.seed(1)
import tensorflow as tf
tf.set_random_seed(1)
from keras import models
from keras.layers import Dense, Masking, LSTM
import matplotlib.pyplot as plt
def stateful_model():
hidden_units = 256
model = models.Sequential()
model.add(LSTM(hidden_units, batch_input_shape=(1, 1, 1), return_sequences=False, stateful=True))
model.add(Dense(1, activation='relu', name='output'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop')
return model
def train_rnn(x_train, y_train, max_len, mask):
epochs = 10
batch_size = 200
vec_dims = 1
hidden_units = 256
in_shape = (max_len, vec_dims)
model = models.Sequential()
model.add(Masking(mask, name="in_layer", input_shape=in_shape,))
model.add(LSTM(hidden_units, return_sequences=False))
model.add(Dense(1, activation='relu', name='output'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop')
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
validation_split=0.05)
return model
def gen_train_sig_cls_pair(t_stops, num_examples, mask):
x = []
y = []
max_t = int(np.max(t_stops))
for t_stop in t_stops:
one_indices = np.random.choice(a=num_examples, size=num_examples // 2, replace=False)
sig = np.zeros((num_examples, max_t), dtype=np.int8)
sig[one_indices, 0] = 1
sig[:, t_stop:] = mask
x.append(sig)
cls = np.zeros(num_examples, dtype=np.bool)
cls[one_indices] = 1
y.append(cls)
return np.concatenate(x, axis=0), np.concatenate(y, axis=0)
def gen_test_sig_cls_pair(t_stops, num_examples):
x = []
y = []
for t_stop in t_stops:
one_indices = np.random.choice(a=num_examples, size=num_examples // 2, replace=False)
sig = np.zeros((num_examples, t_stop), dtype=np.bool)
sig[one_indices, 0] = 1
x.extend(list(sig))
cls = np.zeros((num_examples, t_stop), dtype=np.bool)
cls[one_indices] = 1
y.extend(list(cls))
return x, y
if __name__ == '__main__':
noise_mag = 0.01
mask_val = -10
signal_lengths = (10, 15, 20)
x_in, y_in = gen_train_sig_cls_pair(signal_lengths, 10, mask_val)
mod = train_rnn(x_in[:, :, None], y_in, int(np.max(signal_lengths)), mask_val)
testing_dat, expected = gen_test_sig_cls_pair(signal_lengths, 3)
state_mod = stateful_model()
state_mod.set_weights(mod.get_weights())
res = []
for s_i in range(len(testing_dat)):
seq_in = list(testing_dat[s_i])
seq_len = len(seq_in)
for t_i in range(seq_len):
res.extend(state_mod.predict(np.array([[[seq_in[t_i]]]])))
state_mod.reset_states()
fig, axes = plt.subplots(2)
axes[0].plot(np.concatenate(testing_dat), label="input")
axes[1].plot(res, "ro", label="result", alpha=0.2)
axes[1].plot(np.concatenate(expected, axis=0), "bo", label="expected", alpha=0.2)
axes[1].legend(bbox_to_anchor=(1.1, 1))
plt.show()
So I tried implementing a Convolutional Neural Network on MNIST dataset in a similar fashion as this: https://github.com/tensorflow/tensorflow/blob/r1.1/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
However, on doing that, I noticed that for some reason my second max_pool is not happening. Also, I don't understand how the code in the above link works, more specifically, how the nn_layer method can be reused as the weights exist only in that scope and calling it twice would change them?
My code:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import os
from tensorflow.contrib.tensorboard.plugins import projector
current_path = os.path.dirname(os.path.realpath(__file__))
current_path = current_path+"/logs"
def train():
mnist = input_data.read_data_sets("MNIST_data", one_hot = True)
def initializer(shape):
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial)
def conv2d(x,W):
return tf.nn.conv2d(x , W , [1,1,1,1] , padding="SAME")
def max_pool(x):
return tf.nn.max_pool(x , [1,2,2,1] , [1,2,2,1] , padding="SAME")
def conv_layer(x,length,width,input_channels,output_channels,layer_name,act=tf.nn.relu):
with tf.name_scope(layer_name):
with tf.name_scope('weights'):
weights = initializer([length,width,input_channels,output_channels])
tf.summary.histogram(layer_name+"_weights",weights)
with tf.name_scope('biases'):
biases = initializer([output_channels])
tf.summary.histogram(layer_name+"_biases",biases)
with tf.name_scope('activations'):
activations = act(conv2d(x,weights) + biases)
activations = max_pool(activations)
tf.summary.histogram(layer_name+"_activations",activations)
return activations
def dense_layer(x,input_size,output_size,layer_name,act=tf.nn.relu):
with tf.name_scope(layer_name):
with tf.name_scope('weights'):
weights = initializer([input_size,output_size])
tf.summary.histogram(layer_name+"_weights",weights)
with tf.name_scope('biases'):
biases = initializer([output_size])
tf.summary.histogram(layer_name+"_biases",biases)
with tf.name_scope('activations'):
activations = act(tf.matmul(x,weights) + biases)
tf.summary.histogram(layer_name+"_activations",activations)
return activations
def dropout(x,keep_prob):
with tf.name_scope('Dropout'):
dropped =tf.nn.dropout(x,keep_prob)
return dropped
with tf.name_scope('input'):
x = tf.placeholder(tf.float32, [None,784],name='image_inputs')
y = tf.placeholder(tf.float32, [None,10],name='image_labels')
keep_prob = tf.placeholder(tf.float32,name='keep_probability')
with tf.name_scope('input_reshape'):
x_image = tf.reshape(x , [-1,28,28,1])
tf.summary.image('input',x_image,50)
h1 = conv_layer(x_image,3,3,1,32,"first_convolution_layer")
h2 = conv_layer(h1,3,3,32,64,"second_convolution_layer")
h2 = tf.reshape(h1,[-1,7*7*64])
h2 = dropout(h2,keep_prob)
h3 = dense_layer(h2,7*7*64,1024,"first_dense_layer")
h3 = dropout(h3,keep_prob)
h4 = dense_layer(h3,1024,1024,"second_dense_layer")
h4 = dropout(h4,keep_prob)
h_out = dense_layer(h4,1024,10,"output_dense_layer",act=tf.nn.sigmoid)
with tf.name_scope("Loss"):
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=h_out))
tf.summary.scalar('Loss',cost)
train = tf.train.AdamOptimizer().minimize(cost)
with tf.name_scope("Accuracy"):
correct_pred = tf.equal(tf.argmax(h_out, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
summary = tf.summary.merge_all()
init = tf.global_variables_initializer()
sess = tf.InteractiveSession()
sess.run(init)
saver = tf.train.Saver()
summary_writer = tf.summary.FileWriter(current_path, sess.graph)
for i in range(500):
batch = mnist.train.next_batch(500)
if(i%100 == 0):
summary_str = sess.run(summary,feed_dict={x:batch[0], y:batch[1], keep_prob:1.0})
summary_writer.add_summary(summary_str, i)
summary_writer.flush()
train_accuracy = accuracy.eval(feed_dict={x:batch[0], y:batch[1], keep_prob:1.0})
saver.save(sess, os.path.join(current_path,'model.ckpt'), i)
print("Step %d Training Accuracy: %f" %((i/100 + 1), train_accuracy))
train.run(feed_dict={x:batch[0], y:batch[1], keep_prob:0.5})
sum=0.0
for i in range(10):
batch_x = mnist.test.images[(i*1000):((i+1)*1000)-1]
batch_y = mnist.test.labels[(i*1000):((i+1)*1000)-1]
sum = sum + accuracy.eval(feed_dict={x:batch_x, y:batch_y, keep_prob:1.0})
print("Test Accuracy: %f" %(sum/10.0))
if tf.gfile.Exists(current_path):
tf.gfile.DeleteRecursively(current_path)
tf.gfile.MakeDirs(current_path)
train()
This is a simple typo.
Change this
h2 = tf.reshape(h1,[-1,7*7*64])
to this
h2 = tf.reshape(h2,[-1,7*7*64])
The error
InvalidArgumentError (see above for traceback): logits and labels must be same size: logits_size=[1000,10] labels_size=[500,10]
[[Node: Loss/SoftmaxCrossEntropyWithLogits = SoftmaxCrossEntropyWithLogits[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](Loss/Reshape, Loss/Reshape_1)]]
went away.
I am new to tensorflow and I am training a neural network for all 36 characters (0-9 and a-z).
I converted a few images do tfrecords using:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import os
import cv2
import tensorflow as tf
tf.app.flags.DEFINE_string('directory', '/root/data2',
'Directory to download data files and write the '
'converted result')
FLAGS = tf.app.flags.FLAGS
def _int64_feature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
def _bytes_feature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def le_imagens(aux_folder):
cont=0
img=np.empty([1,28,28,1])
for letter in os.listdir(aux_folder):
folder=aux_folder+letter+"/"
for imagem in os.listdir(folder):
os.chdir(folder)
img_temp=cv2.imread(imagem)
img_temp = cv2.cvtColor(img_temp,cv2.COLOR_BGR2GRAY)
img_temp= np.expand_dims(img_temp, axis=0)
img_temp= np.expand_dims(img_temp, axis=3)
img=np.vstack((img,img_temp))
cont=cont+1
print (cont)
print (img.shape)
return img
def calcula_label(letter):
aux_label=["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","1","2","3","4","5","6","7","8","9","0"]
label= np.zeros([1,36])
cont=0
for let in aux_label:
if let==letter:
label[0,cont]=1
else:
label[0,cont]=0
cont=cont+1
return label
def cria_array_labels(aux_folder):
cont=0
lab=np.empty([1,36])
for letter in os.listdir(aux_folder):
lab_temp=calcula_label(letter)
folder=aux_folder+letter+"/"
for imagem in os.listdir(folder):
lab=np.vstack((lab,lab_temp))
cont=cont+1
print (cont)
print (lab.shape)
return lab
def convert_to(images, labels, name):
#identifica quantidade de imagens e labels
num_examples = labels.shape[0]
if images.shape[0] != num_examples:
raise ValueError("Images size %d does not match label size %d." %
(images.shape[0], num_examples))
#pega parametros da imagem
rows = images.shape[1]
cols = images.shape[2]
depth = 1
#cria nome do arquivo de saida-acho que todas as imagens vao ser escritas aqui
filename = os.path.join(FLAGS.directory, name + '.tfrecords')
print('Writing', filename)
writer = tf.python_io.TFRecordWriter(filename)
#faz um loop para cada uma das imagens
for index in range(num_examples):
#converte a imagem para string
image_raw = images[index].tostring()
labels_raw = labels[index].tostring()
#aloca no exemplo as dimensoes da img, o label e a imagem convertida
example = tf.train.Example(features=tf.train.Features(feature={
'height': _int64_feature(rows),
'width': _int64_feature(cols),
'depth': _int64_feature(depth),
'label': _bytes_feature(labels_raw),
'image_raw': _bytes_feature(image_raw)}))
#escreve o exemplo
writer.write(example.SerializeToString())
writer.close()
def main(argv):
#Train
aux_folder="/root/captchas/captchas_lft/"
img_treino=le_imagens(aux_folder)
lab_treino=cria_array_labels(aux_folder)
print ("Base de Treino Preparada")
#Cross Validation
aux_folder="/root/captchas/captchas_lfcv/"
img_cv=le_imagens(aux_folder)
lab_cv=cria_array_labels(aux_folder)
print ("Base de CV Preparada")
#Test Set
aux_folder="/root/captchas/captchas_lfts/"
img_ts=le_imagens(aux_folder)
lab_ts=cria_array_labels(aux_folder)
print ("Base de Teste Preparada")
convert_to(img_treino, lab_treino, 'train')
convert_to(img_cv, lab_cv, 'validation')
convert_to(img_ts, lab_ts, 'test')
if __name__ == '__main__':
tf.app.run()
And I am feeding the following network (which is an adaptation of MNIST2 Tensorflow tutorial):
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os.path
import time
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import mnist
# Basic model parameters as external flags.
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_integer('num_epochs', 2, 'Number of epochs to run trainer.')
flags.DEFINE_integer('batch_size', 100, 'Batch size.')
flags.DEFINE_string('train_dir', '/root/data', 'Directory with the training data.')
#flags.DEFINE_string('train_dir', '/root/data2', 'Directory with the training data.')
# Constants used for dealing with the files, matches convert_to_records.
TRAIN_FILE = 'train.tfrecords'
VALIDATION_FILE = 'validation.tfrecords'
# Set-up dos pacotes
sess = tf.InteractiveSession()
def read_and_decode(filename_queue):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
serialized_example,
dense_keys=['image_raw', 'label'],
# Defaults are not specified since both keys are required.
dense_types=[tf.string, tf.string])
# Convert from a scalar string tensor (whose single string has
# length mnist.IMAGE_PIXELS) to a uint8 tensor with shape
# [mnist.IMAGE_PIXELS].
image = tf.decode_raw(features['image_raw'], tf.uint8)
image.set_shape([784])
#print (mnist.IMAGE_PIXELS)
# OPTIONAL: Could reshape into a 28x28 image and apply distortions
# here. Since we are not applying any distortions in this
# example, and the next step expects the image to be flattened
# into a vector, we don't bother.
# Convert from [0, 255] -> [-0.5, 0.5] floats.
image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
# Convert label from a scalar uint8 tensor to an int32 scalar.
label = tf.cast(features['label'], tf.float32)
#print (label)
#label.set_shape([1])
#print (label)
return image, label
def inputs(train, batch_size, num_epochs):
"""Reads input data num_epochs times.
Args:
train: Selects between the training (True) and validation (False) data.
batch_size: Number of examples per returned batch.
num_epochs: Number of times to read the input data, or 0/None to
train forever.
Returns:
A tuple (images, labels), where:
* images is a float tensor with shape [batch_size, 30,26,1]
in the range [-0.5, 0.5].
* labels is an int32 tensor with shape [batch_size] with the true label,
a number in the range [0, char letras).
Note that an tf.train.QueueRunner is added to the graph, which
must be run using e.g. tf.train.start_queue_runners().
"""
if not num_epochs: num_epochs = None
filename = os.path.join(FLAGS.train_dir,
TRAIN_FILE if train else VALIDATION_FILE)
with tf.name_scope('input'):
filename_queue = tf.train.string_input_producer(
[filename], num_epochs=num_epochs)
# Even when reading in multiple threads, share the filename
# queue.
image, label = read_and_decode(filename_queue)
# Shuffle the examples and collect them into batch_size batches.
# (Internally uses a RandomShuffleQueue.)
# We run this in two threads to avoid being a bottleneck.
images, sparse_labels = tf.train.shuffle_batch(
[image, label], batch_size=batch_size, num_threads=2,
capacity=1000 + 3 * batch_size,
# Ensures a minimum amount of shuffling of examples.
min_after_dequeue=1000)
return images, sparse_labels
def weight_variable(shape):
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial)
def bias_variable(shape):
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial)
def conv2d(x, W):
return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2x2(x):
return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1], padding='SAME')
#Variaveis
x, y_ = inputs(train=True, batch_size=FLAGS.batch_size, num_epochs=FLAGS.num_epochs)
#y_ = tf.string_to_number(y_, out_type=tf.int32)
teste=tf.convert_to_tensor(y_)
#Layer 1
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])
x_image = tf.reshape(x, [-1,28,28,1])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)
#Layer 2
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)
#Densely Connected Layer
W_fc1 = weight_variable([7 * 7 * 64, 1024])
b_fc1 = bias_variable([1024])
h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
#Dropout - reduz overfitting
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
#Readout layer
W_fc2 = weight_variable([1024, 36])
b_fc2 = bias_variable([36])
y_conv=tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
#y_conv=tf.cast(y_conv, tf.int32)
#Train and evaluate
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv), reduction_indices=[1]))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
sess.run(tf.initialize_all_variables())
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
for i in range(20000):
if i%100 == 0:
train_accuracy = accuracy.eval(feed_dict={keep_prob: 1.0})
print("step %d, training accuracy %g"%(i, train_accuracy))
train_step.run(feed_dict={keep_prob: 0.5})
x, y_ = inputs(train=True, batch_size=2000)
#y_ = tf.string_to_number(y_, out_type=tf.int32)
print("test accuracy %g"%accuracy.eval(feed_dict={keep_prob: 1.0}))
coord.join(threads)
sess.close()
But whe it comes to
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv), reduction_indices=[1]))
The following error appears:
ValueError: Incompatible shapes for broadcasting: TensorShape([Dimension(100)]) and TensorShape([Dimension(100), Dimension(36)])
I think the problem is de label tensor, but i am not sure how to fix it in order to have a (None,36) dimension. Does anyone knows how to solve this issue?
Thanks
Marcelo
The simplest way to make this work is to replace y_ with a one-hot encoding using tf.one_hot():
onehot_y_ = tf.one_hot(y_, 36, dtype=tf.float32)
cross_entropy = tf.reduce_mean(-tf.reduce_sum(onehot_y_ * tf.log(y_conv),
reduction_indices=[1]))
An alternative would be to switch to using the specialized op tf.nn.sparse_softmax_cross_entropy_with_logits(), which is more efficient and more numerically stable. To use it, you'll have to remove the call to tf.nn.softmax() in the definition of y_conv:
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(y_conv, y_))