Tensorflow: low loss resulting in low accuracy - machine-learning

I am trying to use Tensorflow and build a simple logistic regression classifier. The training set consists of around half a million (480, 000) data points. I am training on batches of size 100.
When training the model the loss falls from 0.002 to 0.000000034 within 20 training epochs ( the whole training set is traversed batch by batch in each epoch).
However this gives an accuracy of 0.671725 on the training set, which seems very low. To my understanding the accuracy on the training set should be near 1.00 (100%). As I am quite new to Tensorflow I am not sure if I am missing a crucial detail in my calculations.
Changing the batch size to 1000 does not make much of a difference - from 0.671698 to 0.671725.
Could be overfitting, but I still can't understand why it lowers the accuracy on the training set as well.
Decreasing the epochs to 10 gives the same accuracy.
Increasing the epochs to 100 does not change the accuracy.
Increasing the learning rate from 0.001 to 0.01 yields 0.000000000 after 75 epochs. The value is very small to fit in the precision. The training accuracy stays the same
Code snippet:
x = tf.placeholder(tf.float32, [None, window_size]) # data with the size of the window currently set to 6
y = tf.placeholder(tf.float32, [None, 2])
W = tf.Variable(tf.zeros([window_size, 2]))
b = tf.Variable(tf.zeros([2]))
# Construct model
model = tf.nn.sigmoid(tf.matmul(x, W) + b) #Sigmoid
cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(model), reduction_indices=1))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
# Inirializing the variables
init = tf.initialize_all_variables()
print (len(feed_windows_np))
batch_size = 1000
#Splitting to train and test
train = feed_windows_np[:len(feed_windows_np)//2 - ((len(feed_windows_np)//2) % batch_size)]
train_labels = labels[:len(feed_windows_np)//2 - ((len(feed_windows_np)//2) % batch_size)]
test = feed_windows_np[(len(feed_windows_np) * 2) // 3:]
test_labels = labels[(len(feed_windows_np) * 2) // 3:]
total_batches = int(math.floor(len(train) / batch_size))
avg_costs = []
with tf.Session() as sess:
sess.run(init)
# Training cycle
for epoch in range(epochs):
avg_cost = 0.
for i in range(total_batches):
feed_dict = {
x: train[i * batch_size:(i + 1) * batch_size],
y: labels[i * batch_size:(i + 1) * batch_size]
}
_, c = sess.run([optimizer, cost], feed_dict=feed_dict)
#sess.run(optimizer, feed_dict=feed_dict)
# updating the loss
avg_cost += c / total_batches
if (epoch + 1) % display_step == 0:
print ('Epoch:', '%04d' % (epoch + 1), 'cost=', '{:.9f}'.format(avg_cost))
avg_costs.append(avg_cost)
print ('Optimisation Finished!')
plt.plot(avg_costs)
plt.show()
# Training accuracy
correct_prediction = tf.equal(tf.argmax(model, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
accuracy_val= sess.run(accuracy, feed_dict={x: train, y: train_labels})
print ('Accuracy on training data:', accuracy_val)
Any idea what could be causing this behaviour? Is it the size that is big, because when going down to 80 training data points the accuracy gets to one as it should be?

Related

Facing this error while classifying Images, containing 10 classes in pytorch, in ResNet50. My code is:

This is the code I am implementing: I am using a subset of the CalTech256 dataset to classify images of 10 different kinds of animals. We will go over the dataset preparation, data augmentation and then steps to build the classifier.
def train_and_validate(model, loss_criterion, optimizer, epochs=25):
'''
Function to train and validate
Parameters
:param model: Model to train and validate
:param loss_criterion: Loss Criterion to minimize
:param optimizer: Optimizer for computing gradients
:param epochs: Number of epochs (default=25)
Returns
model: Trained Model with best validation accuracy
history: (dict object): Having training loss, accuracy and validation loss, accuracy
'''
start = time.time()
history = []
best_acc = 0.0
for epoch in range(epochs):
epoch_start = time.time()
print("Epoch: {}/{}".format(epoch+1, epochs))
# Set to training mode
model.train()
# Loss and Accuracy within the epoch
train_loss = 0.0
train_acc = 0.0
valid_loss = 0.0
valid_acc = 0.0
for i, (inputs, labels) in enumerate(train_data_loader):
inputs = inputs.to(device)
labels = labels.to(device)
# Clean existing gradients
optimizer.zero_grad()
# Forward pass - compute outputs on input data using the model
outputs = model(inputs)
# Compute loss
loss = loss_criterion(outputs, labels)
# Backpropagate the gradients
loss.backward()
# Update the parameters
optimizer.step()
# Compute the total loss for the batch and add it to train_loss
train_loss += loss.item() * inputs.size(0)
# Compute the accuracy
ret, predictions = torch.max(outputs.data, 1)
correct_counts = predictions.eq(labels.data.view_as(predictions))
# Convert correct_counts to float and then compute the mean
acc = torch.mean(correct_counts.type(torch.FloatTensor))
# Compute total accuracy in the whole batch and add to train_acc
train_acc += acc.item() * inputs.size(0)
#print("Batch number: {:03d}, Training: Loss: {:.4f}, Accuracy: {:.4f}".format(i, loss.item(), acc.item()))
# Validation - No gradient tracking needed
with torch.no_grad():
# Set to evaluation mode
model.eval()
# Validation loop
for j, (inputs, labels) in enumerate(valid_data_loader):
inputs = inputs.to(device)
labels = labels.to(device)
# Forward pass - compute outputs on input data using the model
outputs = model(inputs)
# Compute loss
loss = loss_criterion(outputs, labels)
# Compute the total loss for the batch and add it to valid_loss
valid_loss += loss.item() * inputs.size(0)
# Calculate validation accuracy
ret, predictions = torch.max(outputs.data, 1)
correct_counts = predictions.eq(labels.data.view_as(predictions))
# Convert correct_counts to float and then compute the mean
acc = torch.mean(correct_counts.type(torch.FloatTensor))
# Compute total accuracy in the whole batch and add to valid_acc
valid_acc += acc.item() * inputs.size(0)
#print("Validation Batch number: {:03d}, Validation: Loss: {:.4f}, Accuracy: {:.4f}".format(j, loss.item(), acc.item()))
# Find average training loss and training accuracy
avg_train_loss = train_loss/train_data_size
avg_train_acc = train_acc/train_data_size
# Find average training loss and training accuracy
avg_valid_loss = valid_loss/valid_data_size
avg_valid_acc = valid_acc/valid_data_size
history.append([avg_train_loss, avg_valid_loss, avg_train_acc, avg_valid_acc])
epoch_end = time.time()
print("Epoch : {:03d}, Training: Loss: {:.4f}, Accuracy: {:.4f}%, \n\t\tValidation : Loss : {:.4f}, Accuracy: {:.4f}%, Time: {:.4f}s".format(epoch, avg_train_loss, avg_train_acc*100, avg_valid_loss, avg_valid_acc*100, epoch_end-epoch_start))
# Save if the model has best accuracy till now
torch.save(model, dataset+'_model_'+str(epoch)+'.pt')
return model, history
# Load pretrained ResNet50 Model
resnet50 = models.resnet50(pretrained=True)
#resnet50 = resnet50.to('cuda:0')
# Freeze model parameters
for param in resnet50.parameters():
param.requires_grad = False
# Change the final layer of ResNet50 Model for Transfer Learning
fc_inputs = resnet50.fc.in_features
resnet50.fc = nn.Sequential(
nn.Linear(fc_inputs, 256),
nn.ReLU(),
nn.Dropout(0.4),
nn.Linear(256, num_classes), # Since 10 possible outputs
nn.LogSoftmax(dim=1) # For using NLLLoss()
)
# Convert model to be used on GPU
# resnet50 = resnet50.to('cuda:0')
# Change the final layer of ResNet50 Model for Transfer Learning
fc_inputs = resnet50.fc.in_features
resnet50.fc = nn.Sequential(
nn.Linear(fc_inputs, 256),
nn.ReLU(),
nn.Dropout(0.4),
nn.Linear(256, num_classes), # Since 10 possible outputs
nn.LogSoftmax(dienter code herem=1) # For using NLLLoss()
)
# Convert model to be used on GPU
# resnet50 = resnet50.to('cuda:0')`enter code here`
Error is this:
RuntimeError Traceback (most recent call
last) in ()
6 # Train the model for 25 epochs
7 num_epochs = 30
----> 8 trained_model, history = train_and_validate(resnet50, loss_func, optimizer, num_epochs)
9
10 torch.save(history, dataset+'_history.pt')
in train_and_validate(model,
loss_criterion, optimizer, epochs)
43
44 # Compute loss
---> 45 loss = loss_criterion(outputs, labels)
46
47 # Backpropagate the gradients
~\Anaconda3\lib\site-packages\torch\nn\modules\module.py in
call(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
~\Anaconda3\lib\site-packages\torch\nn\modules\loss.py in
forward(self, input, target)
202
203 def forward(self, input, target):
--> 204 return F.nll_loss(input, target, weight=self.weight, ignore_index=self.ignore_index, reduction=self.reduction)
205
206
~\Anaconda3\lib\site-packages\torch\nn\functional.py in
nll_loss(input, target, weight, size_average, ignore_index, reduce,
reduction) 1836 .format(input.size(0),
target.size(0))) 1837 if dim == 2:
-> 1838 ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index) 1839 elif dim == 4: 1840 ret = torch._C._nn.nll_loss2d(input, target,
weight, _Reduction.get_enum(reduction), ignore_index)
RuntimeError: Assertion `cur_target >= 0 && cur_target < n_classes'
failed. at
C:\Users\builder\AppData\Local\Temp\pip-req-build-0i480kur\aten\src\THNN/generic/ClassNLLCriterion.c:97
This happens when there are either incorrect labels in your dataset, or the labels are 1-indexed (instead of 0-indexed). As from the error message, cur_target must be smaller than the total number of classes (10). To verify the issue, check the maximum and minimum label in your dataset. If the data is indeed 1-indexed, just minus one from all annotations and you should be fine.
Note, another possible reason is that there exists some -1 labels in the data. Some (esp older) datasets use -1 as indication of a wrong/dubious label. If you find such labels, just discard them.

How to include batch size in pytorch basic example?

I am new to pytorch. The following is the basic example of using nn module to train a simple one-layer model with some random data (from here)
import torch
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
model = torch.nn.Sequential(
torch.nn.Linear(D_in, H),
torch.nn.ReLU(),
torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
for t in range(500):
y_pred = model(x)
loss = loss_fn(y_pred, y)
print(t, loss.item())
optimizer.zero_grad()
loss.backward()
optimizer.step()
As far as I understand, the batch size is equal to 1 in the example, in other words, a single point (out of 64) is used to calculate gradients and update parameters. My question is: how to modify this example to train the model with the batch size greater than one?
In fact N is the batch size. So you just need to modify N currently its set to 64. So you have in every training batch 64 vectors with size / dim D_in.
I checked the link you posted, you can also take a look at the comments - there is some explanation too :)
# -*- coding: utf-8 -*-
import numpy as np
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)
# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)
learning_rate = 1e-6
for t in range(500):
# Forward pass: compute predicted y
h = x.dot(w1)
h_relu = np.maximum(h, 0)
y_pred = h_relu.dot(w2)
# Compute and print loss
loss = np.square(y_pred - y).sum()
print(t, loss)
# Backprop to compute gradients of w1 and w2 with respect to loss
grad_y_pred = 2.0 * (y_pred - y)
grad_w2 = h_relu.T.dot(grad_y_pred)
grad_h_relu = grad_y_pred.dot(w2.T)
grad_h = grad_h_relu.copy()
grad_h[h < 0] = 0
grad_w1 = x.T.dot(grad_h)
# Update weights
w1 -= learning_rate * grad_w1
w2 -= learning_rate * grad_w2
To include batch size in PyTorch basic examples, the easiest and cleanest way is to use PyTorch torch.utils.data.DataLoader and torch.utils.data.TensorDataset.
Dataset stores the samples and their corresponding labels, and DataLoader wraps an iterable around the Dataset to enable easy access to the samples.
DataLoader will take care of creating batches for you.
Building on your question, there is a complete code snippet, where we iterate over a dataset of 10000 examples for 2 epochs with a batch size of 64:
import torch
from torch.utils.data import DataLoader, TensorDataset
# Create the dataset with N_SAMPLES samples
N_SAMPLES, D_in, H, D_out = 10000, 1000, 100, 10
x = torch.randn(N_SAMPLES, D_in)
y = torch.randn(N_SAMPLES, D_out)
# Define the batch size and the number of epochs
BATCH_SIZE = 64
N_EPOCHS = 2
# Use torch.utils.data to create a DataLoader
# that will take care of creating batches
dataset = TensorDataset(x, y)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
# Define model, loss and optimizer
model = torch.nn.Sequential(
torch.nn.Linear(D_in, H),
torch.nn.ReLU(),
torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# Get the dataset size for printing (it is equal to N_SAMPLES)
dataset_size = len(dataloader.dataset)
# Loop over epochs
for epoch in range(N_EPOCHS):
print(f"Epoch {epoch + 1}\n-------------------------------")
# Loop over batches in an epoch using DataLoader
for id_batch, (x_batch, y_batch) in enumerate(dataloader):
y_batch_pred = model(x_batch)
loss = loss_fn(y_batch_pred, y_batch)
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Every 100 batches, print the loss for this batch
# as well as the number of examples processed so far
if id_batch % 100 == 0:
loss, current = loss.item(), (id_batch + 1)* len(x_batch)
print(f"loss: {loss:>7f} [{current:>5d}/{dataset_size:>5d}]")
The output should be something like:
Epoch 1
-------------------------------
loss: 643.433716 [ 64/10000]
loss: 648.195435 [ 6464/10000]
Epoch 2
-------------------------------
loss: 613.619873 [ 64/10000]
loss: 625.018555 [ 6464/10000]

In simple multi-layer FFNN only ReLU activation function doesn't converge

I'm learning tensorflow, deep learning and experimenting various kinds of activation functions.
I created a multi-layer FFNN for the MNIST problem. Mostly based on the tutorial from the official tensorflow website, except that 3 hidden layers were added.
The activation functions I have experimented are: tf.sigmoid, tf.nn.tanh, tf.nn.softsign, tf.nn.softmax, tf.nn.relu. Only tf.nn.relu doesn't converge, the network output random noise (testing accuracy is about 10%). The following are my source code:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
x = tf.placeholder(tf.float32, [None, 784])
W0 = tf.Variable(tf.random_normal([784, 200]))
b0 = tf.Variable(tf.random_normal([200]))
hidden0 = tf.nn.relu(tf.matmul(x, W0) + b0)
W1 = tf.Variable(tf.random_normal([200, 200]))
b1 = tf.Variable(tf.random_normal([200]))
hidden1 = tf.nn.relu(tf.matmul(hidden0, W1) + b1)
W2 = tf.Variable(tf.random_normal([200, 200]))
b2 = tf.Variable(tf.random_normal([200]))
hidden2 = tf.nn.relu(tf.matmul(hidden1, W2) + b2)
W3 = tf.Variable(tf.random_normal([200, 10]))
b3 = tf.Variable(tf.random_normal([10]))
y = tf.matmul(hidden2, W3) + b3
y_ = tf.placeholder(tf.float32, [None, 10])
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy)
with tf.Session() as session:
session.run(tf.global_variables_initializer())
for _ in range(10000):
batch_xs, batch_ys = mnist.train.next_batch(128)
session.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
if _ % 1000 == 0:
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(_, session.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print('final:', session.run(accuracy, feed_dict={x: mnist.test.images,
y_: mnist.test.labels}))
The code outputs something like this:
0 0.098
1000 0.098
2000 0.098
3000 0.098
4000 0.098
5000 0.098
6000 0.098
7000 0.098
8000 0.098
9000 0.098
final: 0.098
If tf.nn.relu is replaced with other activation functions the network accuracy improves gradually (with different final accuracy though), which is expected.
I have read in may textbooks/tutorials that ReLU should be the first candidate as activation function.
My question is why ReLU doesn't work in my network? or my program is simply wrong ?
You are using the Relu activation function that computes the activation as follows,
max(features, 0)
Since it outputs the max value, this sometimes causes the exploding gradient.
Gradientdecnt optimizer update the weight via the following,
∆wij = −η
∂Ei/
∂wij
where η is the learning rate and ∂Ei/∂wij is the partial derivation of the loss w.r.t weight. When max values gets larger and larger, partial derivations also gets larger and causes the exploding gradient. Therefore, as you can observe in the equation, you need to tune the learning rate (η) to overcome this situation.
A common rule is to reduce the learning rate, usually by a factor of 10 each time.
For your case, set the learning rate = 0.001 and will improve the accuracy.
Hope this helps.

TensorFlow learning rate decay - how to properly supply the step number for decay?

I am training my deep network in TensorFlow and I am trying to use a learning rate decay with it. As far as I see I should use train.exponential_decay function for that - it will calculate the proper learning rate value for current training step using various parameters. I just need to provide it with a step which is performed right now. I suspected I should use tf.placeholder(tf.int32) as usual when I need to provide something into the network, but seems like I am wrong. When I do this I get the below error:
TypeError: Input 'ref' of 'AssignAdd' Op requires l-value input
What am I doing wrong? Unfortunately, I haven't managed to find some good example of network training with decay. My whole code is below. Network has 2 hidden ReLU layers, has L2 penalty on weights and has dropout on both hidden layers.
#We try the following - 2 ReLU layers
#Dropout on both of them
#Also L2 regularization on them
#and learning rate decay also
#batch size for SGD
batch_size = 128
#beta parameter for L2 loss
beta = 0.001
#that's how many hidden neurons we want
num_hidden_neurons = 1024
#learning rate decay
#starting value, number of steps decay is performed,
#size of the decay
start_learning_rate = 0.05
decay_steps = 1000
decay_size = 0.95
#building tensorflow graph
graph = tf.Graph()
with graph.as_default():
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32,
shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
#now let's build our first hidden layer
#its weights
hidden_weights_1 = tf.Variable(
tf.truncated_normal([image_size * image_size, num_hidden_neurons]))
hidden_biases_1 = tf.Variable(tf.zeros([num_hidden_neurons]))
#now the layer 1 itself. It multiplies data by weights, adds biases
#and takes ReLU over result
hidden_layer_1 = tf.nn.relu(tf.matmul(tf_train_dataset, hidden_weights_1) + hidden_biases_1)
#add dropout on hidden layer 1
#we pick up the probabylity of switching off the activation
#and perform the switch off of the activations
keep_prob = tf.placeholder("float")
hidden_layer_drop_1 = tf.nn.dropout(hidden_layer_1, keep_prob)
#now let's build our second hidden layer
#its weights
hidden_weights_2 = tf.Variable(
tf.truncated_normal([num_hidden_neurons, num_hidden_neurons]))
hidden_biases_2 = tf.Variable(tf.zeros([num_hidden_neurons]))
#now the layer 2 itself. It multiplies data by weights, adds biases
#and takes ReLU over result
hidden_layer_2 = tf.nn.relu(tf.matmul(hidden_layer_drop_1, hidden_weights_2) + hidden_biases_2)
#add dropout on hidden layer 2
#we pick up the probabylity of switching off the activation
#and perform the switch off of the activations
hidden_layer_drop_2 = tf.nn.dropout(hidden_layer_2, keep_prob)
#time to go for output linear layer
#out weights connect hidden neurons to output labels
#biases are added to output labels
out_weights = tf.Variable(
tf.truncated_normal([num_hidden_neurons, num_labels]))
out_biases = tf.Variable(tf.zeros([num_labels]))
#compute output
#notice that upon training we use the switched off activations
#i.e. the variaction of hidden_layer with the dropout active
out_layer = tf.matmul(hidden_layer_drop_2,out_weights) + out_biases
#our real output is a softmax of prior result
#and we also compute its cross-entropy to get our loss
#Notice - we introduce our L2 here
loss = (tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
out_layer, tf_train_labels) +
beta*tf.nn.l2_loss(hidden_weights_1) +
beta*tf.nn.l2_loss(hidden_biases_1) +
beta*tf.nn.l2_loss(hidden_weights_2) +
beta*tf.nn.l2_loss(hidden_biases_2) +
beta*tf.nn.l2_loss(out_weights) +
beta*tf.nn.l2_loss(out_biases)))
#variable to count number of steps taken
global_step = tf.placeholder(tf.int32)
#compute current learning rate
learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, decay_steps, decay_size)
#use it in optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
#nice, now let's calculate the predictions on each dataset for evaluating the
#performance so far
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(out_layer)
valid_relu_1 = tf.nn.relu( tf.matmul(tf_valid_dataset, hidden_weights_1) + hidden_biases_1)
valid_relu_2 = tf.nn.relu( tf.matmul(valid_relu_1, hidden_weights_2) + hidden_biases_2)
valid_prediction = tf.nn.softmax( tf.matmul(valid_relu_2, out_weights) + out_biases)
test_relu_1 = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights_1) + hidden_biases_1)
test_relu_2 = tf.nn.relu( tf.matmul( test_relu_1, hidden_weights_2) + hidden_biases_2)
test_prediction = tf.nn.softmax(tf.matmul(test_relu_2, out_weights) + out_biases)
#now is the actual training on the ANN we built
#we will run it for some number of steps and evaluate the progress after
#every 500 steps
#number of steps we will train our ANN
num_steps = 3001
#actual training
with tf.Session(graph=graph) as session:
tf.initialize_all_variables().run()
print("Initialized")
for step in range(num_steps):
# Pick an offset within the training data, which has been randomized.
# Note: we could use better randomization across epochs.
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
# Generate a minibatch.
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
# Prepare a dictionary telling the session where to feed the minibatch.
# The key of the dictionary is the placeholder node of the graph to be fed,
# and the value is the numpy array to feed to it.
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : 0.5, global_step: step}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
Instead of using a placeholder for global_step, try using a Variable.
global_step = tf.Variable(0)
You will have to remove global_step from the feed_dict. Note that you don't have to increment global_step manually, tensorflow will do it automatically for you.

How does one do Inference with Batch Normalization with Tensor Flow?

I was reading the original paper on BN and the stack overflow question on How could I use Batch Normalization in TensorFlow? which provides a very useful piece of code to insert a batch normalization block to a Neural Network but does not provides enough guidance on how to actually use it during training, inference and when evaluating models.
For example, I would like to track the train error during training and test error to make sure I don't overfit. Its clear that the batch normalization block should be off during test, but when evaluating the error on the training set, should the batch normalization block be turned off too? My main questions are:
During inference and error evaluation, should the batch normalization block be turned off regardless of the data set?
Does that mean that the batch normalization block should only be on during the training step then?
To make it very clear, I will provide an extract (of simplified) code I have been using to run batch normalization with Tensor flow according to what is my understanding of what is the right thing to do:
## TRAIN
if phase_train is not None:
#DO BN
feed_dict_train = {x:X_train, y_:Y_train, phase_train: False}
feed_dict_cv = {x:X_cv, y_:Y_cv, phase_train: False}
feed_dict_test = {x:X_test, y_:Y_test, phase_train: False}
else:
#Don't do BN
feed_dict_train = {x:X_train, y_:Y_train}
feed_dict_cv = {x:X_cv, y_:Y_cv}
feed_dict_test = {x:X_test, y_:Y_test}
def get_batch_feed(X, Y, M, phase_train):
mini_batch_indices = np.random.randint(M,size=M)
Xminibatch = X[mini_batch_indices,:] # ( M x D^(0) )
Yminibatch = Y[mini_batch_indices,:] # ( M x D^(L) )
if phase_train is not None:
#DO BN
feed_dict = {x: Xminibatch, y_: Yminibatch, phase_train: True}
else:
#Don't do BN
feed_dict = {x: Xminibatch, y_: Yminibatch}
return feed_dict
with tf.Session() as sess:
sess.run( tf.initialize_all_variables() )
for iter_step in xrange(steps):
feed_dict_batch = get_batch_feed(X_train, Y_train, M, phase_train)
# Collect model statistics
if iter_step%report_error_freq == 0:
train_error = sess.run(fetches=l2_loss, feed_dict=feed_dict_train)
cv_error = sess.run(fetches=l2_loss, feed_dict=feed_dict_cv)
test_error = sess.run(fetches=l2_loss, feed_dict=feed_dict_test)
do_stuff_with_errors(train_error, cv_error, test_error)
# Run Train Step
sess.run(fetches=train_step, feed_dict=feed_dict_batch)
and the code I am using to produce batch normalization blocks is:
def standard_batch_norm(l, x, n_out, phase_train, scope='BN'):
"""
Batch normalization on feedforward maps.
Args:
x: Vector
n_out: integer, depth of input maps
phase_train: boolean tf.Varialbe, true indicates training phase
scope: string, variable scope
Return:
normed: batch-normalized maps
"""
with tf.variable_scope(scope+l):
#beta = tf.Variable(tf.constant(0.0, shape=[n_out], dtype=tf.float64 ), name='beta', trainable=True, dtype=tf.float64 )
#gamma = tf.Variable(tf.constant(1.0, shape=[n_out],dtype=tf.float64 ), name='gamma', trainable=True, dtype=tf.float64 )
init_beta = tf.constant(0.0, shape=[n_out], dtype=tf.float64)
init_gamma = tf.constant(1.0, shape=[n_out],dtype=tf.float64)
beta = tf.get_variable(name='beta'+l, dtype=tf.float64, initializer=init_beta, regularizer=None, trainable=True)
gamma = tf.get_variable(name='gamma'+l, dtype=tf.float64, initializer=init_gamma, regularizer=None, trainable=True)
batch_mean, batch_var = tf.nn.moments(x, [0], name='moments')
ema = tf.train.ExponentialMovingAverage(decay=0.5)
def mean_var_with_update():
ema_apply_op = ema.apply([batch_mean, batch_var])
with tf.control_dependencies([ema_apply_op]):
return tf.identity(batch_mean), tf.identity(batch_var)
mean, var = tf.cond(phase_train, mean_var_with_update, lambda: (ema.average(batch_mean), ema.average(batch_var)))
normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-3)
return normed
I found that there is 'official' batch_norm layer in tensorflow. Try it out:
https://github.com/tensorflow/tensorflow/blob/b826b79718e3e93148c3545e7aa3f90891744cc0/tensorflow/contrib/layers/python/layers/layers.py#L100
Most likely it is not mentioned in docs since it included in some RC or 'beta' version only.
I haven't inspected deep into this matter yet, but as far as I see from documentation you just use binary parameter is_training in this batch_norm layer, and set it to true only for training phase. Try it out.
UPDATE: Below is the code to load data, build a network with one hidden ReLU layer and L2 normalization and introduce batch normalization for both hidden and out layer. This runs fine and trains fine.
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
pickle_file = '/home/maxkhk/Documents/Udacity/DeepLearningCourse/SourceCode/tensorflow/examples/udacity/notMNIST.pickle'
with open(pickle_file, 'rb') as f:
save = pickle.load(f)
train_dataset = save['train_dataset']
train_labels = save['train_labels']
valid_dataset = save['valid_dataset']
valid_labels = save['valid_labels']
test_dataset = save['test_dataset']
test_labels = save['test_labels']
del save # hint to help gc free up memory
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
image_size = 28
num_labels = 10
def reformat(dataset, labels):
dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
# Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
/ predictions.shape[0])
#for NeuralNetwork model code is below
#We will use SGD for training to save our time. Code is from Assignment 2
#beta is the new parameter - controls level of regularization.
#Feel free to play with it - the best one I found is 0.001
#notice, we introduce L2 for both biases and weights of all layers
batch_size = 128
beta = 0.001
#building tensorflow graph
graph = tf.Graph()
with graph.as_default():
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32,
shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
#introduce batchnorm
tf_train_dataset_bn = tf.contrib.layers.batch_norm(tf_train_dataset)
#now let's build our new hidden layer
#that's how many hidden neurons we want
num_hidden_neurons = 1024
#its weights
hidden_weights = tf.Variable(
tf.truncated_normal([image_size * image_size, num_hidden_neurons]))
hidden_biases = tf.Variable(tf.zeros([num_hidden_neurons]))
#now the layer itself. It multiplies data by weights, adds biases
#and takes ReLU over result
hidden_layer = tf.nn.relu(tf.matmul(tf_train_dataset_bn, hidden_weights) + hidden_biases)
#adding the batch normalization layerhi()
hidden_layer_bn = tf.contrib.layers.batch_norm(hidden_layer)
#time to go for output linear layer
#out weights connect hidden neurons to output labels
#biases are added to output labels
out_weights = tf.Variable(
tf.truncated_normal([num_hidden_neurons, num_labels]))
out_biases = tf.Variable(tf.zeros([num_labels]))
#compute output
out_layer = tf.matmul(hidden_layer_bn,out_weights) + out_biases
#our real output is a softmax of prior result
#and we also compute its cross-entropy to get our loss
#Notice - we introduce our L2 here
loss = (tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
out_layer, tf_train_labels) +
beta*tf.nn.l2_loss(hidden_weights) +
beta*tf.nn.l2_loss(hidden_biases) +
beta*tf.nn.l2_loss(out_weights) +
beta*tf.nn.l2_loss(out_biases)))
#now we just minimize this loss to actually train the network
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
#nice, now let's calculate the predictions on each dataset for evaluating the
#performance so far
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(out_layer)
valid_relu = tf.nn.relu( tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)
valid_prediction = tf.nn.softmax( tf.matmul(valid_relu, out_weights) + out_biases)
test_relu = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights) + hidden_biases)
test_prediction = tf.nn.softmax(tf.matmul(test_relu, out_weights) + out_biases)
#now is the actual training on the ANN we built
#we will run it for some number of steps and evaluate the progress after
#every 500 steps
#number of steps we will train our ANN
num_steps = 3001
#actual training
with tf.Session(graph=graph) as session:
tf.initialize_all_variables().run()
print("Initialized")
for step in range(num_steps):
# Pick an offset within the training data, which has been randomized.
# Note: we could use better randomization across epochs.
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
# Generate a minibatch.
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
# Prepare a dictionary telling the session where to feed the minibatch.
# The key of the dictionary is the placeholder node of the graph to be fed,
# and the value is the numpy array to feed to it.
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Resources