TensorFlow classification does not match actual output

I wrote code in TensorFlow for linear classification. I generated fake data based on the rule: if the difference is greater than x (some constant), the output should be [1, 0]; otherwise the output should be [0, 1]. Here is my code:
import argparse
import sys

import tensorflow as tf

import data_supplier

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def main(_):
    # Import data
    # Create the model
    x = tf.placeholder(tf.float32, [None, 2])
    W1 = weight_variable([2, 2])
    b1 = bias_variable([2])
    y2 = tf.nn.softmax(tf.matmul(x, W1) + b1)

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 2])
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y2))
    train_step = tf.train.GradientDescentOptimizer(0.001).minimize(cross_entropy)

    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()

    # Train
    for _ in range(10000):
        batch_xs, batch_ys = data_supplier.next_batch(100)
        sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

    # Test trained model
    test_batch_x, test_batch_y = data_supplier.test_data()
    correct_prediction = tf.equal(tf.argmax(y2, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(sess.run(accuracy, feed_dict={x: test_batch_x,
                                        y_: test_batch_y}))
    print(x.eval(feed_dict={x: test_batch_x, y_: test_batch_y}))
    print(y2.eval(feed_dict={x: test_batch_x, y_: test_batch_y}))
    print(y_.eval(feed_dict={x: test_batch_x, y_: test_batch_y}))
    print(W1.eval())
    print(b1.eval())
    print(cross_entropy.eval(feed_dict={x: test_batch_x, y_: test_batch_y}))

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data',
                        help='Directory for storing input data')
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
And here's the data_supplier code:
import numpy as np
import pandas as pd

TOTAL_DATA_SIZE = 50000
TRAIN_DATA_SIZE = 40000
VALIDATION_DATA_SIZE = 5000
TEST_DATA_SIZE = 5000

COLUMNS = ["a", "b", "output", "outputbar"]
FEATURES = ["a", "b"]
LABELS = ["output", "outputbar"]

training_set = pd.read_csv("train.csv", skipinitialspace=True, skiprows=1, names=COLUMNS)
training_set_features = training_set.as_matrix(columns=FEATURES)
training_set_labels = training_set.as_matrix(columns=LABELS)

test_set = pd.read_csv("test.csv", skipinitialspace=True, skiprows=1, names=COLUMNS)
test_set_features = test_set.as_matrix(columns=FEATURES)
test_set_labels = test_set.as_matrix(columns=LABELS)

def next_batch(BATCH_SIZE):
    k = np.random.randint(0, TRAIN_DATA_SIZE - BATCH_SIZE)
    return training_set_features[k:k+BATCH_SIZE], training_set_labels[k:k+BATCH_SIZE]

def test_data():
    return test_set_features, test_set_labels
And here's the output:
accuracy: 0.6852
Input: [[ 0.51166666 0.79333335]
[ 0.85833335 0.21833333]
[ 0.80333334 0.48333332]
...,
[ 0.28333333 0.96499997]
[ 0.97666669 0.84833336]
[ 0.57666665 0.21333334]]
Predictions: [[ 0.80804855 0.19195142]
[ 0.78380686 0.21619321]
[ 0.80210352 0.19789645]
...,
[ 0.80708122 0.19291875]
[ 0.83949649 0.16050354]
[ 0.76328743 0.23671262]]
Actual output: [[ 0. 1.]
[ 1. 0.]
[ 1. 0.]
...,
[ 1. 0.]
[ 1. 0.]
[ 1. 0.]]
Weights: [[ 0.3034386 -0.10369452]
[ 0.29422989 -0.21103808]]
Bias: [ 0.5141086 -0.3141087]
Cross Entropy: 0.624272
The accuracy is currently meaningless, as every prediction is classified as [1, 0]. What mistake am I making?

You should use softmax_cross_entropy_with_logits() the way it is designed to be used: https://www.tensorflow.org/api_docs/python/tf/nn/softmax_cross_entropy_with_logits.
The documentation warns that the function expects unscaled logits, i.e. you should not apply a softmax operation when computing y2: y2 = tf.matmul(x, W1) + b1. For the test set (and for predictions in general), perform the softmax operation separately: y_out_test = tf.nn.softmax(y2), or something like that.
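In terms of the code in the question, the fix might look like this (a sketch that keeps the original variable names; the rest of the script stays the same):
# y2 is kept as raw logits; no softmax here.
y2 = tf.matmul(x, W1) + b1

# softmax_cross_entropy_with_logits applies the softmax internally,
# so it must receive the unscaled logits.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y2))
train_step = tf.train.GradientDescentOptimizer(0.001).minimize(cross_entropy)

# For predictions, apply the softmax separately. The accuracy is unchanged if
# you take the argmax of the logits directly, since softmax is monotonic.
y_out = tf.nn.softmax(y2)
correct_prediction = tf.equal(tf.argmax(y_out, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))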
Maybe this will already solve your problem.
If not: when only a single class is predicted, that is often a hint of an imbalance in the data set, i.e. one class occurs much more frequently than the other. You should check whether this is the case for you. If so, you might find some advice on how to deal with it, for example here: http://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/. I did not check this site in detail, so I cannot tell you whether it is particularly helpful, but you might want to consult some web pages if your data set is heavily imbalanced.
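As a quick balance check (a sketch using the one-hot training_set_labels array from the data_supplier code above; each column sum gives that class's count):
import numpy as np
class_counts = training_set_labels.sum(axis=0)   # counts of [1, 0] and [0, 1] labels
print(class_counts / class_counts.sum())         # fraction of each class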

Related

How to compute the uncertainty of a Monte Carlo Dropout neural network with PyTorch?

I am trying to implement a Bayesian CNN using MC Dropout in PyTorch. The main idea is that by applying dropout at test time and running many forward passes, you get predictions from a variety of different models. I need to obtain the uncertainty; does anyone have an idea of how I can do it?
This is how I defined my CNN
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        self.dropout = nn.Dropout(p=0.3)
        nn.init.xavier_uniform_(self.conv1.weight)
        nn.init.constant_(self.conv1.bias, 0.0)
        nn.init.xavier_uniform_(self.conv2.weight)
        nn.init.constant_(self.conv2.bias, 0.0)
        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.constant_(self.fc1.bias, 0.0)
        nn.init.xavier_uniform_(self.fc2.weight)
        nn.init.constant_(self.fc2.bias, 0.0)
        nn.init.xavier_uniform_(self.fc3.weight)
        nn.init.constant_(self.fc3.bias, 0.0)

    def forward(self, x):
        x = self.pool(F.relu(self.dropout(self.conv1(x))))  # recommended to add the relu
        x = self.pool(F.relu(self.dropout(self.conv2(x))))  # recommended to add the relu
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(self.dropout(x)))
        x = self.fc3(self.dropout(x))  # no activation function needed for the last layer
        return x

model = Net().to(device)
train_accuracies = np.zeros(num_epochs)
test_accuracies = np.zeros(num_epochs)
dataiter = iter(trainloader)
images, labels = dataiter.next()

# initializing variables
loss_acc = []
class_acc_mcdo = []
start_train = True

# Defining the Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

def train():
    loss_vals = []
    acc_vals = []
    for epoch in range(num_epochs):  # loop over the dataset multiple times
        n_correct = 0   # initialize number of correct predictions
        acc = 0         # initialize accuracy of each epoch
        somme = 0       # initialize sum of losses of each epoch
        epoch_loss = []
        for i, (images, labels) in enumerate(trainloader):
            # origin shape: [4, 3, 32, 32] = 4, 3, 1024
            # input_layer: 3 input channels, 6 output channels, 5 kernel size
            images = images.to(device)
            labels = labels.to(device)
            # Forward pass
            outputs = model.train()(images)
            loss = criterion(outputs, labels)
            # Backward and optimize
            optimizer.zero_grad()  # zero the parameter gradients
            loss.backward()
            epoch_loss.append(loss.item())  # add the loss to epoch_loss list
            optimizer.step()
            # max returns (value, index)
            _, predicted = torch.max(outputs, 1)
            n_correct += (predicted == labels).sum().item()
            # print statistics
            if (i + 1) % 2000 == 0:
                print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{n_total_steps}], Loss: {loss.item():.4f}')
        somme = (sum(epoch_loss)) / len(epoch_loss)
        loss_vals.append(somme)  # add the epoch's loss to loss_vals
        print("Loss = {}".format(somme))
        acc = 100 * n_correct / len(trainset)
        acc_vals.append(acc)  # add the epoch's accuracy to acc_vals
        print("Accuracy = {}".format(acc))
    # SAVE
    PATH = './cnn.pth'
    torch.save(model.state_dict(), PATH)
    loss_acc.append(loss_vals)
    loss_acc.append(acc_vals)
    return loss_acc
And here is the code for the MC Dropout test:
def enable_dropout(model):
    """ Function to enable the dropout layers during test-time """
    for m in model.modules():
        if m.__class__.__name__.startswith('Dropout'):
            m.train()

def test():
    # set non-dropout layers to eval mode
    model.eval()
    # set dropout layers to train mode
    enable_dropout(model)
    test_loss = 0
    correct = 0
    n_samples = 0
    n_class_correct = [0 for i in range(10)]
    n_class_samples = [0 for i in range(10)]
    T = 100
    for images, labels in testloader:
        images = images.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            output_list = []
            # getting outputs for T forward passes
            for i in range(T):
                output_list.append(torch.unsqueeze(model(images), 0))
            # calculating mean
            output_mean = torch.cat(output_list, 0).mean(0)
            test_loss += F.nll_loss(F.log_softmax(output_mean, dim=1), labels,
                                    reduction='sum').data  # sum up batch loss
            _, predicted = torch.max(output_mean, 1)  # get the index of the max log-probability
            correct += (predicted == labels).sum().item()  # sum up correct predictions
            n_samples += labels.size(0)
            for i in range(batch_size):
                label = labels[i]
                predi = predicted[i]
                if (label == predi):
                    n_class_correct[label] += 1
                n_class_samples[label] += 1
    test_loss /= len(testloader.dataset)
    # PRINT TO HTML PAGE
    print('\n Average loss: {:.4f}, Accuracy: ({:.3f}%)\n'.format(
        test_loss,
        100. * correct / n_samples))
    # Accuracy for each class
    acc_classes = []
    for i in range(10):
        acc = 100.0 * n_class_correct[i] / n_class_samples[i]
        print(f'Accuracy of {classes[i]}: {acc} %')
        acc_classes.append(acc)
    class_acc_mcdo.extend(acc_classes)
    print('Finished Testing')
You can compute statistics, such as the sample mean or the sample variance, of the different stochastic forward passes at test time (i.e. with the test or validation data) while dropout is enabled. These statistics can be used to represent uncertainty. For example, you can compute the entropy, which is a measure of uncertainty, from the sample mean.
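A minimal sketch of that computation, assuming stacked holds the T stacked forward passes from the test() function above (i.e. torch.cat(output_list, 0), with shape [T, batch_size, num_classes]):
probs = F.softmax(stacked, dim=2)          # per-pass class probabilities
mean_probs = probs.mean(dim=0)             # sample mean over the T passes, [batch_size, num_classes]
var_probs = probs.var(dim=0)               # sample variance over the T passes (one uncertainty measure)
# predictive entropy of the mean distribution: one scalar uncertainty value per input
entropy = -(mean_probs * torch.log(mean_probs + 1e-12)).sum(dim=1)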

Predicting probabilities in a TensorFlow classifier

Hey, I am pretty new to TensorFlow. I am building a classification model that basically classifies into 0/1. Is there a way to predict the probability of the output being 1? Can predict_proba be used here? It is widely used in tflearn.dnn, but I can't find any reference on how to do it in my case.
def main():
    train_x, test_x, train_y, test_y = load_csv_data()
    x_size = train_x.shape[1]
    y_size = train_y.shape[1]
    print(x_size)
    print(y_size)
    # variables
    X = tf.placeholder("float", shape=[None, x_size])
    y = tf.placeholder("float", shape=[None, y_size])
    weights_1 = initialize_weights((x_size, h_size))
    weights_2 = initialize_weights((h_size, y_size))
    # Forward propagation
    y_pred = forward_propagation(X, weights_1, weights_2)
    predict = tf.argmax(y_pred, dimension=1)
    # Backward propagation
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=y_pred))
    updates_sgd = tf.train.GradientDescentOptimizer(sgd_step).minimize(cost)
    # Start tensorflow session
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        steps = 1
        sess.run(init)
        x = np.arange(steps)
        test_acc = []
        train_acc = []
        print("Step, train accuracy, test accuracy")
        for step in range(steps):
            # Train with each example
            batch_size = len(train_x)
            avg_cost = 0
            print(batch_size)
            for i in range(len(train_x)):
                _, c = sess.run([updates_sgd, cost], feed_dict={X: train_x[i: i + 1], y: train_y[i: i + 1]})
                print(c)
                avg_cost += c / batch_size
            train_accuracy = np.mean(np.argmax(train_y, axis=1) ==
                                     sess.run(predict, feed_dict={X: train_x, y: train_y}))
            test_accuracy = np.mean(np.argmax(test_y, axis=1) ==
                                    sess.run(predict, feed_dict={X: test_x, y: test_y}))
            print(avg_cost)
            print("%d, %.2f%%, %.2f%%"
                  % (step + 1, 100. * train_accuracy, 100. * test_accuracy))
            test_acc.append(100. * test_accuracy)
            train_acc.append(100. * train_accuracy)
        predict = tf.argmax(y_pred, 1)
        test_data = load_test_data()
        print(test_data)
        pred = predict.eval(feed_dict={X: test_data})
        print(pred)
        for x in range(0, 100):
            print(pred[x])
        print(np.unique(pred))

main()
Here you take the argmax of the network output:
predict = tf.argmax(y_pred, dimension=1)
That returns class indices, not probabilities. y_pred itself holds the raw scores (it is passed to softmax_cross_entropy_with_logits as logits), so apply a softmax to y_pred and evaluate that instead of predict if you want probabilities.
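A sketch of what that could look like at the end of the session block (probs is a new name introduced here for illustration):
probs = tf.nn.softmax(y_pred)                        # class probabilities, shape [n_samples, y_size]
pred_probs = probs.eval(feed_dict={X: test_data})
print(pred_probs[:, 1])                              # probability of class 1, assuming column 1 is the "1" class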

Why won't my trivial LSTM overfit?

I created a very trivial LSTM to try to predict a short sequence, but it won't overfit and approach a loss of zero the way I expect.
Instead it just converges around a loss of ~1.5, even if it definitely has enough degrees of freedom to learn this sequence verbatim.
import tensorflow as tf
import time
tf.logging.set_verbosity(tf.logging.DEBUG)
#
# Training data, just a single sequence
#
train_input = [[0, 1, 2, 3, 4, 5, 0, 6, 7, 0]]
train_output = [[1, 2, 3, 4, 5, 0, 6, 7, 8, 0]]
#
# Training metadata
#
batch_size = 1
sequence_length = 10
n_classes = 9
# Network size
rnn_cell_size = 10
rnn_layers = 2
embedding_rank = 3
#
# Training hyperparameters
#
epochs = 100
n_batches = 100
learning_rate = 0.01
#
# Model
#
features = tf.placeholder(tf.int32, [None, sequence_length], name="features")
embeddings = tf.Variable(tf.random_uniform([n_classes, embedding_rank], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, features)
cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_cell_size) for i in range(rnn_layers)])
initial_state = cell.zero_state(batch_size, tf.float32)
cell, _ = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)
# Convert sequences x batches x outputs to (sequences * batches) x outputs
flat_lstm_output = tf.reshape(cell, [-1, rnn_cell_size])
output = tf.contrib.layers.fully_connected(inputs=flat_lstm_output, num_outputs=n_classes)
softmax = tf.nn.softmax(output)
#
# Training
#
targets = tf.placeholder(tf.int32, [None, sequence_length])
# Convert sequences x batches x targets to (sequences * batches) x targets
flat_targets = tf.reshape(targets, [-1])
loss = tf.losses.sparse_softmax_cross_entropy(flat_targets, softmax)
train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(epochs):
        loss_sum = 0
        epoch_start = time.time()
        for j in range(n_batches):
            _, step_loss = sess.run([train_op, loss], {
                features: train_input,
                targets: train_output,
            })
            loss_sum = loss_sum + step_loss
        print('avg_loss', loss_sum / n_batches, 'avg_time', (time.time() - epoch_start) / n_batches)
I get the feeling something very basic is missing here - what am I doing wrong?
EDIT
I tried to simplify it even more, and now I'm down to the following even more trivial example (that also doesn't converge):
import tensorflow as tf
import time
tf.logging.set_verbosity(tf.logging.DEBUG)
#
# Training data, just a single sequence
#
train_input = [0, 1, 2, 3, 4]
train_output = [1, 2, 3, 4, 5]
#
# Training metadata
#
batch_size = 1
sequence_length = 5
n_classes = 6
#
# Training hyperparameters
#
epochs = 100
n_batches = 100
learning_rate = 0.01
#
# Model
#
features = tf.placeholder(tf.int32, [None])
one_hot = tf.contrib.layers.one_hot_encoding(features, n_classes)
output = tf.contrib.layers.fully_connected(inputs=one_hot, num_outputs=10)
output = tf.contrib.layers.fully_connected(inputs=output, num_outputs=n_classes)
#
# Training
#
targets = tf.placeholder(tf.int32, [None])
one_hot_targets = tf.one_hot(targets, depth=n_classes)
loss = tf.losses.softmax_cross_entropy(one_hot_targets, output)
train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(epochs):
        loss_sum = 0
        epoch_start = time.time()
        for j in range(n_batches):
            _, step_loss = sess.run([train_op, loss], {
                features: train_input,
                targets: train_output,
            })
            loss_sum = loss_sum + step_loss
        print('avg_loss', loss_sum / n_batches, 'avg_time', (time.time() - epoch_start) / n_batches)
Did you try lower values for the learning rate (e.g., 0.001 or 0.0001)?
Your networks aren't fitting (let alone overfitting) because you don't have enough data. The LSTM has only one sequence and the MLP has 5 datapoints.
Compare this with the number of parameters you need to estimate: your MLP has 120 parameters (if I'm counting correctly). There is no way you can estimate all these with only 5 datapoints unless you're very lucky. (you can make it more likely to converge by splitting your sequence into smaller batches, but even then it won't converge very often).
In short, neural networks need a decent amount of data to be usable.
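As a sanity check, you can count the trainable parameters directly from the graph and compare that with the number of datapoints (a sketch for the TF1-style code above):
import numpy as np
n_params = sum(int(np.prod(v.get_shape().as_list())) for v in tf.trainable_variables())
print('trainable parameters:', n_params)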
The answer turned out to be threefold.
1) The example without the RNN converges if I replace the default activation in the fully connected layers (relu) with tanh.
This seems to be because relu ignores a lot of input (everything below zero) and provides no gradient there at all. With more input it might have worked.
2) The example WITH the RNN needs the activation in the final fully connected layer (the one before the softmax) removed entirely with activation_fn=None; it doesn't converge well (or at all, in most combinations) with an activation on that layer.
3) The RNN example also needs the explicit softmax removed, since sparse_softmax_cross_entropy already applies a softmax.
Finally working code:
import tensorflow as tf
import time
tf.logging.set_verbosity(tf.logging.DEBUG)
#
# Training data, just a single sequence
#
train_input = [[0, 1, 2, 3, 4, 5, 0, 6, 7, 0]]
train_output = [[1, 2, 3, 4, 5, 0, 6, 7, 8, 0]]
#
# Training metadata
#
batch_size = 1
sequence_length = 10
n_classes = 9
# Network size
rnn_cell_size = 10
rnn_layers = 2
embedding_rank = 3
#
# Training hyperparameters
#
epochs = 100
n_batches = 100
learning_rate = 0.01
#
# Model
#
features = tf.placeholder(tf.int32, [None, sequence_length], name="features")
embeddings = tf.Variable(tf.random_uniform([n_classes, embedding_rank], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, features)
cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_cell_size) for i in range(rnn_layers)])
initial_state = cell.zero_state(batch_size, tf.float32)
cell, _ = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)
# Convert [batch_size, sequence_length, rnn_cell_size] to [(batch_size * sequence_length), rnn_cell_size]
flat_lstm_output = tf.reshape(cell, [-1, rnn_cell_size])
output = tf.contrib.layers.fully_connected(inputs=flat_lstm_output, num_outputs=n_classes, activation_fn=None)
#
# Training
#
targets = tf.placeholder(tf.int32, [None, sequence_length])
# Convert [batch_size, sequence_length] to [batch_size * sequence_length]
flat_targets = tf.reshape(targets, [-1])
loss = tf.losses.sparse_softmax_cross_entropy(flat_targets, output)
train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(epochs):
        loss_sum = 0
        epoch_start = time.time()
        for j in range(n_batches):
            _, step_loss = sess.run([train_op, loss], {
                features: train_input,
                targets: train_output,
            })
            loss_sum = loss_sum + step_loss
        print('avg_loss', loss_sum / n_batches, 'avg_time', (time.time() - epoch_start) / n_batches)

Why does the location of saver in the script matter when there is a graph object in TensorFlow?

I was training some models and I noticed that when I explicitly defined a graph variable, it mattered where my saver object was created. At first my code looked like this:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("tmp_MNIST_data/", one_hot=True)
x = tf.placeholder(tf.float32, [None, 784])
W = tf.Variable(tf.truncated_normal([784, 10], mean=0.0, stddev=0.1),name='w')
b = tf.Variable(tf.constant(0.1, shape=[10]),name='b')
y = tf.nn.softmax(tf.matmul(x, W) + b)
y_ = tf.placeholder(tf.float32, [None, 10])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1)) # list of booleans indicating correct predictions
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
train_step = tf.train.GradientDescentOptimizer(0.2).minimize(cross_entropy)
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(1001):
        batch_xs, batch_ys = mnist.train.next_batch(100)
        sess.run(fetches=train_step, feed_dict={x: batch_xs, y_: batch_ys})
        if i % 100 == 0:
            saver.save(sess=sess, save_path='./tmp/mdl_ckpt')
    print(sess.run(fetches=accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
Then I decided to change it to something like this, and it seemed very sensitive to where I defined my variables and where I defined saver. If they were not defined right after the graph variable was created, for example, it would raise errors. Similarly, I noticed that saver had to be defined right after at least one variable (being after the definition of the graph was not enough) for the variables to be captured together by the saver. That didn't make sense to me; it would make more sense to require that the saver come after the definitions of all the variables rather than after a single one.
This is how the code looks now (with comments showing the locations of where I've defined saver):
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("tmp_MNIST_data/", one_hot=True)
graph = tf.Graph()
with tf.Session(graph=graph) as sess:
    #saver = tf.train.Saver()
    x = tf.placeholder(tf.float32, [None, 784])
    saver = tf.train.Saver()
    y_ = tf.placeholder(tf.float32, [None, 10])
    #saver = tf.train.Saver()
    W = tf.Variable(tf.truncated_normal([784, 10], mean=0.0, stddev=0.1), name='w')
    #saver = tf.train.Saver()
    b = tf.Variable(tf.constant(0.1, shape=[10]), name='b')
    y = tf.nn.softmax(tf.matmul(x, W) + b)
    #saver = tf.train.Saver()
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))  # list of booleans indicating correct predictions
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
    #saver = tf.train.Saver()
    step = tf.Variable(0, name='step')
    #saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    #saver = tf.train.Saver()
    for i in range(1001):
        batch_xs, batch_ys = mnist.train.next_batch(100)
        sess.run(fetches=train_step, feed_dict={x: batch_xs, y_: batch_ys})
        if i % 100 == 0:
            step_assign = step.assign(i)
            sess.run(step_assign)
            saver.save(sess=sess, save_path='./tmp/mdl_ckpt')
            print(step.eval())
            print([op.name for op in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)])
    print(sess.run(fetches=accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
The code above should work, but I am having a hard time understanding why it behaves like this, or why it makes sense that this would happen. Does anyone know what the right thing to do is?
I'm not entirely sure what's going on here, but I suspect the issue is related to variables going into the wrong graph, or the session having an outdated version of the graph. You create a graph but don't set it as the default, then create a session with that graph... but when you create variables, you don't specify which graph they should go into. Maybe the creation of the session sets the specified graph as the default, but that's not the way TensorFlow was designed to be used, so I wouldn't be surprised if it hasn't been thoroughly tested in this regime.
While I don't have an explanation for what's going on, I can suggest a simple solution: separate graph construction from session running.
graph = tf.Graph()
with graph.as_default():
    build_graph()
    saver = tf.train.Saver()
with tf.Session(graph=graph) as sess:
    do_stuff_with(sess)
    saver.save(sess, path)
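Applied to the model from the question, that pattern might look like this (a sketch; the training loop is elided):
graph = tf.Graph()
with graph.as_default():
    x = tf.placeholder(tf.float32, [None, 784])
    y_ = tf.placeholder(tf.float32, [None, 10])
    W = tf.Variable(tf.truncated_normal([784, 10], mean=0.0, stddev=0.1), name='w')
    b = tf.Variable(tf.constant(0.1, shape=[10]), name='b')
    y = tf.nn.softmax(tf.matmul(x, W) + b)
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
    # Create the saver only after all variables have been added to the graph;
    # by default Saver() captures the variables that exist when it is constructed.
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    # ... run training steps and saver.save(sess, './tmp/mdl_ckpt') here ...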

How does one do inference with Batch Normalization in TensorFlow?

I was reading the original paper on BN and the Stack Overflow question How could I use Batch Normalization in TensorFlow?, which provides a very useful piece of code for inserting a batch normalization block into a neural network, but does not provide enough guidance on how to actually use it during training, inference, and model evaluation.
For example, I would like to track the training error during training and the test error to make sure I don't overfit. It's clear that the batch normalization block should be off during testing, but when evaluating the error on the training set, should the batch normalization block be turned off too? My main questions are:
During inference and error evaluation, should the batch normalization block be turned off regardless of the data set?
Does that mean that the batch normalization block should only be on during the training step then?
To make it very clear, here is an extract of the (simplified) code I have been using to run batch normalization with TensorFlow, according to my understanding of what the right thing to do is:
## TRAIN
if phase_train is not None:
    #DO BN
    feed_dict_train = {x: X_train, y_: Y_train, phase_train: False}
    feed_dict_cv = {x: X_cv, y_: Y_cv, phase_train: False}
    feed_dict_test = {x: X_test, y_: Y_test, phase_train: False}
else:
    #Don't do BN
    feed_dict_train = {x: X_train, y_: Y_train}
    feed_dict_cv = {x: X_cv, y_: Y_cv}
    feed_dict_test = {x: X_test, y_: Y_test}

def get_batch_feed(X, Y, M, phase_train):
    mini_batch_indices = np.random.randint(M, size=M)
    Xminibatch = X[mini_batch_indices, :]  # ( M x D^(0) )
    Yminibatch = Y[mini_batch_indices, :]  # ( M x D^(L) )
    if phase_train is not None:
        #DO BN
        feed_dict = {x: Xminibatch, y_: Yminibatch, phase_train: True}
    else:
        #Don't do BN
        feed_dict = {x: Xminibatch, y_: Yminibatch}
    return feed_dict

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    for iter_step in xrange(steps):
        feed_dict_batch = get_batch_feed(X_train, Y_train, M, phase_train)
        # Collect model statistics
        if iter_step % report_error_freq == 0:
            train_error = sess.run(fetches=l2_loss, feed_dict=feed_dict_train)
            cv_error = sess.run(fetches=l2_loss, feed_dict=feed_dict_cv)
            test_error = sess.run(fetches=l2_loss, feed_dict=feed_dict_test)
            do_stuff_with_errors(train_error, cv_error, test_error)
        # Run Train Step
        sess.run(fetches=train_step, feed_dict=feed_dict_batch)
and the code I am using to produce batch normalization blocks is:
def standard_batch_norm(l, x, n_out, phase_train, scope='BN'):
    """
    Batch normalization on feedforward maps.
    Args:
        x: Vector
        n_out: integer, depth of input maps
        phase_train: boolean tf.Variable, true indicates training phase
        scope: string, variable scope
    Return:
        normed: batch-normalized maps
    """
    with tf.variable_scope(scope + l):
        #beta = tf.Variable(tf.constant(0.0, shape=[n_out], dtype=tf.float64 ), name='beta', trainable=True, dtype=tf.float64 )
        #gamma = tf.Variable(tf.constant(1.0, shape=[n_out],dtype=tf.float64 ), name='gamma', trainable=True, dtype=tf.float64 )
        init_beta = tf.constant(0.0, shape=[n_out], dtype=tf.float64)
        init_gamma = tf.constant(1.0, shape=[n_out], dtype=tf.float64)
        beta = tf.get_variable(name='beta'+l, dtype=tf.float64, initializer=init_beta, regularizer=None, trainable=True)
        gamma = tf.get_variable(name='gamma'+l, dtype=tf.float64, initializer=init_gamma, regularizer=None, trainable=True)
        batch_mean, batch_var = tf.nn.moments(x, [0], name='moments')
        ema = tf.train.ExponentialMovingAverage(decay=0.5)

        def mean_var_with_update():
            ema_apply_op = ema.apply([batch_mean, batch_var])
            with tf.control_dependencies([ema_apply_op]):
                return tf.identity(batch_mean), tf.identity(batch_var)

        mean, var = tf.cond(phase_train, mean_var_with_update, lambda: (ema.average(batch_mean), ema.average(batch_var)))
        normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-3)
    return normed
I found that there is an 'official' batch_norm layer in TensorFlow. Try it out:
https://github.com/tensorflow/tensorflow/blob/b826b79718e3e93148c3545e7aa3f90891744cc0/tensorflow/contrib/layers/python/layers/layers.py#L100
Most likely it is not mentioned in the docs since it is only included in some RC or 'beta' version.
I haven't looked deeply into this matter yet, but as far as I can see from the documentation, you just use the boolean parameter is_training of this batch_norm layer and set it to True only for the training phase. Try it out.
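A minimal sketch of how that might be wired up, assuming a boolean phase_train placeholder as in the question (layer_out and loss stand in for one layer's pre-activation and your loss; is_training and updates_collections are parameters of tf.contrib.layers.batch_norm):
phase_train = tf.placeholder(tf.bool, name='phase_train')   # feed True for train steps, False otherwise

normed = tf.contrib.layers.batch_norm(layer_out, center=True, scale=True,
                                      is_training=phase_train,
                                      updates_collections=tf.GraphKeys.UPDATE_OPS)

# The moving mean/variance used at inference time are updated via UPDATE_OPS,
# so make the train step depend on them.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(loss)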
UPDATE: Below is code to load the data, build a network with one hidden ReLU layer and L2 regularization, and introduce batch normalization for both the hidden and output layers. It runs and trains fine.
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
pickle_file = '/home/maxkhk/Documents/Udacity/DeepLearningCourse/SourceCode/tensorflow/examples/udacity/notMNIST.pickle'
with open(pickle_file, 'rb') as f:
save = pickle.load(f)
train_dataset = save['train_dataset']
train_labels = save['train_labels']
valid_dataset = save['valid_dataset']
valid_labels = save['valid_labels']
test_dataset = save['test_dataset']
test_labels = save['test_labels']
del save # hint to help gc free up memory
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
image_size = 28
num_labels = 10
def reformat(dataset, labels):
dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
# Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
/ predictions.shape[0])
#for NeuralNetwork model code is below
#We will use SGD for training to save our time. Code is from Assignment 2
#beta is the new parameter - controls level of regularization.
#Feel free to play with it - the best one I found is 0.001
#notice, we introduce L2 for both biases and weights of all layers
batch_size = 128
beta = 0.001
#building tensorflow graph
graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    #introduce batchnorm
    tf_train_dataset_bn = tf.contrib.layers.batch_norm(tf_train_dataset)

    #now let's build our new hidden layer
    #that's how many hidden neurons we want
    num_hidden_neurons = 1024
    #its weights
    hidden_weights = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_hidden_neurons]))
    hidden_biases = tf.Variable(tf.zeros([num_hidden_neurons]))

    #now the layer itself. It multiplies data by weights, adds biases
    #and takes ReLU over result
    hidden_layer = tf.nn.relu(tf.matmul(tf_train_dataset_bn, hidden_weights) + hidden_biases)

    #adding the batch normalization layer
    hidden_layer_bn = tf.contrib.layers.batch_norm(hidden_layer)

    #time to go for output linear layer
    #out weights connect hidden neurons to output labels
    #biases are added to output labels
    out_weights = tf.Variable(
        tf.truncated_normal([num_hidden_neurons, num_labels]))
    out_biases = tf.Variable(tf.zeros([num_labels]))

    #compute output
    out_layer = tf.matmul(hidden_layer_bn, out_weights) + out_biases

    #our real output is a softmax of prior result
    #and we also compute its cross-entropy to get our loss
    #Notice - we introduce our L2 here
    loss = (tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        out_layer, tf_train_labels) +
        beta*tf.nn.l2_loss(hidden_weights) +
        beta*tf.nn.l2_loss(hidden_biases) +
        beta*tf.nn.l2_loss(out_weights) +
        beta*tf.nn.l2_loss(out_biases)))

    #now we just minimize this loss to actually train the network
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    #nice, now let's calculate the predictions on each dataset for evaluating the
    #performance so far
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(out_layer)
    valid_relu = tf.nn.relu(tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)
    valid_prediction = tf.nn.softmax(tf.matmul(valid_relu, out_weights) + out_biases)
    test_relu = tf.nn.relu(tf.matmul(tf_test_dataset, hidden_weights) + hidden_biases)
    test_prediction = tf.nn.softmax(tf.matmul(test_relu, out_weights) + out_biases)
#now is the actual training on the ANN we built
#we will run it for some number of steps and evaluate the progress after
#every 500 steps
#number of steps we will train our ANN
num_steps = 3001
#actual training
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
