Why can't I restore this model?

I am currently having trouble restoring this model to make a prediction.
Code:
def neural_network(data):
    with tf.name_scope("network"):
        layer1 = tf.layers.dense(data, 1000, activation=tf.nn.relu, name="hidden_layer1")
        layer2 = tf.layers.dense(layer1, 1000, activation=tf.nn.relu, name="hidden_layer2")
        output = tf.layers.dense(layer2, 2, name="output_layer")
        return output

def evaluate():
    with tf.name_scope("loss"):
        global x
        xentropy = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=neural_network(x))
        loss = tf.reduce_mean(xentropy, name="loss")
    with tf.name_scope("train"):
        optimizer = tf.train.AdamOptimizer()
        training_op = optimizer.minimize(loss)
    with tf.name_scope("exec"):
        with tf.Session() as sess:
            for i in range(1, 10):
                sess.run(tf.global_variables_initializer())
                sess.run(training_op, feed_dict={x: np.array(train_data).reshape([-1, 1]), y: label})
                print "Training " + str(i)
            saver = tf.train.Saver()
            saver.save(sess, "saved_models/testing")
            print "Model Saved."

def predict():
    with tf.name_scope("predict"):
        output = neural_network(x)
        output = tf.nn.softmax(output)
        with tf.Session() as sess:
            saver = tf.train.import_meta_graph("saved_models/testing.meta")
            # saver = tf.train.Saver()
            saver.restore(sess, "saved_models/testing")
            print sess.run(output, feed_dict={x: np.array([12003]).reshape([-1, 1])})
I have tried using tf.train.Saver() to restore instead, but it gives the same error.
The error given is ValueError: Variable hidden_layer1/kernel already exists, disallowed. Did you mean to set reuse=True in VarScope? Originally defined at:
I have tried setting reuse=True for tf.layers.dense(), but then I am unable to train the graph (it gives the same ValueError as above, this time asking me to set reuse=None).
I am guessing it has to do with the graph from training still existing, so when I try to restore the model it detects duplicate variables. However, I thought this should not happen, since the session has already closed.
link to entire code: gistlink

I think you are loading the variables into the same graph. For testing, try creating a new graph and loading the checkpoint into it. Do something like this:
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load the graph with the trained states
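A minimal sketch of the full restore flow, based on the question's code (the tensor names below are assumptions, since they depend on how x was defined; inspect loaded_graph.get_operations() to find the real ones):
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load the graph with the trained states
    loader = tf.train.import_meta_graph("saved_models/testing.meta")
    loader.restore(sess, "saved_models/testing")
    # Look up tensors by name instead of calling neural_network() again,
    # which would create duplicate variables in the default graph
    x_restored = loaded_graph.get_tensor_by_name("Placeholder:0")  # assumed name
    logits = loaded_graph.get_tensor_by_name("network/output_layer/BiasAdd:0")  # assumed name
    print sess.run(tf.nn.softmax(logits), feed_dict={x_restored: [[12003]]})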

Related

Unable to restore variables of Adam Optimizer while using tf.train.save

I get the following errors when I try to restore a saved model in tensorflow:
W tensorflow/core/framework/op_kernel.cc:1192] Not found: Key out_w/Adam_5 not found in checkpoint
W tensorflow/core/framework/op_kernel.cc:1192] Not found: Key b1/Adam not found in checkpoint
W tensorflow/core/framework/op_kernel.cc:1192] Not found: Key b1/Adam_4 not found in checkpoint
I guess I am unable to save the variables of the Adam optimizer.
Any fix?
Consider this small experiment:
import tensorflow as tf

def simple_model(X):
    with tf.variable_scope('Layer1'):
        w1 = tf.get_variable('w1', initializer=tf.truncated_normal((5, 2)))
        b1 = tf.get_variable('b1', initializer=tf.ones((2)))
        layer1 = tf.matmul(X, w1) + b1
        return layer1

def simple_model2(X):
    with tf.variable_scope('Layer1'):
        w1 = tf.get_variable('w1_x', initializer=tf.truncated_normal((5, 2)))
        b1 = tf.get_variable('b1_x', initializer=tf.ones((2)))
        layer1 = tf.matmul(X, w1) + b1
        return layer1

tf.reset_default_graph()
X = tf.placeholder(tf.float32, shape=(None, 5))
model = simple_model(X)
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver.save(sess, './Checkpoint', global_step=0)

tf.reset_default_graph()
X = tf.placeholder(tf.float32, shape=(None, 5))
model = simple_model(X)    # Case 1
# model = simple_model2(X) # Case 2
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    tf.train.Saver().restore(sess, tf.train.latest_checkpoint('.'))
In Case 1, everything works fine. But in Case 2, you will get errors like Key Layer1/b1_x not found in checkpoint, because the variable names in the model differ from the names stored in the checkpoint (even though the shapes and dtypes of the variables are the same). Ensure that the variables in the model you are restoring into have the same names as the ones in the checkpoint.
To check the names of the variables present in the checkpoint, check this answer.
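One way to list them, as a sketch (tf.train.NewCheckpointReader reads checkpoint metadata without building a graph; the path matches the experiment above):
reader = tf.train.NewCheckpointReader('./Checkpoint-0')
for name, shape in reader.get_variable_to_shape_map().items():
    print(name, shape)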
This can also happen when you are not training every variable simultaneously, so that only some of the Adam slot variables are present in the checkpoint.
One possible fix is to "reset" Adam after loading the checkpoint. To do this, filter out the Adam-related variables when creating the saver:
vl = [v for v in tf.global_variables() if "Adam" not in v.name]
saver = tf.train.Saver(var_list=vl)
Make sure to initialize global variables afterwards.
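Putting it together, a sketch of the restore-then-reset sequence (reusing the checkpoint from the experiment above):
vl = [v for v in tf.global_variables() if "Adam" not in v.name]
saver = tf.train.Saver(var_list=vl)
adam_vars = [v for v in tf.global_variables() if "Adam" in v.name]
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('.'))
    # re-initialize ("reset") the Adam slots the saver skipped; note that
    # Adam's beta1_power/beta2_power accumulators may need the same treatment
    sess.run(tf.variables_initializer(adam_vars))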

Tensorflow - How is the graph executed?

I am trying to get the output of an activation function as the weights change; when the weights change, I expect the activation output to change as well.
When I simply change the weights before the activation, I do get a change in the value of the activation.
import tensorflow as tf

def sigmoid(x, derivative=False):
    if derivative == True:
        return (1.0/(1+tf.exp(-x))) * (1.0 - (1.0/(1+tf.exp(-x))))
    return 1.0/(1+tf.exp(-x))

def dummy(x):
    weights['h0'] = tf.assign(weights['h0'], tf.add(weights['h0'], 0.1))
    res = tf.add(weights['h0'], x)
    res = sigmoid(res)
    return res

# build computational graph (weights must be defined before dummy() reads them)
weights = {
    'h0': tf.Variable(tf.random_normal([1]))
}
a = tf.placeholder('float', None)
d = dummy(a)

# initialize variables
init = tf.global_variables_initializer()

# create session and run the graph; the with block closes the session for us
with tf.Session() as sess:
    sess.run(init)
    for i in range(10):
        print(sess.run(d, feed_dict={a: [2]}))
But when I try to change the weights after the activation, as in backprop, I get the same activation every time. Can anyone explain what is happening, and what I can do to get the activation to change after every iteration?
import tensorflow as tf

def sigmoid(x, derivative=False):
    if derivative == True:
        return (1.0/(1+tf.exp(-x))) * (1.0 - (1.0/(1+tf.exp(-x))))
    return 1.0/(1+tf.exp(-x))

def dummy(x):
    res = tf.add(weights['h0'], x)
    res = sigmoid(res)
    weights['h0'] = tf.assign(weights['h0'], tf.add(weights['h0'], 0.1))
    return res

# build computational graph (weights must be defined before dummy() reads them)
weights = {
    'h0': tf.Variable(tf.random_normal([1]))
}
a = tf.placeholder('float', None)
d = dummy(a)

# initialize variables
init = tf.global_variables_initializer()

# create session and run the graph; the with block closes the session for us
with tf.Session() as sess:
    sess.run(init)
    for i in range(10):
        print(sess.run(d, feed_dict={a: [2]}))
EDIT:
It seems like it is not running the entire graph? I can do this:
with tf.Session() as sess:
    sess.run(init)
    for i in range(10):
        sess.run(weights['h0'])
        print(sess.run(d, feed_dict={a: [2]}))
where I run the weights first, and then d gives me different values each time. Is that correct?
This line isn't doing what you think it's doing:
print(sess.run(d, feed_dict={a: [2]}))
You need to call sess.run() and pass in a training operation, which is usually the op returned by an optimizer's minimize() method.
Below are some example usages.
From the super-simple Tensorflow MNIST example:
# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, 10])
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
...
for _ in range(1000):
    ...
    sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
From a TensorFlow multi-layer NN example:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
...
for i in range(total_batch):
    ...
    _, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
The general pattern is:
Define a cost function J.
Pass the cost J to an optimizer, typically via its minimize() method.
Call sess.run() with the resulting training op as an argument.
If you want to write your own optimizer, then you'll need to take a different approach. Writing your own cost function is typical, but writing your own optimizer is not. You should look at the code for AdamOptimizer or GradientDescentOptimizer for insight.
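As a sketch of that different approach, a manual gradient-descent step can be built from tf.gradients and tf.assign (the one-weight setup and toy loss here are my own, chosen to mirror the question):
import tensorflow as tf

# toy setup: one weight, squared error against a constant target of 1.0
w = tf.Variable(tf.random_normal([1]))
x = tf.placeholder(tf.float32, [1])
loss = tf.reduce_mean(tf.square(tf.sigmoid(w + x) - 1.0))

# manual "optimizer": compute the gradient and apply the update ourselves
grad = tf.gradients(loss, [w])[0]
manual_step = tf.assign(w, w - 0.1 * grad)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(10):
        sess.run(manual_step, feed_dict={x: [2.0]})  # updates w
        print(sess.run(loss, feed_dict={x: [2.0]}))  # loss now changes every iteration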

TensorFlow image reading queue empty

I'm trying to use an input pipeline for reading images into the CNN. I used string_input_producer() to obtain the queue of file names, but it seems to hang there without doing anything. Below is my code; please give me some advice on how to make it work.
def read_image_file(filename_queue, labels):
    reader = tf.WholeFileReader()
    key, value = reader.read(filename_queue)
    image = tf.image.decode_png(value, channels=3)
    image = tf.cast(image, tf.float32)
    resized_image = tf.image.resize_images(image, [224, 112])
    with tf.Session() as sess:
        label = getLabel(labels, key.eval())
    return resized_image, label

def input_pipeline(filename_queue, queue_names, batch_size, num_epochs, labels):
    image, label = read_image_file(filename_queue, labels)
    min_after_dequeue = 10 * batch_size
    capacity = 20 * batch_size
    image_batch, label_batch = tf.train.shuffle_batch(
        [image, label], batch_size=batch_size, num_threads=1, capacity=capacity,
        min_after_dequeue=min_after_dequeue)
    return image_batch, label_batch

train_queue = tf.train.string_input_producer(trainnames, shuffle=True, num_epochs=epochs)
train_batch, train_label = input_pipeline(train_queue, trainnames, batch_size, epochs, labels)
prediction = AlexNet(x)

# Training
with tf.name_scope("cost_function") as scope:
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=train_label, logits=prediction(train_batch)))
    tf.summary.scalar("cost_function", cost)
    train_step = tf.train.MomentumOptimizer(learning_rate, 0.9).minimize(cost)

# Accuracy
with tf.name_scope("accuracy") as scope:
    correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar("accuracy", accuracy)

merged = tf.summary.merge_all()

# Session
with tf.Session() as sess:
    print('started')
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord, start=True)
    sess.run(threads)
    try:
        for step in range(steps_per_epch * epochs):
            print('step: %d' % step)
            sess.run(train_step)
    except tf.errors.OutOfRangeError as ex:
        pass
    coord.request_stop()
    coord.join(threads)
Your code is not completely self-contained, since the getLabel method is not defined.
But it is very likely that your issue comes from these lines in the read_image_file method:
with tf.Session() as sess:
    label = getLabel(labels, key.eval())
The key.eval() call tries to dequeue an element from a queue whose runners have not been started yet.
You shouldn't create any session before your input pipeline is defined, nor try to eval key (or labels). The getLabel method should only perform tensor operations on labels and key and return a label tensor.
For example, you can use these tensor string operations so that they become part of the graph.
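For instance, a graph-only getLabel could be sketched like this (assuming, hypothetically, that labels is a Python list of integer labels aligned index-for-index with the trainnames list fed to string_input_producer):
def getLabel(labels, filenames, key):
    # all of these are tensor ops, so no session is needed here
    names_t = tf.constant(filenames)   # shape [N], dtype string
    labels_t = tf.constant(labels)     # shape [N], dtype int32
    # index of the dequeued key in the filename list
    matches = tf.cast(tf.equal(names_t, key), tf.int32)
    index = tf.argmax(matches, axis=0)
    return tf.gather(labels_t, index)  # a label tensor, not a Python value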

Unable to Restore Graph

Problem:
Attempting to restore meta_graph via tf.train.import_meta_graph("saved_models/model.meta") gives the following error:
InvalidArgumentError (see above for traceback): Shape [-1] has negative dimensions
[[Node: Placeholder_2 = Placeholder[dtype=DT_INT32, shape=[?], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
The shapes of placeholder and the data passed are as follows:
placeholder: (?, ?, 50)
data: (1, 2, 50)
Code Involved:
Placeholder involved: self.x_placeholder_input = tf.placeholder(tf.float32, shape=[None, None, n_inputs])
The other placeholder (label): self.y_placeholder_label = tf.placeholder(tf.int32, shape=[None, self.num_of_classes])
Predict method:
def predict(self):
    with tf.name_scope("predict"):
        with tf.Session(graph=tf.Graph()) as sess:
            saver = tf.train.import_meta_graph("saved_models/model.meta")
            saver.restore(sess, "saved_models/model")
            graph = tf.get_default_graph()
            output = graph.get_tensor_by_name("optimize/cal_loss/model_network/model_network_NN_network/output/BiasAdd:0")
            x_placeholder = graph.get_tensor_by_name("Placeholder:0")
            print x_placeholder.shape
            print np.array(self.data_x).shape
            print sess.run(output, feed_dict={x_placeholder: self.data_x})
Train method:
def train(self):
    writer = tf.summary.FileWriter("mygraph/logs", tf.get_default_graph())
    num_of_epoch = 10
    with tf.Session() as sess:
        for epoch in range(num_of_epoch):
            # initialise all variables
            optimize = self.optimize
            sess.run(tf.global_variables_initializer())
            sess.run(optimize,
                     feed_dict={self.x_placeholder_input: np.array(self.data_x),
                                self.y_placeholder_label: np.array(self.data_y),
                                self.sq_placeholder_seq_length: np.array(self.seq_length)})
            if num_of_epoch % 10 == 0:
                # Create Saver to save model
                print "Cycle " + str(epoch) + " out of " + str(num_of_epoch) + " done"
                saver = tf.train.Saver()
                location = saver.save(sess, "saved_models/model")
                print "Model saved to : " + str(location)
Question: Is the problem due to the placeholder having two None dimensions in its shape? It is fine when training, though.
Full code (if it helps):
(https://gist.github.com/duemaster/660208e6cd7856af2522c2efa67911da)
In my experience, you get this error because you are not feeding the value of that placeholder in the restored model. You would normally expect the error "you must feed a value for placeholder XX" when you forget to feed one, but I've noticed that when placeholders have None in their shape vector (in restored models), the error becomes the one you are seeing about negative dimensions. I've hit this error with even a single None in the placeholder shape, and properly feeding its value solved the problem.
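A sketch of that fix for the predict method above: fetch the remaining placeholders from the restored graph and feed them too. The name Placeholder_2:0 comes straight from the error message; that it maps to the sequence-length placeholder is my assumption:
x_placeholder = graph.get_tensor_by_name("Placeholder:0")
seq_placeholder = graph.get_tensor_by_name("Placeholder_2:0")  # int32, shape [?]
print sess.run(output, feed_dict={x_placeholder: self.data_x,
                                  seq_placeholder: np.array(self.seq_length)})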

Why does the location of saver in the script matter when there is a graph object in TensorFlow?

I was training some models and noticed that, when I explicitly defined a graph variable, it mattered where my saver object was created. At first, my code looked like this:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("tmp_MNIST_data/", one_hot=True)

x = tf.placeholder(tf.float32, [None, 784])
W = tf.Variable(tf.truncated_normal([784, 10], mean=0.0, stddev=0.1), name='w')
b = tf.Variable(tf.constant(0.1, shape=[10]), name='b')
y = tf.nn.softmax(tf.matmul(x, W) + b)
y_ = tf.placeholder(tf.float32, [None, 10])

cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))  # list of booleans indicating correct predictions
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
train_step = tf.train.GradientDescentOptimizer(0.2).minimize(cross_entropy)

saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(1001):
        batch_xs, batch_ys = mnist.train.next_batch(100)
        sess.run(fetches=train_step, feed_dict={x: batch_xs, y_: batch_ys})
        if i % 100 == 0:
            saver.save(sess=sess, save_path='./tmp/mdl_ckpt')
    print(sess.run(fetches=accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
Then I changed it to something like the code below, and it seemed very sensitive to where I defined my variables and where I defined the saver. If they were not defined immediately after the graph variable was created, for example, it would raise errors. Similarly, I noticed that the saver had to be defined immediately after a single variable (note that being after the definition of the graph was not enough) for it to capture all the variables. This didn't make sense to me; it would seem more sensible to require that the saver come after the definitions of all the variables, rather than after a single one, for this to work.
This is how the code looks now (with comments showing the locations where I've defined saver):
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("tmp_MNIST_data/", one_hot=True)

graph = tf.Graph()
with tf.Session(graph=graph) as sess:
    #saver = tf.train.Saver()
    x = tf.placeholder(tf.float32, [None, 784])
    saver = tf.train.Saver()
    y_ = tf.placeholder(tf.float32, [None, 10])
    #saver = tf.train.Saver()
    W = tf.Variable(tf.truncated_normal([784, 10], mean=0.0, stddev=0.1), name='w')
    #saver = tf.train.Saver()
    b = tf.Variable(tf.constant(0.1, shape=[10]), name='b')
    y = tf.nn.softmax(tf.matmul(x, W) + b)
    #saver = tf.train.Saver()
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))  # list of booleans indicating correct predictions
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
    #saver = tf.train.Saver()
    step = tf.Variable(0, name='step')
    #saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    #saver = tf.train.Saver()
    for i in range(1001):
        batch_xs, batch_ys = mnist.train.next_batch(100)
        sess.run(fetches=train_step, feed_dict={x: batch_xs, y_: batch_ys})
        if i % 100 == 0:
            step_assign = step.assign(i)
            sess.run(step_assign)
            saver.save(sess=sess, save_path='./tmp/mdl_ckpt')
            print(step.eval())
            print([op.name for op in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)])
    print(sess.run(fetches=accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
The code above should work, but I am having a hard time understanding why it behaves like this, or why it makes sense that it would happen. Does anyone know what the right thing to do is?
I'm not entirely sure what's going on here, but I suspect the issue is related to variables going into the wrong graph, or to the session having an outdated version of the graph. You create a graph but don't set it as the default, then create a session with that graph; when you create variables, you don't specify which graph they should go into. Maybe creating the session sets the specified graph as the default, but that's not the way tensorflow was designed to be used, so I wouldn't be surprised if it hasn't been thoroughly tested in this regime.
While I don't have an explanation for what's going on, I can suggest a simple solution: separate graph construction from session running.
graph = tf.Graph()
with graph.as_default():
    build_graph()
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    do_stuff_with(sess)
    saver.save(sess, path)
