When introducing the optimizer, why do variables under variable_scope get recreated twice?

As the title suggests, when looking at the graph inside TensorBoard, the variables I've created inside a variable_scope get recreated twice. Why is that? What am I doing wrong?
def weights_biases(weights_shape, biases_shape):
    weights = tf.get_variable("weights", weights_shape, initializer=tf.random_normal_initializer())
    biases = tf.get_variable("biases", biases_shape, initializer=tf.random_normal_initializer())
    return weights, biases

def hl_relu(input_tensor, weights_shape, biases_shape):
    weights, biases = weights_biases(weights_shape, biases_shape)
    regression = tf.matmul(input_tensor, weights) + biases
    return tf.nn.relu(regression)

def neural_network_model(x):
    # W = tf.Variable(
    #     tf.truncated_normal([vocab_size, embedding_size], stddev=1 / math.sqrt(vocab_size)),
    #     name="W")
    # embedded = tf.nn.embedding_lookup(W, x)
    # embedding_aggregated = tf.reduce_sum(embedded, [1])
    with tf.variable_scope("hidden_layer_1"):
        relu1 = hl_relu(x, [max_words_len, n_nodes_hl1], [n_nodes_hl1])
    with tf.variable_scope("hidden_layer_2"):
        relu2 = hl_relu(relu1, [n_nodes_hl1, n_nodes_hl2], [n_nodes_hl2])
    with tf.variable_scope("hidden_layer_3"):
        relu3 = hl_relu(relu2, [n_nodes_hl2, n_nodes_hl3], [n_nodes_hl3])
    with tf.variable_scope("output_layer"):
        weights, biases = weights_biases([n_nodes_hl3, n_classes], [n_classes])
        output_regression = tf.matmul(relu3, weights) + biases
    return output_regression

def train_neural_network(test_x, test_y):
    with tf.device("/cpu:0"):
        custom_runner = CustomRunner()
        x_batch, y_batch = custom_runner.get_inputs()
        with tf.variable_scope("test"):
            testX = tf.constant(test_x, name="testX")
            testX = tf.cast(testX, tf.float32)
            testY = tf.constant(test_y, name="testY")
            testY = tf.cast(testY, tf.float32)
        with tf.variable_scope("nn") as scope:
            global_step = tf.Variable(0, trainable=False, name='global_step')
            logits = neural_network_model(x_batch)
            scope.reuse_variables()
            test_logits = neural_network_model(testX)
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, y_batch), name="cost")
        tf.scalar_summary('cost', cost)
        optimizer = tf.train.AdagradOptimizer(0.01).minimize(cost, global_step=global_step)
This produces the following abnormality:
You can see that the 'nn' scope was created twice, with hidden layers that are not connected to any inputs but that still affect the Adagrad optimizer by constantly feeding it their randomly initialized weights. I suspect this is also slowing down training.
What have I done wrong in my code?

I believe this is what is causing them to be created twice.
logits = neural_network_model(x_batch)
scope.reuse_variables()
test_logits = neural_network_model(testX)
Can you change that to this:
logits = neural_network_model(x_batch)
without the test_logits and see if you still get the same issue?

Are you sure the variables are re-created? I suspect what you see is just the Adagrad variables created by the optimizer so that it can save what it needs for its computation. Can you try the simplest GradientDescentOptimizer and see if it still happens?
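For instance, a minimal way to run that check, changing only the optimizer line from the code above and leaving everything else untouched:

# Hedged sketch: swap Adagrad for plain gradient descent to see whether the extra
# nodes under "nn" are just Adagrad's per-variable accumulator slots.
optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(cost, global_step=global_step)

If the duplicated variables under the 'nn' scope disappear from TensorBoard, they were only the accumulator slots Adagrad creates, not a second copy of your network.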

Related

How to solve my problem with the max_steps parameter in PyTorch?

I'm trying to train a model; the source code is:
class mymodel(pl.LightningModule):
    def __init__(self, config, learning_rate=1e-4, max_steps=100000//2):
        super(mymodel, self).__init__()
        self.config = config
        self.save_hyperparameters()
        self.training_losses = []
        self.validation_losses = []
        self.max_steps = max_steps

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.hparams['learning_rate'])

    def forward(self, batch_dict):
        return answer_vector

    def calculate_metrics(self, prediction, labels):
        batch_size = len(prediction)
        ac_score = 0
        for (pred, gt) in zip(prediction, labels):
            ac_score += calculate_acc_score(pred.detach().cpu(), gt.detach().cpu())
        ac_score = ac_score / batch_size
        return ac_score

    def training_step(self, batch, batch_idx):
        answer_vector = self.forward(batch)
        loss = nn.CrossEntropyLoss()(answer_vector.reshape(-1, self.config['classes']), batch['answer'].reshape(-1))
        _, preds = torch.max(answer_vector, dim=-1)
        train_acc = self.calculate_metrics(preds, batch['answer'])
        train_acc = torch.tensor(train_acc)
        return loss

    def validation_step(self, batch, batch_idx):
        logits = self.forward(batch)
        loss = nn.CrossEntropyLoss()(logits.reshape(-1, self.config['classes']), batch['answer'].reshape(-1))
        _, preds = torch.max(logits, dim=-1)
        ## Validation Accuracy
        val_acc = self.calculate_metrics(preds.cpu(), batch['answer'].cpu())
        val_acc = torch.tensor(val_acc)
        ## Logging
        self.log('val_ce_loss', loss, prog_bar=True)
        self.log('val_acc', val_acc, prog_bar=True)
        return {'val_loss': loss, 'val_acc': val_acc}

    def optimizer_step(self, epoch_nb, batch_nb, optimizer, optimizer_i, opt_closure=None, on_tpu=False,
                       using_native_amp=False, using_lbfgs=False):
        ## Warmup for 1000 steps
        if self.trainer.global_step < 1000:
            lr_scale = min(1., float(self.trainer.global_step + 1) / 1000.)
            for pg in optimizer.param_groups:
                pg['lr'] = lr_scale * self.hparams.learning_rate
        ## Linear Decay
        else:
            for pg in optimizer.param_groups:
                pg['lr'] = polynomial(self.hparams.learning_rate, self.trainer.global_step, max_iter=self.max_steps)
        optimizer.step(opt_closure)
        optimizer.zero_grad()
In the 5th epoch (maybe a bit earlier or later) I encountered an error that stopped training, so I increased max_steps. But when I increase max_steps (max_steps == 100K), I get a problem where loss > 100 and acc == 0. I attach a screenshot of this problem.
What should I change in the source code to continue training the model without this problem?
Updates:
I see. It looks like your optimizer_step is really acting as a scheduler, since it adjusts the AdamW learning rate. You should apply the scheduler directly in the configure_optimizers function. See https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html?highlight=configure_optimizers#configure-optimizers
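For example, a rough sketch of moving the warmup and decay into configure_optimizers (this replaces the custom optimizer_step entirely; I've substituted a plain linear decay for your polynomial() helper, so treat the decay formula as an assumption to adapt):

def configure_optimizers(self):
    optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams['learning_rate'])

    def lr_lambda(step):
        # 1000-step linear warmup, then linear decay towards zero at max_steps
        if step < 1000:
            return float(step + 1) / 1000.0
        return max(0.0, float(self.max_steps - step) / float(max(1, self.max_steps - 1000)))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    return {
        "optimizer": optimizer,
        "lr_scheduler": {"scheduler": scheduler, "interval": "step"},
    }

With "interval": "step", Lightning steps the scheduler after every optimizer step, which matches what the manual optimizer_step was doing.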
===
old answer:
By "error", do you mean the line about val_ce_loss? If yes, that's not an error. It means the val_ce_loss of the current epoch is not within the top 1 of historical epochs, so the checkpoint won't be saved to disk. Please refer to the save_top_k argument of the checkpoint callback: https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.ModelCheckpoint.html
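For reference, a minimal sketch of controlling that behaviour (the monitor key and save_top_k value here are assumptions to adapt to your setup):

from pytorch_lightning.callbacks import ModelCheckpoint

# Keep the 3 checkpoints with the lowest validation cross-entropy instead of only the best one.
checkpoint_cb = ModelCheckpoint(monitor="val_ce_loss", mode="min", save_top_k=3)
trainer = pl.Trainer(max_steps=100000, callbacks=[checkpoint_cb])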

Reinforcement learning converges for mean loss but not for each training data

Here I show a dummy example that represents my actual problem.
My neural network (NN) receives one input and gives the probabilities for two output nodes. The code for the NN is:
class Net(torch.nn.Module):
    def __init__(self, N, M):
        super(Net, self).__init__()
        self.fc1 = torch.nn.Linear(N, 4)
        self.fc2 = torch.nn.Linear(4, 4)
        self.fc3 = torch.nn.Linear(4, M)

    def forward(self, x):
        x = torch.sigmoid(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        x = torch.softmax(self.fc3(x), 0)
        return x
The ABM class is our model: it iteratively calls Net::forward and, based on the probabilities, chooses an action; if the action is the first index, it increments agent_count. The inputs xx are stored in states, which will later be used for the backward pass.
class ABM:
    def __init__(self, _nn, _t_data):
        self.nn = nn
        self.iteration_n = _t_data.iteration_n
        self.target_value = _t_data.target_value

    def run(self):
        for jj in range(self.iteration_n):
            xx = self.generate_input();
            self.states.append(xx);  # store inputs
            ys = nn.forward(xx);
            action = self.draw(ys);
            if (action == 0):
                self.agent_count += 1
        loss = self.calculateReward();
        return loss;

    def generate_input(self):
        return torch.ones((1), requires_grad=True)

    # --some other attributes--
When the run is over, the error is calculated as error = (target_value - agent_count)/target_value, which is a value between -1 and 1.
In order to train the model, the error is applied to the probability of the first output node of the NN. This is meant to correct the NN so that it predicts the right probability for the first output. The code is:
class ABM:
    def calculateReward(self):
        error = (self.target_value - self.agent_count) / self.target_value
        reward = torch.tensor((-error), requires_grad=True)
        # since all states are same, we just choose the first one
        state = self.states[0]
        ys = nn.forward(state)
        actionProb = ys[0]
        action_reward = actionProb * reward
        return action_reward;

    # --some other members--
The two parameters iteration_n and target_value used in the ABM are defined in the training data class as:
class Train:
    def __init__(self, tt, tv):
        self.iteration_n = tt
        self.target_value = tv

    target_value = 0;
    iteration_n = 0;
The different parts of the code are tied together as follows:
#### start optimization ####
nn = Net(1, 2)
optimizer = optim.Adam(nn.parameters(), lr=0.01)

# create training data values
training_items = []
training_items.append(Train(1000, 800))
training_items.append(Train(500, 200))

error_record = []
for ii in range(100):
    print("############ start iteration #%d ################" % ii)
    for t_item in training_items:
        model = ABM(nn, t_item)
        loss = model.run()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        error_record.append(loss.item())
Now let's present the problem.
If I only define one training item, Train(1000, 800) (iteration number 1000, target value 800), the NN is optimized as expected:
However, when I define two training items, the average error declines to zero but the error on each individual training item stays high:
Does anyone have an idea how to solve this issue?
I have omitted some parts of the code here to make it more readable. The full running code is available at minimal ABM.

I have a trained tensorflow model, how do i make predictions with the same?

I have trained my model by calling the train_neural_network function, which trains and stores the model; the accuracy comes to around 83%. The problem I'm facing is how to make predictions using my saved model: which variables do I restore, and how do I pass the input (in batches or all at once)?
def make_model(data, train_x):
    n_nodes_hl1 = 2000
    n_nodes_hl2 = 2000
    n_nodes_hl3 = 2000
    n_classes = 2  # No of classification
    hidden_1_layer = {'weights': tf.Variable(tf.truncated_normal([len(train_x[0]), n_nodes_hl1], stddev=0.1), name='weights'),
                      'biases': tf.Variable(tf.constant(0.1, shape=[n_nodes_hl1]), name='biases')}
    hidden_2_layer = {'weights': tf.Variable(tf.truncated_normal([n_nodes_hl1, n_nodes_hl2], stddev=0.1), name='weights'),
                      'biases': tf.Variable(tf.constant(0.1, shape=[n_nodes_hl2]), name='biases')}
    hidden_3_layer = {'weights': tf.Variable(tf.truncated_normal([n_nodes_hl2, n_nodes_hl3], stddev=0.1), name='weights'),
                      'biases': tf.Variable(tf.constant(0.1, shape=[n_nodes_hl3]), name='biases')}
    output_layer = {'weights': tf.Variable(tf.truncated_normal([n_nodes_hl3, n_classes], stddev=0.1), name='weights'),
                    'biases': tf.Variable(tf.constant(0.1, shape=[n_classes]), name='biases')}
    layer_1 = tf.add(tf.matmul(data, hidden_1_layer['weights']), hidden_1_layer['biases'])
    # now goes through an activation function - sigmoid function
    layer_1 = tf.nn.relu(layer_1)
    print("Layer 1 done!!")
    # input for layer 2 = result of activ_func for layer 1
    layer_2 = tf.add(tf.matmul(layer_1, hidden_2_layer['weights']), hidden_2_layer['biases'])
    layer_2 = tf.nn.relu(layer_2)
    print("Layer 2 done!!")
    layer_3 = tf.add(tf.matmul(layer_2, hidden_3_layer['weights']), hidden_3_layer['biases'])
    layer_3 = tf.nn.relu(layer_3)
    print("Layer 3 done!!")
    output = tf.matmul(layer_3, output_layer['weights'], name="output") + output_layer['biases']
    return output
def train_neural_network(train_x, train_y, test_x, test_y):
    tf.reset_default_graph()
    with tf.name_scope('input'):
        x = tf.placeholder('float', [None, len(train_x[0])], name='x_input')
        y = tf.placeholder('float', name='y-input')
    # Merge all the summaries and write them out to /tmp/mnist_logs (by default)
    prediction = make_model(x, train_x)
    print('model ready!!')
    with tf.name_scope('pred'):
        pred = tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y)
    with tf.name_scope('cost'):
        cost = tf.reduce_mean(pred)
    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer().minimize(cost, name='optimizer')
    tf.summary.scalar("cost", cost)
    n_epochs = 10
    batch_size = 100
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())  # initializes our variables. Session has now begun.
        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter('train/2/', sess.graph)
        test_writer = tf.summary.FileWriter('test/')
        for epoch in range(n_epochs):
            epoch_loss = 0  # we'll calculate the loss as we go
            i = 0
            while i < len(train_x):
                # we want to take batches (chunks): take a slice, then another slice
                start = i
                end = i + batch_size
                batch_x = np.array(train_x[start:end])
                batch_y = np.array(train_y[start:end])
                _, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
                if i % 200 == 0:
                    train_writer.add_summary(_, i)
                epoch_loss += c
                i += batch_size
            print('Epoch', epoch, 'completed out of', n_epochs, 'loss:', epoch_loss)
        with tf.name_scope('accuracy'):
            correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
            accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
            tf.summary.scalar("accuracy", accuracy)
        print('Accuracy:', accuracy.eval({x: test_x, y: test_y}))
        saver = tf.train.Saver()
        tf_log = 'tf.log'
        saver.save(sess, "model3.ckpt")
    return accuracy
This is how I am making predictions, but it fails every time:
def test_neural_network(test_x):
    batch_size = 100
    i = 0
    batch_x = np.array(test_x[i:i + batch_size])
    tf.reset_default_graph()
    x = tf.placeholder('float', [len(batch_x), len(test_x[0])])
    y = tf.placeholder('float', [2])
    prediction = make_model(x, batch_x)
    # pred1 = tf.nn.softmax(logits=prediction)
    # weight = tf.get_variable("weights_3", shape=[len(batch_x), 2], initializer=tf.zeros_initializer)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, "model3.ckpt")
        p = tf.argmax(prediction, 1)
        print(p.eval({x: batch_x}))
This gives an array of shape (batch_size, 2); I expected values like [0, 1] or [1, 0], but I am getting decimal values.
You have a problem because you are running the session on your variable "weight", but what you actually want is the output of your network. Try running the session on your last layer 😉
How do I make predictions using my saved model? Which variable to restore and how to pass the input (in batches or whole at once)?
Several comments regarding your design. You don't have to rebuild the graph at test time, because it's saved right next to the session checkpoint. Take a look at this question.
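As a rough sketch of that approach (assuming the checkpoint files produced by saver.save(sess, "model3.ckpt") above; the exact tensor names are guesses based on your code, so inspect graph.get_operations() if they don't match):

import tensorflow as tf

tf.reset_default_graph()
saver = tf.train.import_meta_graph("model3.ckpt.meta")   # rebuilds the saved graph structure
with tf.Session() as sess:
    saver.restore(sess, "model3.ckpt")                   # restores the trained weights
    graph = sess.graph
    # Assumed names from the training code: the placeholder was created under
    # name_scope 'input' as 'x_input', and the last matmul was named 'output'
    # (note: 'output:0' is the matmul before the bias add, so you may prefer to
    # name the final add explicitly when building the model).
    x = graph.get_tensor_by_name("input/x_input:0")
    logits = graph.get_tensor_by_name("output:0")
    predictions = sess.run(tf.argmax(logits, 1), feed_dict={x: batch_x})  # batch_x: your NumPy test batch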
With this, your code will be simplified a lot, because you don't have to keep the placeholders and cross-entropy loss function separately. Add the name to the softmax layer like this:
with tf.name_scope('pred'):
    pred = tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y, name='softmax')
After you've restored the graph, you can find the target operation by:
graph = sess.graph
pred = graph.get_operation_by_name("pred/softmax")
If your test data is not big, you can freely feed all of it at once, but if it's significantly larger than your batch size, you can easily run out of memory. In this case, you should use mini-batches for testing as well.
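A small sketch of that (predict_in_batches is a hypothetical helper; pred_op, x_placeholder and test_x stand for your restored prediction tensor, input placeholder and test data):

import numpy as np

def predict_in_batches(sess, pred_op, x_placeholder, test_x, batch_size=100):
    # Feed the test set in fixed-size chunks to avoid running out of memory.
    outputs = []
    for start in range(0, len(test_x), batch_size):
        batch = test_x[start:start + batch_size]
        outputs.append(sess.run(pred_op, feed_dict={x_placeholder: batch}))
    return np.concatenate(outputs, axis=0)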
As for your test accuracy, there can be plenty of reasons for this, for instance overfitting. Update the question with the full code so that it can be reproduced.

TensorFlow image reading queue empty

I'm trying to use a pipeline for reading images into the CNN. I used string_input_producer() to obtain the queue of file names, but it seems to hang there without doing anything. Below is my code; please give me some advice on how to make it work.
def read_image_file(filename_queue, labels):
    reader = tf.WholeFileReader()
    key, value = reader.read(filename_queue)
    image = tf.image.decode_png(value, channels=3)
    image = tf.cast(image, tf.float32)
    resized_image = tf.image.resize_images(image, [224, 112])
    with tf.Session() as sess:
        label = getLabel(labels, key.eval())
    return resized_image, label

def input_pipeline(filename_queue, queue_names, batch_size, num_epochs, labels):
    image, label = read_image_file(filename_queue, labels)
    min_after_dequeue = 10 * batch_size
    capacity = 20 * batch_size
    image_batch, label_batch = tf.train.shuffle_batch(
        [image, label], batch_size=batch_size, num_threads=1, capacity=capacity,
        min_after_dequeue=min_after_dequeue)
    return image_batch, label_batch

train_queue = tf.train.string_input_producer(trainnames, shuffle=True, num_epochs=epochs)
train_batch, train_label = input_pipeline(train_queue, trainnames, batch_size, epochs, labels)
prediction = AlexNet(x)

# Training
with tf.name_scope("cost_function") as scope:
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=train_label, logits=prediction(train_batch)))
    tf.summary.scalar("cost_function", cost)
train_step = tf.train.MomentumOptimizer(learning_rate, 0.9).minimize(cost)

# Accuracy
with tf.name_scope("accuracy") as scope:
    correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar("accuracy", accuracy)
merged = tf.summary.merge_all()

# Session
with tf.Session() as sess:
    print('started')
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord, start=True)
    sess.run(threads)
    try:
        for step in range(steps_per_epch * epochs):
            print('step: %d' % step)
            sess.run(train_step)
    except tf.errors.OutOfRangeError as ex:
        pass
    coord.request_stop()
    coord.join(threads)
Your code is not completely self-contained, as the getLabel method is not defined.
But it is very likely that the issue you have comes from these lines in the read_image_file method:
with tf.Session() as sess:
    label = getLabel(labels, key.eval())
The key.eval part tries to dequeue an element of a queue which has not started yet.
You shouldn't create any session before your input pipeline is defined (nor try to eval key, and possibly labels). The getLabel method should only perform tensor operations on labels and key and return a label tensor.
For example, you can use tensor string operations so that they become part of the graph.
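A minimal sketch of what getLabel could look like as pure graph ops (this assumes a hypothetical naming scheme in which the class id is the digit before the first underscore in the file name, e.g. ".../3_cat.png"; adapt the splitting to your real layout):

def get_label_from_key(key):
    # key is the scalar string tensor returned by the reader, so nothing here
    # needs a session; it is only evaluated once the queue runners are started.
    parts = tf.string_split([key], delimiter="/").values
    basename = parts[tf.shape(parts)[0] - 1]                      # e.g. "3_cat.png"
    label_str = tf.string_split([basename], delimiter="_").values[0]
    return tf.string_to_number(label_str, out_type=tf.int32)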

Is there any difference between the two codes?

I am currently still relatively new to Tensorflow. I am having some trouble with these two pieces of code.
Code A:
self.h1_layer = tf.layers.dense(self.x, self.n_nodes_hl1, activation=tf.nn.relu, name="h1")
self.h2_layer = tf.layers.dense(self.h1_layer, self.n_nodes_hl2, activation=tf.nn.relu, name="h2")
self.h3_layer = tf.layers.dense(self.h2_layer, self.n_nodes_hl3, activation=tf.nn.relu, name="h3")
self.logits = tf.layers.dense(self.h3_layer, self.num_of_classes, name="output")
Code B:
self.hidden_1_layer = {
    'weights': tf.Variable(tf.random_normal([self.num_of_words, self.h1])),
    'biases': tf.Variable(tf.random_normal([self.h1]))
}
self.hidden_2_layer = {
    'weights': tf.Variable(tf.random_normal([self.h1, self.h2])),
    'biases': tf.Variable(tf.random_normal([self.h2]))
}
self.hidden_3_layer = {
    'weights': tf.Variable(tf.random_normal([self.h2, self.h3])),
    'biases': tf.Variable(tf.random_normal([self.h3]))
}
self.final_output_layer = {
    'weights': tf.Variable(tf.random_normal([self.h3, self.num_of_classes])),
    'biases': tf.Variable(tf.random_normal([self.num_of_classes]))
}
layer1 = tf.add(tf.matmul(data, self.hidden_1_layer['weights']), self.hidden_1_layer['biases'])
layer1 = tf.nn.relu(layer1)
layer2 = tf.add(tf.matmul(layer1, self.hidden_2_layer['weights']), self.hidden_2_layer['biases'])
layer2 = tf.nn.relu(layer2)
layer3 = tf.add(tf.matmul(layer2, self.hidden_3_layer['weights']), self.hidden_3_layer['biases'])
layer3 = tf.nn.relu(layer3)
output = tf.matmul(layer3, self.final_output_layer['weights']) + self.final_output_layer['biases']
Are they the same thing? Can the weights and biases of both Code A and Code B be saved with tf.train.Saver()?
Thanks
Edit:
I am facing issues using Code A to generate predictions. It seems that the logits of Code A are always changing.
The full code:
import tensorflow as tf
import os
from utils import Utils as utils

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

class Neural_Network:
    # Neural Network Setup
    num_of_epoch = 50
    n_nodes_hl1 = 500
    n_nodes_hl2 = 500
    n_nodes_hl3 = 500

    def __init__(self):
        self.num_of_classes = utils.get_num_of_classes()
        self.num_of_words = utils.get_num_of_words()
        # placeholders
        self.x = tf.placeholder(tf.float32, [None, self.num_of_words])
        self.y = tf.placeholder(tf.int32, [None, self.num_of_classes])
        with tf.name_scope("model"):
            self.h1_layer = tf.layers.dense(self.x, self.n_nodes_hl1, activation=tf.nn.relu, name="h1")
            self.h2_layer = tf.layers.dense(self.h1_layer, self.n_nodes_hl2, activation=tf.nn.relu, name="h2")
            self.h3_layer = tf.layers.dense(self.h2_layer, self.n_nodes_hl3, activation=tf.nn.relu, name="h3")
            self.logits = tf.layers.dense(self.h3_layer, self.num_of_classes, name="output")

    def predict(self):
        return self.logits

    def make_prediction(self, query):
        result = None
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.import_meta_graph('saved_models/testing.meta')
            saver.restore(sess, 'saved_models/testing')
            # for variable in tf.trainable_variables():
            #     print sess.run(variable)
            prediction = self.predict()
            pre, prediction = sess.run([self.logits, prediction], feed_dict={self.x: query})
            print pre
            prediction = prediction.tolist()
            prediction = tf.nn.softmax(prediction)
            prediction = sess.run(prediction)
            print prediction
            return utils.get_label_from_encoding(prediction[0])

    def train(self, data):
        prediction = self.predict()
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=self.y))
        optimizer = tf.train.AdamOptimizer().minimize(cost)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            writer = tf.summary.FileWriter("mygraph/logs", tf.get_default_graph())
            for epoch in range(self.num_of_epoch):
                optimised, loss = sess.run([optimizer, cost],
                                           feed_dict={self.x: data['values'], self.y: data['labels']})
                if epoch % 1 == 0:
                    print("Completed Training Cycle: " + str(epoch) + " out of " + str(self.num_of_epoch))
                    print("Current Loss: " + str(loss))
            saver = tf.train.Saver()
            saver.save(sess, 'saved_models/testing')
            print("Model saved")
TLDR: The operations are essentially the same, but the variable creation and initialization methods are different.
If you trace the code from here, you will eventually get to a stage where the code calls tf.get_variable to initialize variables. In your example above, since kernel_initializer and bias_initializer are not set, they default to None and tf.zeros_initializer() respectively (see the Dense API). When None is passed to tf.get_variable as an initializer, a glorot_uniform_initializer will be used:
If initializer is None (the default), the default initializer passed in the variable scope will be used. If that one is None too, a glorot_uniform_initializer will be used. The initializer can also be a Tensor, in which case the variable is initialized to this value and shape.
More on tf.get_variable can be found here.
In one case (Code B) you used a tf.random_normal initializer for both the kernel weights and the bias weights, while in the other (Code A) you used tf.layers.dense, which results in a glorot_uniform_initializer for the kernel weights and a zeros_initializer for the bias weights, since no initializers were passed to tf.layers.dense.
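To make that concrete, a small sketch (not your exact code) of forcing Code A to initialize the same way as Code B by passing initializers to tf.layers.dense:

init = tf.random_normal_initializer()
# With explicit initializers, the dense layer's kernel and bias start from the same
# distribution that Code B samples with tf.random_normal.
h1 = tf.layers.dense(self.x, self.n_nodes_hl1, activation=tf.nn.relu,
                     kernel_initializer=init, bias_initializer=init, name="h1")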
To your second question on whether they can be saved, yes they can.
As a last note, be careful when using tf.Variable, as it can complicate things when scopes are not set properly.
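For example, a tiny illustration of the scoping point (a sketch assuming TF 1.4+ for tf.AUTO_REUSE; a plain tf.Variable in the same place would create a fresh variable on every call):

def shared_dense(x, in_units, units):
    # Calling this twice reuses the same "shared/w" and "shared/b" variables
    # instead of creating duplicates.
    with tf.variable_scope("shared", reuse=tf.AUTO_REUSE):
        w = tf.get_variable("w", shape=[in_units, units])
        b = tf.get_variable("b", shape=[units], initializer=tf.zeros_initializer())
    return tf.matmul(x, w) + b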
