TensorFlow (Neural Network) FC output size - machine-learning

I'm not sure whether my question is TF-specific or about NNs in general, but I have created a CNN using TensorFlow and I'm having trouble understanding why the size of the output of my fully connected layer is what it is.
X = tf.placeholder(tf.float32, [None, 32, 32, 3])
y = tf.placeholder(tf.int64, [None])
is_training = tf.placeholder(tf.bool)

# define model
def complex_model(X, y, is_training):
    # conv layer
    wconv_1 = tf.get_variable('wconv_1', [7, 7, 3, 32])
    bconv_1 = tf.get_variable('bconv_1', [32])
    # affine layer 1
    w1 = tf.get_variable('w1', [26*26*32//4, 1024])  # LINE 13
    b1 = tf.get_variable('b1', [1024])
    # batchnorm params
    bn_gamma = tf.get_variable('bn_gamma', shape=[32])  # scale
    bn_beta = tf.get_variable('bn_beta', shape=[32])    # shift
    # affine layer 2
    w2 = tf.get_variable('w2', [1024, 10])
    b2 = tf.get_variable('b2', [10])

    c1_out = tf.nn.conv2d(X, wconv_1, strides=[1, 1, 1, 1], padding="VALID") + bconv_1
    activ_1 = tf.nn.relu(c1_out)
    mean, var = tf.nn.moments(activ_1, axes=[0, 1, 2], keep_dims=False)
    bn = tf.nn.batch_normalization(activ_1, mean, var, bn_gamma, bn_beta, 1e-6)
    mp = tf.nn.max_pool(bn, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
    affine_in_flat = tf.reshape(mp, [-1, 26*26*32//4])
    affine_1 = tf.matmul(affine_in_flat, w1) + b1
    activ_2 = tf.nn.relu(affine_1)
    affine_2 = tf.matmul(activ_2, w2) + b2
    return affine_2
    # print(affine_2.shape)
In line 13, where I set the value of w1, I would have expected to just put:
w1 = tf.get_variable('w1', [26*26*32, 1024])
However, if I run the code with the line shown above and with
affine_in_flat = tf.reshape(mp, [-1, 26*26*32])
my output size is (16, 10) instead of (64, 10), which is what I would expect given the initialisation below:
x = np.random.randn(64, 32, 32, 3)
with tf.Session() as sess:
    with tf.device("/cpu:0"):  # "/cpu:0" or "/gpu:0"
        tf.global_variables_initializer().run()
        # print("train", x.size, is_training, y_out)
        ans = sess.run(y_out, feed_dict={X: x, is_training: True})
        %timeit sess.run(y_out, feed_dict={X: x, is_training: True})
        print(ans.shape)
        print(np.array_equal(ans.shape, np.array([64, 10])))
Can anybody tell me why I need to divide the size of w1[0] by 4?

Adding print statements for bn and mp I get:
bn: <tf.Tensor 'batchnorm/add_1:0' shape=(?, 26, 26, 32) dtype=float32>
mp: <tf.Tensor 'MaxPool:0' shape=(?, 13, 13, 32) dtype=float32>
This is due to the strides=[1, 2, 2, 1] on the max pooling (to keep 26x26 you would also need padding='SAME'). With VALID padding and stride 2, the 26x26 feature map becomes 13x13, so each example flattens to 13*13*32 = 5408 values, exactly a quarter of 26*26*32. That is why w1[0] has to be divided by 4; reshaping to [-1, 26*26*32] instead packs four examples' worth of values into each row, which is how the batch dimension shrinks from 64 to 16.
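A quick way to sanity-check this (a minimal sketch reusing the tensors defined above) is to print the static shapes and compare the flattened sizes:
print(bn.get_shape())              # (?, 26, 26, 32) after the 7x7 VALID conv on a 32x32 input
print(mp.get_shape())              # (?, 13, 13, 32) after 2x2 max pooling with stride 2
print(26 * 26 * 32, 13 * 13 * 32)  # 21632 5408, i.e. 21632 // 4 == 5408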

Related

Tensorflow multi-GPU MNIST classifier: low accuracy

I am stuck with a multi-GPU MNIST classifier in TensorFlow. The code runs without errors, but the accuracy is very poor (30%). I am new to TensorFlow, so I do not know where the problem is. GPU: 2x GTX 1080 Ti.
I have found several tutorials for multiple GPUs, but the code is hard to follow, so I am trying to develop an MNIST CNN classifier from scratch.
from __future__ import print_function
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import datetime

def average_gradients(tower_grads):
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        grads = []
        for g, _ in grad_and_vars:
            # Add 0 dimension to the gradients to represent the tower.
            expanded_g = tf.expand_dims(g, 0)
            # Append on a 'tower' dimension which we will average over below.
            grads.append(expanded_g)
        # Average over the 'tower' dimension.
        grad = tf.concat(axis=0, values=grads)
        grad = tf.reduce_mean(grad, 0)
        # Keep in mind that the Variables are redundant because they are shared
        # across towers. So .. we will just return the first tower's pointer to
        # the Variable.
        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
        return average_grads

with tf.device('/cpu:0'):
    x = tf.placeholder(tf.float32, [None, 784], name='x')
    x_img = tf.reshape(x, [-1, 28, 28, 1])
    x_dict = {}
    x_dict['x0'], x_dict['x1'] = tf.split(x_img, 2)
    y_dict = {}
    y = tf.placeholder(tf.float32, [None, 10], name='y')
    y_dict['y0'], y_dict['y1'] = tf.split(y, 2)
    opt = tf.train.GradientDescentOptimizer(0.01)
    keep_prob = tf.placeholder(tf.float32)
    w0 = tf.get_variable('w0', initializer=tf.truncated_normal([5, 5, 1, 32], stddev=0.1))
    b0 = tf.get_variable('b0', initializer=tf.zeros([32]))
    w1 = tf.get_variable('w1', initializer=tf.truncated_normal([5, 5, 32, 64], stddev=0.1))
    b1 = tf.get_variable('b1', initializer=tf.zeros([64]))
    w2 = tf.get_variable('w2', initializer=tf.truncated_normal([7 * 7 * 64, 1024], stddev=0.1))
    b2 = tf.get_variable('b2', initializer=tf.zeros([1024]))
    w3 = tf.get_variable('w3', initializer=tf.truncated_normal([1024, 10], stddev=0.1))
    b3 = tf.get_variable('b3', initializer=tf.zeros([10]))
    grads = []

def conv2d(xx, W):
    return tf.nn.conv2d(xx, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(xx):
    return tf.nn.max_pool(xx, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

def model_forward(xx):
    h_conv1 = tf.nn.relu(conv2d(xx, w0) + b0)
    h_pool1 = max_pool_2x2(h_conv1)
    h_conv2 = tf.nn.relu(conv2d(h_pool1, w1) + b1)
    h_pool2 = max_pool_2x2(h_conv2)
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w2) + b2)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    y = tf.nn.sigmoid(tf.matmul(h_fc1_drop, w3) + b3)
    return y

for i in range(0, 2):
    with tf.device(('/gpu:{0}').format(i)):
        with tf.variable_scope(('scope_gpu_{0}').format(i)):
            yy = model_forward(x_dict[('x{0}').format(i)])
            cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_dict[('y{0}').format(i)] * tf.log(yy), reduction_indices=[1]))
            grads.append(opt.compute_gradients(cross_entropy, tf.trainable_variables()))

with tf.device('/cpu:0'):
    grad = average_gradients(grads)
    train_step = opt.apply_gradients(grad)
    yy = model_forward(x_dict['x0'])

correct_prediction = tf.equal(tf.argmax(yy, 1), tf.argmax(y_dict['y0'], 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')

def main():
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
    with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter('C:\\tmp\\test\\', graph=tf.get_default_graph())
        t1_1 = datetime.datetime.now()
        for step in range(0, 10000):
            batch_x, batch_y = mnist.train.next_batch(100)
            sess.run(train_step, feed_dict={x: batch_x, y: batch_y, keep_prob: 0.5})
            if (step % 200) == 0:
                print(step, sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels, keep_prob: 1}))
        t2_1 = datetime.datetime.now()
    print("Computation time: " + str(t2_1 - t1_1))

if __name__ == "__main__":
    main()
The problems that I noticed:
Your cross-entropy loss is wrong (see this question for details; in short, you are computing binary cross-entropy).
I dropped the manual gradient computation in favor of tf.train.AdamOptimizer.
I dropped the splitting of the input x (it is not the right way to do distributed computation in TensorFlow).
The resulting model easily gets to 99% accuracy even on one GPU.
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import datetime

x = tf.placeholder(tf.float32, [None, 784], name='x')
x_img = tf.reshape(x, [-1, 28, 28, 1])
y = tf.placeholder(tf.float32, [None, 10], name='y')
keep_prob = tf.placeholder(tf.float32)

stddev = 0.1
w0 = tf.get_variable('w0', initializer=tf.truncated_normal([5, 5, 1, 32], stddev=stddev))
b0 = tf.get_variable('b0', initializer=tf.zeros([32]))
w1 = tf.get_variable('w1', initializer=tf.truncated_normal([5, 5, 32, 64], stddev=stddev))
b1 = tf.get_variable('b1', initializer=tf.zeros([64]))
w2 = tf.get_variable('w2', initializer=tf.truncated_normal([7 * 7 * 64, 1024], stddev=stddev))
b2 = tf.get_variable('b2', initializer=tf.zeros([1024]))
w3 = tf.get_variable('w3', initializer=tf.truncated_normal([1024, 10], stddev=stddev))
b3 = tf.get_variable('b3', initializer=tf.zeros([10]))

def conv2d(xx, W):
    return tf.nn.conv2d(xx, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(xx):
    return tf.nn.max_pool(xx, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

def model_forward(xx):
    h_conv1 = tf.nn.relu(conv2d(xx, w0) + b0)
    h_pool1 = max_pool_2x2(h_conv1)
    h_conv2 = tf.nn.relu(conv2d(h_pool1, w1) + b1)
    h_pool2 = max_pool_2x2(h_conv2)
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w2) + b2)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    y = tf.matmul(h_fc1_drop, w3) + b3
    return y

yy = model_forward(x_img)
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=yy, labels=y))
train_step = tf.train.AdamOptimizer().minimize(loss)
correct_prediction = tf.equal(tf.argmax(yy, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')

def main():
    mnist = input_data.read_data_sets("/home/maxim/p/data/mnist-tf", one_hot=True)
    with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())
        t1_1 = datetime.datetime.now()
        for step in range(0, 10000):
            batch_x, batch_y = mnist.train.next_batch(100)
            sess.run(train_step, feed_dict={x: batch_x, y: batch_y, keep_prob: 0.5})
            if (step % 200) == 0:
                print(step, sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels, keep_prob: 1}))
        t2_1 = datetime.datetime.now()
        print("Computation time: " + str(t2_1 - t1_1))

if __name__ == "__main__":
    main()
Now, if you really want to, you can do data or model parallelism to utilize your GPU power (there is a great post about it, but sometimes it doesn't render correctly due to hosting problems).
Along with the points mentioned in the first two answers, take a look at return average_grads in the average_gradients function: it returns during the first iteration of the outer for loop, so the averaged gradients only cover the first variable (probably w0). Hence only w0 gets updated, and you get very low accuracy because the rest of the variables stay at their initial values (random or zeros).
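A minimal sketch of that fix (the same function as above, with the return moved out of the loops so every variable gets an averaged gradient):
def average_gradients(tower_grads):
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Stack this variable's gradient from every tower and average over the tower dimension.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(axis=0, values=grads), 0)
        # The variable object is shared across towers, so take the first tower's pointer.
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads  # return only after every variable has been processed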
Another cause is that the model is not using the same weights and biases for inference on the CPU as on the GPU devices.
For example:
for i in range(0, 2):
    with tf.device(('/gpu:{0}').format(i)):
        with tf.variable_scope(('scope_gpu_{0}').format(i)) as infer_scope:
            yy = model_forward(x_dict[('x{0}').format(i)])
            infer_scope.reuse_variables()
            cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_dict[('y{0}').format(i)] * tf.log(yy), reduction_indices=[1]))
            grads.append(opt.compute_gradients(cross_entropy, tf.trainable_variables()))
The reason you are getting low accuracy is that without reuse_variables(), every call to the model inference builds a new set of randomly initialized weights and biases in the graph, which is not what you want.

How to interpret the loss function in the categorical generative adversarial net?

I've been implementing the categorical generative adversarial network described here:
[Jost T. Springenberg. Unsupervised and semi-supervised learning with
categorical generative adversarial networks, April 2016.]
[image: the CatGAN loss formula from page 6 of the paper]
This is the loss function introduced on page 6, and the thing is that the formula uses arg max, which is odd because most of the optimizers I can use in frameworks such as TensorFlow only minimize (arg min).
So could you please tell me how to implement this formula?
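The usual workaround (which the code below also ends up using via D_loss = -D_loss) is to build the quantity to be maximized and hand its negation to the optimizer. A toy sketch with a made-up objective:
import tensorflow as tf

w = tf.Variable(0.0)                  # hypothetical parameter
objective = -(w - 3.0) ** 2           # toy objective, maximized at w == 3
# Maximizing `objective` is the same as minimizing `-objective`.
maximize_op = tf.train.AdamOptimizer(0.1).minimize(-objective)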
Here is the code I implemented.
import tensorflow as tf
import numpy as np
import PIL.Image as Image
# constants
X_dim = 256
Y_dim = 2
Z_dim = 256 * 256
value_lambda = 1.0
X = tf.placeholder(tf.float32, shape=[None, X_dim, X_dim, 1])
Y = tf.placeholder(tf.float32, shape=[None, Y_dim])
Z = tf.placeholder(tf.float32, shape=[None, Z_dim])
initializer = tf.contrib.layers.variance_scaling_initializer
activation_function = tf.nn.elu
regularizer = tf.contrib.layers.l2_regularizer(0.5)
custom_filter = np.ones(shape=[32, 256, 256, 1], dtype=np.float)
custom_filter[:, 255, :, :] = 0
custom_filter[:, :, 255, :] = 0
custom_filter = tf.constant(custom_filter, dtype=tf.float32)
def discriminator(x, name=None):
    with tf.name_scope(name, "discriminator", [x]) as scope:
        D_conv_1 = tf.layers.conv2d(inputs=x, filters=16, kernel_size=[5, 5],
                                    padding='SAME', activation=activation_function,
                                    kernel_regularizer=regularizer)
        # [256, 256]
        D_mean_pool_1 = tf.nn.pool(D_conv_1, window_shape=[2, 2], pooling_type='AVG',
                                   padding='VALID', strides=[2, 2])
        # [128, 128]
        D_conv_2 = tf.layers.conv2d(D_mean_pool_1, filters=32, kernel_size=[3, 3],
                                    padding='SAME', activation=activation_function,
                                    kernel_regularizer=regularizer)
        # [128, 128]
        D_mean_pool_2 = tf.nn.pool(D_conv_2, window_shape=[2, 2], pooling_type='AVG',
                                   padding='VALID', strides=[2, 2])
        # [64, 64]
        D_conv_3 = tf.layers.conv2d(D_mean_pool_2, filters=64, kernel_size=[3, 3],
                                    padding='SAME', activation=activation_function,
                                    kernel_regularizer=regularizer)
        # [64, 64]
        D_mean_pool_3 = tf.nn.pool(D_conv_3, window_shape=[2, 2], pooling_type='AVG',
                                   padding='VALID', strides=[2, 2])
        # [32, 32]
        D_conv_4 = tf.layers.conv2d(D_mean_pool_3, filters=128, kernel_size=[3, 3],
                                    padding='SAME', activation=activation_function,
                                    kernel_regularizer=regularizer)
        # [32, 32]
        D_mean_pool_4 = tf.nn.pool(D_conv_4, window_shape=[2, 2], pooling_type='AVG',
                                   padding='VALID', strides=[2, 2])
        # [16, 16]
        D_conv_5 = tf.layers.conv2d(D_mean_pool_4, filters=256, kernel_size=[3, 3],
                                    padding='SAME', activation=activation_function,
                                    kernel_regularizer=regularizer)
        # [16, 16]
        D_mean_pool_5 = tf.nn.pool(D_conv_5, window_shape=[4, 4], pooling_type='AVG',
                                   padding='VALID', strides=[4, 4])
        # [4, 4]
        D_conv_6 = tf.layers.conv2d(D_mean_pool_5, filters=2, kernel_size=[3, 3],
                                    padding='SAME', activation=activation_function,
                                    kernel_regularizer=regularizer)
        # [4, 4]
        D_mean_pool_6 = tf.nn.pool(D_conv_6, window_shape=[4, 4], pooling_type='AVG',
                                   padding='VALID', strides=[4, 4])
        # [1, 1], and finally, [batch_size][1][1][2]
        D_logit = tf.reshape(D_mean_pool_6, shape=[32, 2])
        # [batch_size][2]
        return D_logit

        '''
        D_hidden_layer_1 = tf.layers.dense(
            inputs=x, units=255, activation=activation_function)
        D_hidden_layer_2 = tf.layers.dense(
            inputs=D_hidden_layer_1, units=16, activation=activation_function)
        D_logit = tf.layers.dense(inputs=D_hidden_layer_2, units=Y_dim,
                                  activation=activation_function)
        return D_logit
        '''
def generator(z, name=None):
    with tf.name_scope(name, "generator", [z]) as scope:
        # z[32, 4096]
        input = tf.reshape(z, shape=[32, 256, 256, 1])
        # input[32, 64, 64, 1]
        G_conv_1 = tf.layers.conv2d(input, filters=96, kernel_size=[8, 8],
                                    padding='SAME', activation=activation_function)
        # [32, 64, 64, 96]
        # G_upscaled_1 = tf.image.resize_bicubic(images=G_conv_1, size=[128, 128])
        # [32, 128, 128, 96]
        G_conv_2 = tf.layers.conv2d(G_conv_1, filters=64, kernel_size=[5, 5],
                                    padding='SAME', activation=activation_function)
        # [32, 128, 128, 64]
        # G_upscaled_2 = tf.image.resize_bicubic(G_conv_2, size=[256, 256])
        # [32, 256, 256, 64]
        G_conv_3 = tf.layers.conv2d(G_conv_2, filters=64, kernel_size=[5, 5],
                                    padding='SAME', activation=activation_function)
        # [32, 256, 256, 64]
        G_conv_4 = tf.layers.conv2d(G_conv_3, filters=1, kernel_size=[5, 5],
                                    padding='SAME', activation=activation_function)
        # [32, 256, 256, 1]
        G_logit = G_conv_4 * custom_filter
        # [32, 256, 256, 1], but with the last column and row filtered out
        return G_logit

        '''
        G_hidden_layer_1 = tf.layers.dense(
            inputs=z, units=255, activation=activation_function)
        G_outputs = tf.layers.dense(inputs=G_hidden_layer_1, units=X_dim,
                                    activation=activation_function)
        return G_outputs
        '''
with tf.name_scope("training") as scope:
    # Getting samples from random data
    G_sample = generator(Z)
    # Getting logits
    D_logit_real = discriminator(X)
    D_logit_fake = discriminator(G_sample)
    # Applying softmax
    D_proba_real = tf.nn.softmax(logits=D_logit_real)
    D_proba_real = tf.clip_by_value(
        D_proba_real, clip_value_min=1e-4, clip_value_max=1.0)
    D_proba_fake = tf.nn.softmax(logits=D_logit_fake)
    D_proba_fake = tf.clip_by_value(
        D_proba_fake, clip_value_min=1e-4, clip_value_max=1.0)

    with tf.name_scope("category_1") as sub_scope:
        # Getting Shannon's entropy of X's distribution
        D_log_real = tf.log(D_proba_real)
        D_entrophy_real = D_proba_real * D_log_real
        D_mean_real = tf.reduce_sum(D_entrophy_real, axis=1)
        D_mean_real = -D_mean_real
        D_entrophy_real_mean = tf.reduce_mean(D_mean_real, axis=0)
        D_entrophy_real_mean = tf.reshape(D_entrophy_real_mean, shape=[1])

    with tf.name_scope("category_2") as sub_scope:
        # Getting Shannon's entropy of Z's distribution
        G_log_fake = tf.log(D_proba_fake)
        G_entrophy_fake = D_proba_fake * G_log_fake
        G_mean = tf.reduce_sum(G_entrophy_fake, axis=1)
        G_mean = -G_mean
        G_entrophy_fake_mean = tf.reduce_mean(G_mean, axis=0)
        G_entrophy_fake_mean = tf.reshape(G_entrophy_fake_mean, shape=[1])

    with tf.name_scope("category_3") as sub_scope:
        # Getting Shannon's entropy between classes
        D_class_mean = tf.reduce_mean(D_proba_real, axis=0, keep_dims=True)
        D_class_mean_log = tf.log(D_class_mean)
        D_class_entropy = D_class_mean * D_class_mean_log
        D_class = tf.reduce_sum(D_class_entropy, axis=1)
        D_class = -D_class
        D_class = tf.reshape(D_class, shape=[1])

        G_class_mean = tf.reduce_mean(D_proba_fake, axis=0, keep_dims=True)
        G_class_mean_log = tf.log(G_class_mean)
        G_class_entrophy = G_class_mean * G_class_mean_log
        G_class = tf.reduce_sum(G_class_entrophy, axis=1)
        G_class = -G_class
        G_class = tf.reshape(G_class, shape=[1])

    with tf.name_scope("supervised") as sub_scope:
        # Getting cross-entropy for labeled data
        D_labeled = Y * D_log_real
        D_cross_entrophy = tf.reduce_sum(D_labeled, axis=1)
        D_cross_entrophy = -D_cross_entrophy
        D_supervised = tf.reduce_mean(D_cross_entrophy, axis=0)
        D_supervised_weighted = value_lambda * D_supervised
        D_supervised_weighted = tf.reshape(D_supervised_weighted, shape=[1])

    D_loss = D_class - D_entrophy_real_mean + \
        G_entrophy_fake_mean + D_supervised_weighted
    G_loss = -G_class + G_entrophy_fake_mean
    D_loss = -D_loss

    D_solver = tf.train.AdamOptimizer().minimize(D_loss)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss)

# with tf.name_scope("testing") as scope:
I've done some research and asked some friends who work at a big company doing deep learning research. As it turns out, generative adversarial networks are not good at classification tasks, so I changed my approach and implemented it with GoogLeNet instead. Problem solved!

CNN for cifar10 dataset in Tensorflow

I am trying to replicate the results obtained by a convolutional neural network for CIFAR-10 using TensorFlow. However, after some epochs (~60) my accuracy is still around 10%, so I do not know whether the CNN is being trained properly.
This code is based on Deep MNIST for Experts (https://www.tensorflow.org/get_started/mnist/pros), but for CIFAR-10 it does not work.
import numpy as np
import tensorflow as tf

def unpickle(file):
    import cPickle
    fo = open(file, 'rb')
    dict = cPickle.load(fo)
    fo.close()
    return dict

# unpacking training and test data
b1 = unpickle("~/cifar-10-batches-py/data_batch_1")
b2 = unpickle("~/cifar-10-batches-py/data_batch_2")
b3 = unpickle("~/cifar-10-batches-py/data_batch_3")
b4 = unpickle("~/cifar-10-batches-py/data_batch_4")
b5 = unpickle("~/cifar-10-batches-py/data_batch_5")
test = unpickle("~/cifar-10-batches-py/test_batch")

# Preparing test data
test_data = test['data']
test_label = test['labels']

# Preparing training data
train_data = np.concatenate([b1['data'], b2['data'], b3['data'], b4['data'], b5['data']], axis=0)
train_label = np.concatenate([b1['labels'], b2['labels'], b3['labels'], b4['labels'], b5['labels']], axis=0)

# Reshaping data
train_data = np.reshape(train_data, [50000, 32, 32, 3])
test_data = np.reshape(test_data, [10000, 32, 32, 3])

batch_size = 100
image_width = 32
image_height = 32
channels = 3

# Constructing Graph
x = tf.placeholder(tf.float32, [None, image_width, image_height, channels])  # Training Data
y = tf.placeholder(tf.int32, [None])
one_hot = tf.one_hot(y, depth=10)  # Converting to one-hot vectors

# Constructing CNN Layers
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

# Given an input tensor of shape [batch, in_height, in_width, in_channels] and a
# filter / kernel tensor of shape [filter_height, filter_width, in_channels, out_channels];
# taken from: http://textminingonline.com/dive-into-tensorflow-part-v-deep-mnist
W_conv1 = weight_variable([7, 7, 3, 32])
b_conv1 = bias_variable([32])
h_conv1 = tf.nn.relu(conv2d(x, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

W_conv2 = weight_variable([5, 5, 32, 32])
b_conv2 = bias_variable([32])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

W_conv3 = weight_variable([5, 5, 32, 64])
b_conv3 = bias_variable([64])
h_conv3 = tf.nn.relu(conv2d(h_pool2, W_conv3) + b_conv3)

# Constructing MLP layers
W_fc1 = weight_variable([8 * 8 * 64, 64])
b_fc1 = bias_variable([64])
h_pool3_flat = tf.reshape(h_conv3, [-1, 8 * 8 * 64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool3_flat, W_fc1) + b_fc1)

W_fc2 = weight_variable([64, 10])
b_fc2 = bias_variable([10])
y_conv = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2)

# Computing cost function
cross_entropy = -tf.reduce_sum(one_hot * tf.log(tf.clip_by_value(y_conv, 1e-10, 1e20)))
train_step = tf.train.MomentumOptimizer(learning_rate=0.0001, momentum=0.9).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(one_hot, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

init = tf.initialize_all_variables()
sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=16))
sess.run(init)

epochs = 100
b_per = 0
row = []
for e in range(epochs):
    print("epoch", e)
    avg_cost = 0
    # for each batch
    for j in range(int(train_data.shape[0] / batch_size)):
        subset = range((j * batch_size), ((j + 1) * batch_size))
        data = train_data[subset, :, :, :]
        label = train_label[subset]
        _, c = sess.run([train_step, cross_entropy], feed_dict={x: data, y: label})
        avg_cost += c / data.shape[0]
        # print(avg_cost)
        b_per = b_per + 1
        if b_per % 10 == 0:
            row.append(sess.run(accuracy, feed_dict={x: test_data, y: test_label}))
            print(row[-1])
The data reshaping part is wrong. CIFAR-10 stores each image row in channel-major order (1024 red, then 1024 green, then 1024 blue values), so reshaping directly to [50000, 32, 32, 3] scrambles the pixels. It should be:
# Reshaping data
train_data = train_data.reshape(50000, 3, 32, 32).transpose(0, 2, 3, 1).astype("uint8")
test_data = test_data.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("uint8")
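A minimal sketch of why the transpose matters (the array here is a stand-in for one raw CIFAR-10 row, not the actual data):
import numpy as np

raw = np.arange(3072).reshape(1, 3072)                 # stand-in for one 3072-value CIFAR-10 row
img = raw.reshape(1, 3, 32, 32).transpose(0, 2, 3, 1)  # channel-major values -> NHWC image
print(img.shape)        # (1, 32, 32, 3)
print(img[0, 0, 0, :])  # [   0 1024 2048]: the R, G and B planes start 1024 values apart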

Tensorflow shuffle batch fraction unexpected behavior

I am training a convolutional neural network and I got some unexpected behavior from the shuffle_batch fraction summary, or maybe I just do not understand it. Can someone please explain it? The difference between those two graphs is that I exchanged the loss function.
With this loss function I get the line at 0.0
loss = tf.nn.l2_loss(expected_labels-labels)
While this one gives me a constant 1.0 (after hitting 1.0 the first time)
loss = tf.reduce_mean(tf.square(expected_labels - labels))
Can the change of loss function really cause that change? I am not sure what this means.
EDIT: Code as requested
The first part is for setting up the batching and the big picture.
filename_queue = tf.train.string_input_producer(filenames, num_epochs=None)
label, image = read_and_decode_single_example(filename_queue=filename_queue)
image = tf.image.decode_jpeg(image.values[0], channels=3)
jpeg = tf.cast(image, tf.float32) / 255.
jpeg.set_shape([66, 200, 3])
images_batch, labels_batch = tf.train.shuffle_batch(
    [jpeg, label], batch_size=FLAGS.batch_size,
    num_threads=8,
    capacity=60000,
    min_after_dequeue=10000)

images_placeholder, labels_placeholder = placeholder_inputs(FLAGS.batch_size)
label_estimations, W1_conv, h1_conv, current_images = e2e.inference(images_placeholder)
# Add to the Graph the Ops for loss calculation.
loss = e2e.loss(label_estimations, labels_placeholder)
# Decay once per epoch, using an exponential schedule starting at 0.01.
# Add to the Graph the Ops that calculate and apply gradients.
train_op = e2e.training(loss, FLAGS.learning_rate, FLAGS.batch_size)
Here are the methods for inference, loss, and training.
def inference(images):
    with tf.name_scope('conv1'):
        W_conv1 = tf.Variable(tf.truncated_normal([5, 5, 3, FEATURE_MAPS_C1], stddev=STDDEV))
        b_conv1 = tf.Variable(tf.constant(BIAS_INIT, shape=[FEATURE_MAPS_C1]))
        h_conv1 = tf.nn.bias_add(
            tf.nn.conv2d(images, W_conv1, strides=[1, 2, 2, 1], padding='VALID'), b_conv1)

    with tf.name_scope('conv2'):
        W_conv2 = tf.Variable(tf.truncated_normal([5, 5, FEATURE_MAPS_C1, 36], stddev=STDDEV))
        b_conv2 = tf.Variable(tf.constant(BIAS_INIT, shape=[36]))
        h_conv2 = tf.nn.conv2d(h_conv1, W_conv2, strides=[1, 2, 2, 1], padding='VALID') + b_conv2

    with tf.name_scope('conv3'):
        W_conv3 = tf.Variable(tf.truncated_normal([5, 5, 36, 48], stddev=STDDEV))
        b_conv3 = tf.Variable(tf.constant(BIAS_INIT, shape=[48]))
        h_conv3 = tf.nn.conv2d(h_conv2, W_conv3, strides=[1, 2, 2, 1], padding='VALID') + b_conv3

    with tf.name_scope('conv4'):
        W_conv4 = tf.Variable(tf.truncated_normal([3, 3, 48, 64], stddev=STDDEV))
        b_conv4 = tf.Variable(tf.constant(BIAS_INIT, shape=[64]))
        h_conv4 = tf.nn.conv2d(h_conv3, W_conv4, strides=[1, 1, 1, 1], padding='VALID') + b_conv4

    with tf.name_scope('conv5'):
        W_conv5 = tf.Variable(tf.truncated_normal([3, 3, 64, 64], stddev=STDDEV))
        b_conv5 = tf.Variable(tf.constant(BIAS_INIT, shape=[64]))
        h_conv5 = tf.nn.conv2d(h_conv4, W_conv5, strides=[1, 1, 1, 1], padding='VALID') + b_conv5

    h_conv5_flat = tf.reshape(h_conv5, [-1, 1 * 18 * 64])

    with tf.name_scope('fc1'):
        W_fc1 = tf.Variable(tf.truncated_normal([1 * 18 * 64, 100], stddev=STDDEV))
        b_fc1 = tf.Variable(tf.constant(BIAS_INIT, shape=[100]))
        h_fc1 = tf.matmul(h_conv5_flat, W_fc1) + b_fc1

    with tf.name_scope('fc2'):
        W_fc2 = tf.Variable(tf.truncated_normal([100, 50], stddev=STDDEV))
        b_fc2 = tf.Variable(tf.constant(BIAS_INIT, shape=[50]))
        h_fc2 = tf.matmul(h_fc1, W_fc2) + b_fc2

    with tf.name_scope('fc3'):
        W_fc3 = tf.Variable(tf.truncated_normal([50, 10], stddev=STDDEV))
        b_fc3 = tf.Variable(tf.constant(BIAS_INIT, shape=[10]))
        h_fc3 = tf.matmul(h_fc2, W_fc3) + b_fc3

    with tf.name_scope('fc4'):
        W_fc4 = tf.Variable(tf.truncated_normal([10, 1], stddev=STDDEV))
        b_fc4 = tf.Variable(tf.constant(BIAS_INIT, shape=[1]))
        h_fc4 = tf.matmul(h_fc3, W_fc4) + b_fc4

    return h_fc4
Here is the loss function; using l2 causes the issue.
def loss(label_estimations, labels):
    n_labels = tf.reshape(label_estimations, [-1])
    # Here are the two loss functions
    # loss = tf.reduce_mean(tf.square(n_labels - labels))
    loss = tf.nn.l2_loss(n_labels - labels)
    return loss
Train method:
def training(loss, learning_rate, batch_size):
    global_step = tf.Variable(0, name='global_step', trainable=False)
    tf.scalar_summary('learning_rate', learning_rate)
    tf.scalar_summary('Loss (' + loss.op.name + ')', loss)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_op = optimizer.minimize(loss, global_step=global_step)
    return train_op
[plot of the loss for tf.reduce_sum(tf.square(n_labels - labels) / 2)]
As mentioned in TensorFlow's reading-data guide, https://www.tensorflow.org/programmers_guide/reading_data:
How many threads do you need? The tf.train.shuffle_batch* functions add a summary to the graph that indicates how full the example queue is. If you have enough reading threads, that summary will stay above zero. You can view your summaries as training progresses using TensorBoard.
It is better if the queue is never empty, i.e. the "fraction_full" summary stays non-zero. If it does not, you should allocate more threads to the queue runner.
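For example (a sketch against the shuffle_batch call from the question; the thread count is just an illustrative value):
images_batch, labels_batch = tf.train.shuffle_batch(
    [jpeg, label], batch_size=FLAGS.batch_size,
    num_threads=16,            # more reader threads help keep fraction_full above zero
    capacity=60000,
    min_after_dequeue=10000)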
The only difference between your loss and l2_loss is scaling; thus you might need to adjust your learning rate and other hyperparameters to take this into account.
l2 loss in TF is defined as:
1/2 SUM_i^N (pred(x_i) - y_i)^2
while your cost is
1/N SUM_i^N (pred(x_i) - y_i)^2
Of course, since you are using a stochastic gradient approach, effectively you are using an approximator of the form
1/2 SUM_{(x_i, y_i) in batch} (pred(x_i) - y_i)^2 # l2
1/#batch SUM_{(x_i, y_i) in batch} (pred(x_i) - y_i)^2 # you
Thus you would have to multiply your cost by batch_size / 2 to get the original cost. Typically this is not a problem, but sometimes the wrong scaling can put you in a very degenerate part of the error surface, and the optimizer will simply fail (especially an aggressive one like Adam).
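A small sketch of the relationship (assuming n_labels and labels have the same shape, as in the loss function above):
diff = n_labels - labels
l2 = tf.nn.l2_loss(diff)               # 0.5 * sum(diff ** 2)
mse = tf.reduce_mean(tf.square(diff))  # sum(diff ** 2) / N
# mse == l2 * 2 / N, so the two losses differ only by the constant factor N / 2.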
Side note: are you aware that your model is a deep linear model? There are no non-linearities in the model, which makes it a very specific network.
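For instance (a sketch against the inference code above, not a drop-in fix), each hidden layer could be wrapped in an activation:
# Hypothetical change: add a ReLU after each hidden conv / fc layer, e.g.
h_conv2 = tf.nn.relu(tf.nn.conv2d(h_conv1, W_conv2, strides=[1, 2, 2, 1], padding='VALID') + b_conv2)
h_fc1 = tf.nn.relu(tf.matmul(h_conv5_flat, W_fc1) + b_fc1)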

Why is my convolution autoencoder not getting trained properly?

Why is my convolutional autoencoder not converging properly? I have a very simple layer stack.
Encoder: Conv/ReLU(Kernel size: 7x7, stride = 1, padding = 0) => maxPool(kernel size=2x2, stride = 2) => Conv/ReLU(Kernel size: 5x5, stride = 1, padding = 0) => MaxPool(kernel size=2x2, stride = 2)
Decoder: Nearest Neighbour Upsampling => Deconv/ReLU => Nearest Neighbour Upsampling => Deconv/ReLU
Training Images are of size 30x30x1.
I tried to train it with 1000 images over 1000 epochs, but the error (MSE) is still 120.
BATCH_SIZE = 100
IMAGE_SIZE = 30
NUM_CHANNELS = 1
num_images = 1000

def init_weights(shape):
    return tf.Variable(tf.random_normal(shape, stddev=0.01))

def encoder(X, w, w2, wd, wd2):
    l1a = tf.nn.relu(tf.nn.conv2d(X, w, strides=[1, 1, 1, 1], padding='VALID'))
    l1 = tf.nn.max_pool(l1a, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    l2a = tf.nn.relu(tf.nn.conv2d(l1, w2, strides=[1, 1, 1, 1], padding='VALID'))
    l2 = tf.nn.max_pool(l2a, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    l1da = tf.image.resize_images(l2, 8, 8, 1, align_corners=False)
    output_shapel1d = tf.convert_to_tensor([BATCH_SIZE, 12, 12, 32], dtype=tf.int32)
    l1d = tf.nn.relu(tf.nn.conv2d_transpose(l1da, wd, output_shapel1d, strides=[1, 1, 1, 1], padding='VALID'))
    l2da = tf.image.resize_images(l1d, 24, 24, 1, align_corners=False)
    output_shapel2d = tf.convert_to_tensor([BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS], dtype=tf.int32)
    l2d = tf.nn.relu(tf.nn.conv2d_transpose(l2da, wd2, output_shapel2d, strides=[1, 1, 1, 1], padding='VALID'))
    return l2d

complete_image = extract_data(0, 1000)
trX = complete_image[0:900]
trY = trX
teX = complete_image[900:1000]
teY = teX

X = tf.placeholder("float", [BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS])
Y = tf.placeholder("float", [BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS])

w = init_weights([7, 7, 1, 32])
w2 = init_weights([5, 5, 32, 64])
wd = init_weights([5, 5, 32, 64])
wd2 = init_weights([7, 7, 1, 32])

py_x = encoder(X, w, w2, wd, wd2)

cost = tf.reduce_mean(tf.squared_difference(py_x, Y, name=None))
train_op = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cost)
predict_op = py_x

global_step = tf.Variable(0, name='global_step', trainable=False)
saver = tf.train.Saver()

with tf.Session() as sess:
    tf.initialize_all_variables().run()
    start = global_step.eval()  # get last global_step
    print "Start from:", start
    if FLAGS.output == "train":
        for i in range(start, 500):
            total_epoch_cost = 0  # accumulate the cost over this epoch
            training_batch = zip(range(0, num_images - BATCH_SIZE, BATCH_SIZE),
                                 range(BATCH_SIZE, num_images - BATCH_SIZE, BATCH_SIZE))
            for start, end in training_batch:
                sess.run(train_op, feed_dict={X: trX[start:end], Y: trY[start:end]})
                total_epoch_cost += sess.run(cost, feed_dict={X: trX[start:end], Y: trY[start:end]})
            avg_epoch_cost = total_epoch_cost / BATCH_SIZE
            print "cost during epoch " + `i` + " is ", avg_epoch_cost
I have added the complete code in this gist with slight modifications. I am training this with around 10,000 images, and the error after 488 epochs is 74.8.
