Why is my convolution autoencoder not getting trained properly? - machine-learning

Why is my convolutional autoencoder not converging properly? I have a very simple layer stack.
Encoder: Conv/ReLU(Kernel size: 7x7, stride = 1, padding = 0) => maxPool(kernel size=2x2, stride = 2) => Conv/ReLU(Kernel size: 5x5, stride = 1, padding = 0) => MaxPool(kernel size=2x2, stride = 2)
Decoder: Nearest Neighbour Upsampling => Deconv/ReLU => Nearest Neighbour Upsampling => Deconv/ReLU
Training Images are of size 30x30x1.
I tried to train it with 1000 images over 1000 epoch, but the error (MSE) is still 120.
BATCH_SIZE = 100
IMAGE_SIZE = 30
NUM_CHANNELS = 1
num_images = 1000
def init_weights(shape):
return tf.Variable(tf.random_normal(shape, stddev=0.01))
def encoder(X, w, w2, wd, wd2):
l1a = tf.nn.relu(tf.nn.conv2d(X, w,strides=[1, 1, 1, 1], padding='VALID'))
l1 = tf.nn.max_pool(l1a, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
l2a = tf.nn.relu(tf.nn.conv2d(l1, w2,strides=[1, 1, 1, 1], padding='VALID'))
l2 = tf.nn.max_pool(l2a, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
l1da = tf.image.resize_images(l2, 8, 8, 1, align_corners=False)
output_shapel1d = tf.convert_to_tensor([BATCH_SIZE, 12, 12, 32], dtype=tf.int32);
l1d = tf.nn.relu(tf.nn.conv2d_transpose(l1da, wd, output_shapel1d, strides=[1, 1, 1, 1], padding='VALID'))
l2da = tf.image.resize_images(l1d, 24, 24, 1, align_corners=False)
output_shapel2d = tf.convert_to_tensor([BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS], dtype=tf.int32);
l2d = tf.nn.relu(tf.nn.conv2d_transpose(l2da, wd2, output_shapel2d, strides=[1, 1, 1, 1], padding='VALID'))
return l2d
complete_image = extract_data(0, 1000)
trX = complete_image[0:900]
trY = trX
teX = complete_image[900:1000]
teY = teX
X = tf.placeholder("float", [BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS])
Y = tf.placeholder("float", [BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS])
w = init_weights([7, 7, 1, 32])
w2 = init_weights([5, 5, 32, 64])
wd = init_weights([5, 5, 32, 64])
wd2 = init_weights([7, 7, 1, 32])
py_x = encoder(X, w, w2, wd, wd2)
cost = tf.reduce_mean(tf.squared_difference(py_x, Y, name = None))
train_op = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cost)
predict_op = py_x;
global_step = tf.Variable(0, name='global_step', trainable=False)
saver = tf.train.Saver()
with tf.Session() as sess:
tf.initialize_all_variables().run()
start = global_step.eval() # get last global_step
print "Start from:", start
if FLAGS.output == "train":
for i in range(start, 500):
training_batch = zip(range(0, num_images - BATCH_SIZE, batch_size),
range(batch_size, num_images - BATCH_SIZE, batch_size))
for start, end in training_batch:
sess.run(train_op, feed_dict={X: trX[start:end], Y: trY[start:end]})
total_epoch_cost += sess.run(cost, feed_dict={X: trX[start:end], Y: trY[start:end]})
avg_epoch_cost = total_epoch_cost/BATCH_SIZE
print "cost during epoch " + `i` + "is ", avg_epoch_cost
I have added the complete code in this gist with slight modifications. I am training this with around 10,000 images, and the error after 488 epochs is 74.8.

Related

TensorFlow (Neural Network) FC output size

Not sure whether my question is TF specific or just NNs in general but i have created a CNN using tensorflow. and im having trouble understanding why the size of the output on my fully connected layer is what it is.
X = tf.placeholder(tf.float32, [None, 32, 32, 3])
y = tf.placeholder(tf.int64, [None])
is_training = tf.placeholder(tf.bool)
# define model
def complex_model(X,y,is_training):
# conv layer
wconv_1 = tf.get_variable('wconv_1', [7 ,7 ,3, 32])
bconv_1 = tf.get_variable('bconv_1', [32])
# affine layer 1
w1 = tf.get_variable('w1', [26*26*32//4, 1024]) #LINE 13
b1 = tf.get_variable('b1', [1024])
# batchnorm params
bn_gamma = tf.get_variable('bn_gamma', shape=[32]) #scale
bn_beta = tf.get_variable('bn_beta', shape=[32] ) #shift
# affine layer 2
w2 = tf.get_variable('w2', [1024, 10])
b2 = tf.get_variable('b2', [10])
c1_out = tf.nn.conv2d(X, wconv_1, strides=[1, 1, 1, 1], padding="VALID") + bconv_1
activ_1 = tf.nn.relu(c1_out)
mean, var = tf.nn.moments(activ_1, axes=[0,1,2], keep_dims=False)
bn = tf.nn.batch_normalization(act_1, mean, var, bn_gamma, bn_beta, 1e-6)
mp = tf.nn.max_pool(bn, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
affine_in_flat = tf.reshape(mp, [-1, 26*26*32//4])
affine_1 = tf.matmul(affine_in_flat, w1) + b1
activ_2 = tf.nn.relu(affine_1)
affine_2 = tf.matmul(activ_2, w2) + b2
return affine_2
#print(affine_2.shape)
In line 13 where i set the value of w1 i would have expected to just put:
w1 = tf.get_variable('w1', [26*26*32, 1024])
however if i run the code with the line shown above and with
affine_in_flat = tf.reshape(mp, [-1, 26*26*32])
my output size is 16,10 instead of 64,10 which is what i would expect given the initialisations below:
x = np.random.randn(64, 32, 32,3)
with tf.Session() as sess:
with tf.device("/cpu:0"): #"/cpu:0" or "/gpu:0"
tf.global_variables_initializer().run()
#print("train", x.size, is_training, y_out)
ans = sess.run(y_out,feed_dict={X:x,is_training:True})
%timeit sess.run(y_out,feed_dict={X:x,is_training:True})
print(ans.shape)
print(np.array_equal(ans.shape, np.array([64, 10])))
can anybody tell me why i need to divide the size of w1[0] by 4?
Adding print statements for bn and mp I get:
bn: <tf.Tensor 'batchnorm/add_1:0' shape=(?, 26, 26, 32) dtype=float32>
mp: <tf.Tensor 'MaxPool:0' shape=(?, 13, 13, 32) dtype=float32>
Which would seem to be due to the strides=[1, 2, 2, 1] on the max pooling (but to maintain 26, 26 you'd also need padding='SAME').

Model in Tensorflow is not Working need review of the code not sure whats going Wrong

i am modifying the deep mnist code for my own data. i modified a model a bit but i am facing some basic issues like i pass data to my model one by one and it runs reall fast but when i pass my model all examples at ones it gets really slow and i also getting 0% accuracy. Kindly review my code i am doing something horribly wrong but i do not know where and what steps should i follow to make it correct.
Here is my model
def deepnn(x):
"""deepnn builds the graph for a deep net for classifying digits.
Args:
x: an input tensor with the dimensions (N_examples, 784), where 784 is the
number of pixels in a standard MNIST image.
Returns:
A tuple (y, keep_prob). y is a tensor of shape (N_examples, 10), with values
equal to the logits of classifying the digit into one of 10 classes (the
digits 0-9). keep_prob is a scalar placeholder for the probability of
dropout.
"""
x_image = tf.reshape(x, [-1, 28, 28, 1])
W_conv1 = weight_variable([5, 5, 1, 200])
b_conv1 = bias_variable([200])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)
W_conv2 = weight_variable([5, 5, 200, 100])
b_conv2 = bias_variable([100])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)
W_fc1 = weight_variable([7 * 7 * 100, 1024])
b_fc1 = bias_variable([1024])
h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*100])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
W_fc2 = weight_variable([1024, 19])
b_fc2 = bias_variable([19])
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
return y_conv, keep_prob
Here are the fucntion my model calls.
def conv2d(x, W):
"""conv2d returns a 2d convolution layer with full stride."""
return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2x2(x):
"""max_pool_2x2 downsamples a feature map by 2X."""
return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1], padding='SAME')
def weight_variable(shape):
"""weight_variable generates a weight variable of a given shape."""
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial)
def bias_variable(shape):
"""bias_variable generates a bias variable of a given shape."""
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial)
and this is my main
def main(_):
x = tf.placeholder(tf.float32, [None, 784])
y_ = tf.placeholder(tf.float32, [None, 19])
y_conv, keep_prob = deepnn(x)
cross_entropy tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(34670):
#batch = mnist.train.next_batch(50)
if i % 1000 == 0:
train_accuracy = accuracy.eval(feed_dict={x: np.reshape(input_to_nn(i),(-1,784)), y_:np.reshape(output_of_nn(i),(-1,19)), keep_prob: 1.0})
print('step %d, training accuracy %g' % (i, train_accuracy))
train_step.run(feed_dict={x: np.reshape(input_to_nn(i),(-1,784)), y_:np.reshape(output_of_nn(i),(-1,19)), keep_prob: 0.5})
print('test accuracy %g' % accuracy.eval(feed_dict={x:input_nn, y_:output_nn, keep_prob: 1.0}))
I think that the problem is in these lines:
W_fc2 = weight_variable([1024, 19])
b_fc2 = bias_variable([19])
Your model trains to predict 19 classes. Normally there are 10 digit, if you don't really have images with 19 classes, better revert the values to original 10.

How to interpret the loss function in the categorical generative adversarial net?

So I've been implementing the categorical generative adversarial networks which is described in here.
[Jost T. Springenberg. Unsupervised and semi-supervised learning with
categorical generative adversarial networks, April 2016.]
formula
This is the loss function introduced in page 6 and the thing is that the formula uses arg_max which is odd because most of the optimizers I can use on various frameworks such as Tensorflow only work in arg_min.
So would you please guys tell me how to implement this formula?
Here is the code I implemented.
import tensorflow as tf
import numpy as np
import PIL.Image as Image
# constants
X_dim = 256
Y_dim = 2
Z_dim = 256 * 256
value_lambda = 1.0
X = tf.placeholder(tf.float32, shape=[None, X_dim, X_dim, 1])
Y = tf.placeholder(tf.float32, shape=[None, Y_dim])
Z = tf.placeholder(tf.float32, shape=[None, Z_dim])
initializer = tf.contrib.layers.variance_scaling_initializer
activation_function = tf.nn.elu
regularizer = tf.contrib.layers.l2_regularizer(0.5)
custom_filter = np.ones(shape=[32, 256, 256, 1], dtype=np.float)
custom_filter[:, 255, :, :] = 0
custom_filter[:, :, 255, :] = 0
custom_filter = tf.constant(custom_filter, dtype=tf.float32)
def discriminator(x, name=None):
with tf.name_scope(name, "discriminator", [x]) as scope:
D_conv_1 = tf.layers.conv2d(inputs=x, filters=16, kernel_size=[
5, 5], padding='SAME', activation=activation_function, kernel_regularizer=regularizer)
# [256, 256]
D_mean_pool_1 = tf.nn.pool(D_conv_1, window_shape=[
2, 2], pooling_type='AVG', padding='VALID', strides=[2, 2])
# [128, 128]
D_conv_2 = tf.layers.conv2d(D_mean_pool_1, filters=32, kernel_size=[
3, 3], padding='SAME', activation=activation_function, kernel_regularizer=regularizer)
# [128, 128]
D_mean_pool_2 = tf.nn.pool(D_conv_2, window_shape=[
2, 2], pooling_type='AVG', padding='VALID', strides=[2, 2])
# [64, 64]
D_conv_3 = tf.layers.conv2d(D_mean_pool_2, filters=64, kernel_size=[
3, 3], padding='SAME', activation=activation_function, kernel_regularizer=regularizer)
# [64, 64]
D_mean_pool_3 = tf.nn.pool(D_conv_3, window_shape=[
2, 2], pooling_type='AVG', padding='VALID', strides=[2, 2])
# [32, 32]
D_conv_4 = tf.layers.conv2d(D_mean_pool_3, filters=128, kernel_size=[
3, 3], padding='SAME', activation=activation_function, kernel_regularizer=regularizer)
# [32, 32]
D_mean_pool_4 = tf.nn.pool(D_conv_4, window_shape=[
2, 2], pooling_type='AVG', padding='VALID', strides=[2, 2])
# [16, 16]
D_conv_5 = tf.layers.conv2d(D_mean_pool_4, filters=256, kernel_size=[
3, 3], padding='SAME', activation=activation_function, kernel_regularizer=regularizer)
# [16, 16]
D_mean_pool_5 = tf.nn.pool(D_conv_5, window_shape=[
4, 4], pooling_type='AVG', padding='VALID', strides=[4, 4])
# [4, 4]
D_conv_6 = tf.layers.conv2d(D_mean_pool_5, filters=2, kernel_size=[
3, 3], padding='SAME', activation=activation_function, kernel_regularizer=regularizer)
# [4, 4]
D_mean_pool_6 = tf.nn.pool(D_conv_6, window_shape=[
4, 4], pooling_type='AVG', padding='VALID', strides=[4, 4])
# [1, 1], and finally, [batch_size][1][1][2]
D_logit = tf.reshape(D_mean_pool_6, shape=[32, 2])
# [batch_size][2]
return D_logit
'''
D_hidden_layer_1 = tf.layers.dense(
inputs=x, units=255, activation=activation_function)
D_hidden_layer_2 = tf.layers.dense(
inputs=D_hidden_layer_1, units=16, activation=activation_function)
D_logit = tf.layers.dense(inputs=D_hidden_layer_2, units=Y_dim,
activation=activation_function)
return D_logit
'''
def generator(z, name=None):
with tf.name_scope(name, "generator", [z]) as scope:
# z[32, 4096]
input = tf.reshape(z, shape=[32, 256, 256, 1])
# input[32, 64, 64, 1]
G_conv_1 = tf.layers.conv2d(input, filters=96, kernel_size=[
8, 8], padding='SAME', activation=activation_function)
# [32, 64, 64, 96]
# G_upscaled_1 = tf.image.resize_bicubic(images=G_conv_1, size=[128, 128])
# [32, 128, 128, 96]
G_conv_2 = tf.layers.conv2d(G_conv_1, filters=64, kernel_size=[
5, 5], padding='SAME', activation=activation_function)
# [32, 128, 128, 64]
# G_upscaled_2 = tf.image.resize_bicubic(G_conv_2, size=[256, 256])
# [32, 256, 256, 64]
G_conv_3 = tf.layers.conv2d(G_conv_2, filters=64, kernel_size=[
5, 5], padding='SAME', activation=activation_function)
# [32, 256, 256, 64]
G_conv_4 = tf.layers.conv2d(G_conv_3, filters=1, kernel_size=[
5, 5], padding='SAME', activation=activation_function)
# [32, 256, 256, 1]
G_logit = G_conv_4 * custom_filter
# [32, 256, 256, 1], but filtered out the last column and row
return G_logit
'''
G_hidden_layer_1 = tf.layers.dense(
inputs=z, units=255, activation=activation_function)
G_outputs = tf.layers.dense(inputs=G_hidden_layer_1, units=X_dim,
activation=activation_function)
return G_outputs
'''
with tf.name_scope("training") as scope:
# Getting samples from random data
G_sample = generator(Z)
# Getting logits
D_logit_real = discriminator(X)
D_logit_fake = discriminator(G_sample)
# Applying softmax
D_proba_real = tf.nn.softmax(logits=D_logit_real)
D_proba_real = tf.clip_by_value(
D_proba_real, clip_value_min=1e-4, clip_value_max=1.0)
D_proba_fake = tf.nn.softmax(logits=D_logit_fake)
D_proba_fake = tf.clip_by_value(
D_proba_fake, clip_value_min=1e-4, clip_value_max=1.0)
with tf.name_scope("category_1") as sub_scope:
# Getting Shannon's entrophy in X's distribution
D_log_real = tf.log(D_proba_real)
D_entrophy_real = D_proba_real * D_log_real
D_mean_real = tf.reduce_sum(D_entrophy_real, axis=1)
D_mean_real = -D_mean_real
D_entrophy_real_mean = tf.reduce_mean(D_mean_real, axis=0)
D_entrophy_real_mean = tf.reshape(D_entrophy_real_mean, shape=[1])
with tf.name_scope("category_2") as sub_scope:
# Gettning Shannon's entrophy in Z's distribution
G_log_fake = tf.log(D_proba_fake)
G_entrophy_fake = D_proba_fake * G_log_fake
G_mean = tf.reduce_sum(G_entrophy_fake, axis=1)
G_mean = -G_mean
G_entrophy_fake_mean = tf.reduce_mean(G_mean, axis=0)
G_entrophy_fake_mean = tf.reshape(G_entrophy_fake_mean, shape=[1])
with tf.name_scope("category_3") as sub_scope:
# Getting Shannon's entrophy between classes
D_class_mean = tf.reduce_mean(D_proba_real, axis=0, keep_dims=True)
D_class_mean_log = tf.log(D_class_mean)
D_class_entropy = D_class_mean * D_class_mean_log
D_class = tf.reduce_sum(D_class_entropy, axis=1)
D_class = -D_class
D_class = tf.reshape(D_class, shape=[1])
G_class_mean = tf.reduce_mean(D_proba_fake, axis=0, keep_dims=True)
G_class_mean_log = tf.log(G_class_mean)
G_class_entrophy = G_class_mean * G_class_mean_log
G_class = tf.reduce_sum(G_class_entrophy, axis=1)
G_class = -G_class
G_class = tf.reshape(G_class, shape=[1])
with tf.name_scope("supervised") as sub_scope:
# Getting cross entrophy for labeled data
D_labeled = Y * D_log_real
D_cross_entrophy = tf.reduce_sum(D_labeled, axis=1)
D_cross_entrophy = -D_cross_entrophy
D_supervised = tf.reduce_mean(D_cross_entrophy, axis=0)
D_supervised_weighted = value_lambda * D_supervised
D_supervised_weighted = tf.reshape(D_supervised_weighted, shape=[1])
D_loss = D_class - D_entrophy_real_mean + \
G_entrophy_fake_mean + D_supervised_weighted
G_loss = -G_class + G_entrophy_fake_mean
D_loss = -D_loss
D_solver = tf.train.AdamOptimizer().minimize(D_loss)
G_solver = tf.train.AdamOptimizer().minimize(G_loss)
# with tf.name_scope("testing") as scope:
I've done some research and asked some questions to my friends who works in a big company doing research on deep learning. As it turns out that the generative adversarial networks is not good at classification jobs. So I changed my mind and implemented it with GoogLenet. Problem solved!

CNN for cifar10 dataset in Tensorflow

I am trying to replicate results obtained by a convolutional neural network for CIFAR10 using Tensorflow, however after some epochs (~60 epochs) my performance (accuracy) is around 10%, so I do not if the CNN is well trained?
This code is based on Deep mnist for experts https://www.tensorflow.org/get_started/mnist/pros , however in Cifar10 it does not work
import numpy as np
import tensorflow as tf
def unpickle(file):
import cPickle
fo = open(file, 'rb')
dict = cPickle.load(fo)
fo.close()
return dict
#unpacking training and test data
b1 = unpickle("~/cifar-10-batches-py/data_batch_1")
b2 = unpickle("~/cifar-10-batches-py/data_batch_2")
b3 = unpickle("~/cifar-10-batches-py/data_batch_3")
b4 = unpickle("~/cifar-10-batches-py/data_batch_4")
b5 = unpickle("~/cifar-10-batches-py/data_batch_5")
test = unpickle("~/cifar-10-batches-py/test_batch")
#Preparing test data
test_data = test['data']
test_label = test['labels']
#Preparing training data
train_data = np.concatenate([b1['data'],b2['data'],b3['data'],b4['data'],b5['data']],axis=0)
train_label = np.concatenate([b1['labels'],b2['labels'],b3['labels'],b4['labels'],b5['labels']],axis=0)
#Reshaping data
train_data = np.reshape(train_data,[50000,32,32,3])
test_data = np.reshape(test_data,[10000,32,32,3])
batch_size = 100
image_width = 32
image_height = 32
channels = 3
#Constructing Graph
x = tf.placeholder(tf.float32, [None, image_width, image_height, channels])#Training Data
y = tf.placeholder(tf.int32, [None])
one_hot = tf.one_hot(y,depth=10)#Converting in one hot vectors
#Constructing CNN Layers
def weight_variable(shape):
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial)
def bias_variable(shape):
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial)
def conv2d(x, W):
return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2x2(x):
return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
#Given an input tensor of shape [batch, in_height, in_width, in_channels] and a filter / kernel tensor of shape [filter_height, filter_width, in_channels, out_channels], taken from: http://textminingonline.com/dive-into-tensorflow-part-v-deep-mnist
W_conv1 = weight_variable([7, 7, 3, 32])
b_conv1 = bias_variable([32])
h_conv1 = tf.nn.relu(conv2d(x, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)
W_conv2 = weight_variable([5, 5, 32, 32])
b_conv2 = bias_variable([32])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)
W_conv3 = weight_variable([5, 5, 32, 64])
b_conv3 = bias_variable([64])
h_conv3 = tf.nn.relu(conv2d(h_pool2, W_conv3) + b_conv3)
#Constructing MLP layers
W_fc1 = weight_variable([8 * 8 * 64, 64])
b_fc1 = bias_variable([64])
h_pool3_flat = tf.reshape(h_conv3, [-1, 8*8*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool3_flat, W_fc1) + b_fc1)
W_fc2 = weight_variable([64, 10])
b_fc2 = bias_variable([10])
y_conv = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2)
#Computing Cost function
cross_entropy = -tf.reduce_sum(one_hot*tf.log(tf.clip_by_value(y_conv,1e-10,1e20)))
train_step = tf.train.MomentumOptimizer(learning_rate = 0.0001, momentum = 0.9).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(one_hot,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
init = tf.initialize_all_variables()
sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=16))
sess.run(init)
epochs = 100
b_per = 0
row = []
for e in range(epochs):
print( "epoch", e)
avg_cost = 0
#foreach batch
for j in range(int(train_data.shape[0]/batch_size)):
subset=range((j*batch_size),((j+1)*batch_size))
data = train_data[subset,:,:,:]
label = train_label[subset]
_,c = sess.run([train_step,cross_entropy], feed_dict={x: data, y: label})
avg_cost += c / data.shape[0]
#print(avg_cost)
b_per = b_per + 1
if b_per%10==0 :
row.append(sess.run(accuracy, feed_dict={x: test_data, y: test_label }))
print(row[-1])
It is wrong in data reshape part! It should be,
# Reshaping data
train_data = train_data.reshape(50000, 3, 32, 32).transpose(
0, 2, 3, 1).astype("uint8")
test_data = test_data.reshape(10000, 3, 32, 32).transpose(
0, 2, 3, 1).astype("uint8")

Why does CNN with constant initialization learn at all?

Usually, weights for neural networks are initialized randomly so that they receive different gradients and learn different weights. In theory, if all weights are initialized the same way, all nodes will have the same weights no matter how long you train. Thus the training shouldn't work at all.
However, the code below gives 56% accuracy on MNIST after 7000 epochs. Why is that the case?
Code
#!/usr/bin/env python
"""MNIST with Tensorflow."""
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import os
import numpy as np
epochs = 20000
model_checkpoint_path = 'checkpoints/mnist_tf_model.ckpt'
def weight_variable(shape):
#initial = tf.truncated_normal(shape, stddev=0.01)
initial = tf.constant(0.0, shape=shape)
return tf.get_variable(initializer=initial, name='weights')
def bias_variable(shape):
initial = tf.constant(0.1, shape=shape)
return tf.get_variable(initializer=initial, name='biases')
def conv2d(x, W):
return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2x2(x):
return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1], padding='SAME')
def eval_network(sess, summary_writer, dataset, correct_prediction, epoch):
correct_sum = 0
total_test = 0
training_summary = tf.get_default_graph().get_tensor_by_name("training_accuracy:0")
loss_summary = tf.get_default_graph().get_tensor_by_name("loss:0")
for i in range(dataset.labels.shape[0] / 1000):
feed_dict = {x: dataset.images[i * 1000:(i + 1) * 1000],
y_: dataset.labels[i * 1000:(i + 1) * 1000]}
[test_correct, train_summ, loss_summ] = sess.run([correct_prediction,
training_summary,
loss_summary],
feed_dict=feed_dict)
summary_writer.add_summary(train_summ, epoch)
summary_writer.add_summary(loss_summ, epoch)
test_correct = correct_prediction.eval(feed_dict=feed_dict)
correct_sum += sum(test_correct)
total_test += len(test_correct)
return float(correct_sum) / total_test
def log_score(sess, summary_writer, filename, mnist, scoring, epoch):
with open(filename, "a") as myfile:
train = eval_network(sess, summary_writer, mnist.train, scoring, epoch)
test = eval_network(sess, summary_writer, mnist.test, scoring, epoch)
myfile.write("%i;%0.6f;%0.6f\n" % (epoch, train, test))
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
with tf.Session() as sess:
x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, shape=[None, 10])
x_image = tf.reshape(x, [-1, 28, 28, 1])
with tf.variable_scope('conv1') as scope:
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1, name='ReLU1')
h_pool1 = max_pool_2x2(h_conv1)
with tf.variable_scope('conv2') as scope:
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2, name='ReLU2')
h_pool2 = max_pool_2x2(h_conv2)
with tf.variable_scope('fc1'):
W_fc1 = weight_variable([7 * 7 * 64, 1024])
b_fc1 = bias_variable([1024])
h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
with tf.variable_scope('softmax'):
W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])
y_conv = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2)
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv * 10**-7),
reduction_indices=[1]))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
tf.scalar_summary("training_accuracy", accuracy, name="training_accuracy")
tf.scalar_summary("loss", cross_entropy, name="loss")
summary_writer = tf.train.SummaryWriter('summary_dir', sess.graph)
sess.run(tf.initialize_all_variables())
for i in range(epochs):
batch = mnist.train.next_batch(50)
if i % 100 == 0:
log_score(sess, summary_writer,
'validation-curve-accuracy.csv',
mnist, correct_prediction, i)
train_step.run(feed_dict={x: batch[0],
y_: batch[1]})
log_score(sess, summary_writer, 'validation-curve-accuracy.csv',
mnist, correct_prediction, epochs)
Plots
Nr 1
After adding 10**-7 to the tf.log(..) term, the NANs are gone:
Nr 2
This is an old plot which did have a problem due to log(0) after 16k epochs.
The loss is plotted here. The triangles are NANs.
Here is the accuracy - due to the smoothing, it does not directly fall to ~10%.

Resources