Simple linear regression with TensorFlow
I am a beginner in TensorFlow and machine learning, and I want to try a simple linear regression example with TensorFlow.
But the loss stops decreasing after about epoch 3700, and I don't know what is wrong.
Apparently we get W = 3.52 and b = 2.8865, so y = 3.52*x + 2.8865. For the test input x = 11 this gives y = 41.6065, which must be wrong, because the training data already reach y = 48.712 at x = 10.
The code and loss output are posted below.
#Goal: predict the house price in 2017 by linear regression method
#Step: 1. load the original data
# 2. define the placeholder and variable
# 3. linear regression method
# 4. launch the graph
from __future__ import print_function
import os
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
# 1. load the original data
price = np.asarray([6.757, 12.358, 10.091, 11.618, 14.064,
16.926, 17.673, 22.271, 26.905, 34.742, 48.712])
year = np.asarray([0,1,2,3,4,5,6,7,8,9,10])
n_samples = price.shape[0]
# 2. define the placeholder and variable
x = tf.placeholder("float")
y_ = tf.placeholder("float")
W = tf.Variable(np.random.randn())
b = tf.Variable(np.random.randn())
# 3. linear regression method
y = tf.add(tf.multiply(x, W), b)
loss = tf.reduce_mean(tf.square(y - y_))/(2*n_samples)
training_step = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
# 4. launch the graph
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(10000):
        for (year_epoch, price_epoch) in zip(year, price):
            sess.run(training_step, feed_dict = {x: year_epoch, y_: price_epoch})
        if (epoch+1) % 50 == 0:
            loss_np = sess.run(loss, feed_dict={x: year, y_: price})
            print("Epoch: ", '%04d' % (epoch+1), "loss = ", "{:.9f}".format(loss_np), "W = ", sess.run(W), "b = ", sess.run(b))
    # print "Training finish"
    training_loss = sess.run(loss, feed_dict = {x: year, y_: price})
    print("Training cost = ", training_loss, "W = ", sess.run(W), "b = ", sess.run(b), '\n')
And the loss is:
Epoch: 0050 loss = 1.231071353 W = 3.88227 b = 0.289058
Epoch: 0100 loss = 1.207471132 W = 3.83516 b = 0.630129
Epoch: 0150 loss = 1.189429402 W = 3.79423 b = 0.926415
Epoch: 0200 loss = 1.175611973 W = 3.75868 b = 1.1838
Epoch: 0250 loss = 1.165009260 W = 3.72779 b = 1.40738
Epoch: 0300 loss = 1.156855702 W = 3.70096 b = 1.60161
Epoch: 0350 loss = 1.150570631 W = 3.67766 b = 1.77033
Epoch: 0400 loss = 1.145712137 W = 3.65741 b = 1.9169
Epoch: 0450 loss = 1.141945601 W = 3.63982 b = 2.04422
Epoch: 0500 loss = 1.139016271 W = 3.62455 b = 2.15483
Epoch: 0550 loss = 1.136731029 W = 3.61127 b = 2.25091
Epoch: 0600 loss = 1.134940267 W = 3.59974 b = 2.33437
Epoch: 0650 loss = 1.133531928 W = 3.58973 b = 2.40688
Epoch: 0700 loss = 1.132419944 W = 3.58103 b = 2.46986
Epoch: 0750 loss = 1.131537557 W = 3.57347 b = 2.52458
Epoch: 0800 loss = 1.130834818 W = 3.5669 b = 2.57211
Epoch: 0850 loss = 1.130271792 W = 3.5612 b = 2.6134
Epoch: 0900 loss = 1.129818439 W = 3.55625 b = 2.64927
Epoch: 0950 loss = 1.129452229 W = 3.55194 b = 2.68042
Epoch: 1000 loss = 1.129154325 W = 3.5482 b = 2.70749
Epoch: 1050 loss = 1.128911495 W = 3.54496 b = 2.731
Epoch: 1100 loss = 1.128711581 W = 3.54213 b = 2.75143
Epoch: 1150 loss = 1.128546953 W = 3.53968 b = 2.76917
Epoch: 1200 loss = 1.128411174 W = 3.53755 b = 2.78458
Epoch: 1250 loss = 1.128297567 W = 3.53571 b = 2.79797
Epoch: 1300 loss = 1.128202677 W = 3.5341 b = 2.8096
Epoch: 1350 loss = 1.128123403 W = 3.5327 b = 2.81971
Epoch: 1400 loss = 1.128056765 W = 3.53149 b = 2.82849
Epoch: 1450 loss = 1.128000259 W = 3.53044 b = 2.83611
Epoch: 1500 loss = 1.127952814 W = 3.52952 b = 2.84274
Epoch: 1550 loss = 1.127912283 W = 3.52873 b = 2.84849
Epoch: 1600 loss = 1.127877355 W = 3.52804 b = 2.85349
Epoch: 1650 loss = 1.127847791 W = 3.52744 b = 2.85783
Epoch: 1700 loss = 1.127822518 W = 3.52692 b = 2.8616
Epoch: 1750 loss = 1.127801418 W = 3.52646 b = 2.86488
Epoch: 1800 loss = 1.127782702 W = 3.52607 b = 2.86773
Epoch: 1850 loss = 1.127766728 W = 3.52573 b = 2.8702
Epoch: 1900 loss = 1.127753139 W = 3.52543 b = 2.87234
Epoch: 1950 loss = 1.127740979 W = 3.52517 b = 2.87421
Epoch: 2000 loss = 1.127731323 W = 3.52495 b = 2.87584
Epoch: 2050 loss = 1.127722263 W = 3.52475 b = 2.87725
Epoch: 2100 loss = 1.127714872 W = 3.52459 b = 2.87847
Epoch: 2150 loss = 1.127707958 W = 3.52444 b = 2.87953
Epoch: 2200 loss = 1.127702117 W = 3.52431 b = 2.88045
Epoch: 2250 loss = 1.127697825 W = 3.5242 b = 2.88126
Epoch: 2300 loss = 1.127693415 W = 3.52411 b = 2.88195
Epoch: 2350 loss = 1.127689362 W = 3.52402 b = 2.88255
Epoch: 2400 loss = 1.127686620 W = 3.52395 b = 2.88307
Epoch: 2450 loss = 1.127683759 W = 3.52389 b = 2.88352
Epoch: 2500 loss = 1.127680898 W = 3.52383 b = 2.88391
Epoch: 2550 loss = 1.127679348 W = 3.52379 b = 2.88425
Epoch: 2600 loss = 1.127677798 W = 3.52374 b = 2.88456
Epoch: 2650 loss = 1.127675653 W = 3.52371 b = 2.88483
Epoch: 2700 loss = 1.127674222 W = 3.52368 b = 2.88507
Epoch: 2750 loss = 1.127673268 W = 3.52365 b = 2.88526
Epoch: 2800 loss = 1.127672315 W = 3.52362 b = 2.88543
Epoch: 2850 loss = 1.127671123 W = 3.5236 b = 2.88559
Epoch: 2900 loss = 1.127670288 W = 3.52358 b = 2.88572
Epoch: 2950 loss = 1.127670050 W = 3.52357 b = 2.88583
Epoch: 3000 loss = 1.127669215 W = 3.52356 b = 2.88592
Epoch: 3050 loss = 1.127668500 W = 3.52355 b = 2.88599
Epoch: 3100 loss = 1.127668381 W = 3.52354 b = 2.88606
Epoch: 3150 loss = 1.127667665 W = 3.52353 b = 2.88615
Epoch: 3200 loss = 1.127667546 W = 3.52352 b = 2.88621
Epoch: 3250 loss = 1.127667069 W = 3.52351 b = 2.88626
Epoch: 3300 loss = 1.127666950 W = 3.5235 b = 2.8863
Epoch: 3350 loss = 1.127666354 W = 3.5235 b = 2.88633
Epoch: 3400 loss = 1.127666593 W = 3.5235 b = 2.88637
Epoch: 3450 loss = 1.127666593 W = 3.52349 b = 2.8864
Epoch: 3500 loss = 1.127666235 W = 3.52349 b = 2.88644
Epoch: 3550 loss = 1.127665997 W = 3.52348 b = 2.88646
Epoch: 3600 loss = 1.127665639 W = 3.52348 b = 2.88648
Epoch: 3650 loss = 1.127665639 W = 3.52348 b = 2.88649
Epoch: 3700 loss = 1.127665997 W = 3.52348 b = 2.8865
... (epochs 3750 through 9950 all print exactly the same values: loss = 1.127665997, W = 3.52348, b = 2.8865) ...
Epoch: 10000 loss = 1.127665997 W = 3.52348 b = 2.8865
Training cost = 1.12767 W = 3.52348 b = 2.8865
Your hypothesis that the predicted output lies on a straight line is not correct. Look at a plot of year against price.
The linear hypothesis you have chosen will do its best to fit a straight line through as many of the input points as possible in order to reduce the cost. So when you test a point outside the training range, the prediction comes from that straight line, which is only the best compromise for the set of inputs you provided; a quick closed-form check of this is sketched below.
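To see this concretely, the best possible straight line can be computed in closed form with NumPy (a minimal sketch, separate from the TensorFlow code in the question):
import numpy as np

year = np.arange(11, dtype=float)
price = np.array([6.757, 12.358, 10.091, 11.618, 14.064,
                  16.926, 17.673, 22.271, 26.905, 34.742, 48.712])

# degree-1 (straight line) least-squares fit
w, b = np.polyfit(year, price, 1)
print("slope =", w, "intercept =", b)
print("extrapolation at x = 11:", w * 11 + b)
Even this exact least-squares line extrapolates at x = 11 to a value well below the last training point (48.712), because the data curve upward and no straight line can follow them.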
Now, you have mentioned two problems.
1. The cost is not going down: try reducing the learning rate; your cost will then go down further.
2. Your output for year = 11 is wrong: the reason is the one I mentioned above. What you need to do is change the hypothesis: include a square term and then check again, for example y = ax^2 + bx + c. You will get a better fit with this hypothesis; a minimal sketch of it follows below.
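A minimal sketch of that suggestion in the same TensorFlow 1.x style (an illustration with assumed hyperparameters, not code from the question; the learning rate is kept small because the x^2 feature reaches 100 for these years, and it may still need tuning or input normalization):
import numpy as np
import tensorflow as tf

year = np.arange(11, dtype=np.float32)
price = np.asarray([6.757, 12.358, 10.091, 11.618, 14.064,
                    16.926, 17.673, 22.271, 26.905, 34.742, 48.712], dtype=np.float32)

x = tf.placeholder(tf.float32)
y_ = tf.placeholder(tf.float32)
a = tf.Variable(np.random.randn(), dtype=tf.float32)
b = tf.Variable(np.random.randn(), dtype=tf.float32)
c = tf.Variable(np.random.randn(), dtype=tf.float32)

# quadratic hypothesis: y = a*x^2 + b*x + c
y = a * tf.square(x) + b * x + c
loss = tf.reduce_mean(tf.square(y - y_))
# x^2 spans 0..100, so a small step size is used to avoid divergence
training_step = tf.train.GradientDescentOptimizer(1e-4).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(20000):
        sess.run(training_step, feed_dict={x: year, y_: price})
    print("a, b, c =", sess.run([a, b, c]))
    print("loss =", sess.run(loss, feed_dict={x: year, y_: price}))
    print("prediction for year 11:", sess.run(y, feed_dict={x: 11.0}))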
Related
Neural network cost not decreasing with gradient descent
Im working on a small neural network in python and i'm having issues figuring out why the cost doesn't go down. Any ideas/hints would be appreciated import numpy as np X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) Y = np.array([[0, 1, 1, 0]]) w1 = np.array([[0.1, 0.2],[0.3,0.4],[0.5,0.6]]) b1 = np.zeros((3,1)) w2 = np.array([0.1, 0.2, 0.3]).reshape(1,3) b2 = np.zeros((1,1)) cache = {} def sigmoid(x): return 1 / (1 + np.exp(-x)) def sigmoid_prime(x): return sigmoid(x) * (1 - sigmoid(x)) def forward(X, w1, w2, b1, b2): Z1 = np.dot(w1, X) + b1 A1 = sigmoid(Z1) Z2 = np.dot(w2, A1) + b2 A2 = sigmoid(Z2) return {'x': X, 'Z1': Z1, 'A1': A1, 'Z2': Z2, 'A2': A2} def backward(X, Z1, A1, Z2, A2, error_gradient, w1, w2, b1, b2, learning_rate=0.01): # LAYER 2 dA2 = np.multiply(error_gradient, sigmoid_prime(Z2)) dZ2 = np.dot(w2.T, dA2) # update w and b for layer 2 dw2 = np.dot(dA2, A2.T) db2 = dA2 w2 -= dw2 * learning_rate b2 -= db2 * learning_rate # LAYER 1 dA1 = np.multiply(dZ2, sigmoid_prime(Z1)) dZ1 = np.dot(w1.T, dA1) # update w and b for layer 1 dw1 = np.dot(dA1, X.T) db1 = dA1 w1 -= dw1 * learning_rate b1 -= db1 * learning_rate return {'x': X, 'dZ1': dZ1, 'dA1': dA1, 'dZ2': dZ2, 'dA2': dA2, 'w2': w2, 'b2': b2, 'w1': w1, 'b1': b1} def calculate_cost(y, y_guess): cost = np.power(y - y_guess, 2) return np.squeeze(cost) def mse_prime(y, y_pred): return 2 * (y - y_pred) def predict(X, w1, w2, b1, b2): return forward(X, w1, w2, b1, b2) def train(X, Y, w1, w2, b1, b2, epochs=100, learning_rate=0.01): for epoch in range(epochs): cost = 0 for i, val in enumerate(X): x = val.reshape(2,1) out = predict(x, w1, w2, b1, b2) y_guess = out["A2"] #print(out) cost += calculate_cost(Y[0][i], y_guess) error_gradient = mse_prime(Y[0][i], y_guess) # print(error_gradient) back = backward(x, out["Z1"], out["A1"], out["Z2"], out["A2"], error_gradient, w1, w2, b1, b2) # update params w1 = back["w1"] b1 = back["b1"] w2 = back["w2"] b2 = back["b2"] print(f"epoch: {epoch + 1}/{epochs}, cost: {cost/X.shape[0]}") train(X, Y, w1, w2, b1, b2, epochs=20) Cost output epoch: 1/20, cost: 0.25703296560961486 epoch: 2/20, cost: 0.25718506279033615 epoch: 3/20, cost: 0.25734002245320176 epoch: 4/20, cost: 0.25749789408142415 epoch: 5/20, cost: 0.25765872780276317 epoch: 6/20, cost: 0.25782257438803613 epoch: 7/20, cost: 0.25798948524907084 epoch: 8/20, cost: 0.2581595124360765 epoch: 9/20, cost: 0.2583327086344036 epoch: 10/20, cost: 0.25850912716066776 epoch: 11/20, cost: 0.2586888219582088 epoch: 12/20, cost: 0.25887184759185666 epoch: 13/20, cost: 0.2590582592419748 epoch: 14/20, cost: 0.2592481126977533 epoch: 15/20, cost: 0.2594414643497189 epoch: 16/20, cost: 0.25963837118143357 epoch: 17/20, cost: 0.2598388907603498 epoch: 18/20, cost: 0.2600430812277913 epoch: 19/20, cost: 0.2602510012880266 epoch: 20/20, cost: 0.26046271019640493
Turns out I had these 2 errors MSE function: Change order in substraction (the order does matter) dw2 dot product using correct matrix (A1) here is the fully working code import numpy as np np.random.seed(1) X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) #X = np.array([[0, 0]]) Y = np.array([[0, 1, 1, 0]]) w1 = np.random.randn(3, 2) * 0.1 b1 = np.zeros((3,1)) w2 = np.random.randn(1, 3) * 0.1 b2 = np.zeros((1,1)) cache = {} def sigmoid(x): return 1 / (1 + np.exp(-x)) def sigmoid_prime(x): return sigmoid(x) * (1 - sigmoid(x)) def forward(X, w1, w2, b1, b2): Z1 = np.dot(w1, X) + b1 A1 = sigmoid(Z1) Z2 = np.dot(w2, A1) + b2 A2 = sigmoid(Z2) return {'x': X, 'Z1': Z1, 'A1': A1, 'Z2': Z2, 'A2': A2} def backward(X, Z1, A1, Z2, A2, error_gradient, w1, w2, b1, b2, learning_rate=0.1): # LAYER 2 dA2 = np.multiply(error_gradient, sigmoid_prime(Z2)) dZ2 = np.dot(w2.T, dA2) # update w and b for layer 2 dw2 = np.dot(dA2, A1.T) db2 = dA2 w2 -= dw2 * learning_rate b2 -= db2 * learning_rate # LAYER 1 dA1 = np.multiply(dZ2, sigmoid_prime(Z1)) dZ1 = np.dot(w1.T, dA1) # update w and b for layer 1 dw1 = np.dot(dA1, X.T) db1 = dA1 w1 -= dw1 * learning_rate #print(db1 * learning_rate) b1 -= db1 * learning_rate return {'x': X, 'dZ1': dZ1, 'dA1': dA1, 'dZ2': dZ2, 'dA2': dA2, 'w2': w2, 'b2': b2, 'w1': w1, 'b1': b1, 'dw2': dw2, 'db2': db2, 'dw1': dw1, 'db1': db1} def calculate_cost(y, y_guess): cost = np.power(y - y_guess, 2) return np.squeeze(cost) def mse_prime(y, y_pred): return 2 * (y_pred - y) def predict(X, w1, w2, b1, b2): return forward(X, w1, w2, b1, b2) def train(X, Y, w1, w2, b1, b2, epochs=100, learning_rate=0.01): for epoch in range(epochs): cost = 0 for i, val in enumerate(X): x = val.reshape(2,1) out = predict(x, w1, w2, b1, b2) y_guess = out["A2"] cost += calculate_cost(Y[0][i], y_guess) error_gradient = mse_prime(Y[0][i], y_guess) back = backward(x, out["Z1"], out["A1"], out["Z2"], out["A2"], error_gradient, w1, w2, b1, b2) # update params w1 = back["w1"] b1 = back["b1"] w2 = back["w2"] b2 = back["b2"] print(f"epoch: {epoch + 1}/{epochs}, cost: {cost/X.shape[0]}") train(X, Y, w1, w2, b1, b2, epochs=10000, learning_rate=0.1)
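Pulled out of the full listing above for emphasis, the two corrected pieces are:
# MSE derivative: the order of the subtraction matters (gradient of (y_pred - y)^2 w.r.t. y_pred)
def mse_prime(y, y_pred):
    return 2 * (y_pred - y)

# layer-2 weight gradient: use the layer-1 activations A1, not the layer-2 output A2
dw2 = np.dot(dA2, A1.T)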
Pytorch model doesn't learn identity function?
I wrote some models in pytorch which was not able to learn anything even after many epochs. In order to debug the problem I made a simple model which models identity function of an input. The difficulty is this model also doesn't learn nothing despite training for 50k epochs, import torch import torch.nn as nn torch.manual_seed(1) class Net(nn.Module): def __init__(self): super().__init__() self.input = nn.Linear(2,4) self.hidden = nn.Linear(4,4) self.output = nn.Linear(4,2) self.relu = nn.ReLU() self.softmax = nn.Softmax(dim=1) self.dropout = nn.Dropout(0.5) def forward(self,x): x = self.input(x) x = self.dropout(x) x = self.relu(x) x = self.hidden(x) x = self.dropout(x) x = self.relu(x) x = self.output(x) x = self.softmax(x) return x X = torch.tensor([[1,0],[1,0],[0,1],[0,1]],dtype=torch.float) net = Net() criterion = nn.CrossEntropyLoss() opt = torch.optim.Adam(net.parameters(), lr=0.001) for i in range(100000): opt.zero_grad() y = net(X) loss = criterion(y,torch.argmax(X,dim=1)) loss.backward() if i%500 ==0: print("Epoch: ",i) print(torch.argmax(y,dim=1).detach().numpy().tolist()) print("Loss: ",loss.item()) print() Output Epoch: 52500 [0, 0, 1, 0] Loss: 0.6554909944534302 Epoch: 53000 [0, 0, 0, 0] Loss: 0.7004914283752441 Epoch: 53500 [0, 0, 0, 0] Loss: 0.7156486511230469 Epoch: 54000 [0, 0, 0, 0] Loss: 0.7171240448951721 Epoch: 54500 [0, 0, 0, 0] Loss: 0.691678524017334 Epoch: 55000 [0, 0, 0, 0] Loss: 0.7301554679870605 Epoch: 55500 [0, 0, 0, 0] Loss: 0.728650689125061 What is wrong with my implementation?
There are a few mistakes: Missing optimizer.step(): optimizer.step() updates the parameters based on backpropagated gradients and other accumulated momentum and all. Usage of softmax with CrossEntropy Loss: Pytorch CrossEntropyLoss criterion combines nn.LogSoftmax() and nn.NLLLoss() in one single class. i.e. it applies softmax then takes negative log. So in your case you are taking softmax(softmax(output)). Correct way is use linear output layer while training and use softmax layer or just take argmax for prediction. High dropout value for small network: Which results in underfitting. Here's the corrected code: import torch import torch.nn as nn torch.manual_seed(1) class Net(nn.Module): def __init__(self): super().__init__() self.input = nn.Linear(2,4) self.hidden = nn.Linear(4,4) self.output = nn.Linear(4,2) self.relu = nn.ReLU() self.softmax = nn.Softmax(dim=1) # self.dropout = nn.Dropout(0.0) def forward(self,x): x = self.input(x) # x = self.dropout(x) x = self.relu(x) x = self.hidden(x) # x = self.dropout(x) x = self.relu(x) x = self.output(x) # x = self.softmax(x) return x def predict(self, x): with torch.no_grad(): out = self.forward(x) return self.softmax(out) X = torch.tensor([[1,0],[1,0],[0,1],[0,1]],dtype=torch.float) net = Net() criterion = nn.CrossEntropyLoss() opt = torch.optim.Adam(net.parameters(), lr=0.001) for i in range(100000): opt.zero_grad() y = net(X) loss = criterion(y,torch.argmax(X,dim=1)) loss.backward() # This was missing before opt.step() if i%500 ==0: print("Epoch: ",i) pred = net.predict(X) print(f'prediction: {torch.argmax(pred, dim=1).detach().numpy().tolist()}, actual: {torch.argmax(X,dim=1)}') print("Loss: ", loss.item()) Output: Epoch: 0 prediction: [0, 0, 0, 0], actual: tensor([0, 0, 1, 1]) Loss: 0.7042869329452515 Epoch: 500 prediction: [0, 0, 1, 1], actual: tensor([0, 0, 1, 1]) Loss: 0.1166711300611496 Epoch: 1000 prediction: [0, 0, 1, 1], actual: tensor([0, 0, 1, 1]) Loss: 0.05215628445148468 Epoch: 1500 prediction: [0, 0, 1, 1], actual: tensor([0, 0, 1, 1]) Loss: 0.02993333339691162 Epoch: 2000 prediction: [0, 0, 1, 1], actual: tensor([0, 0, 1, 1]) Loss: 0.01916157826781273 Epoch: 2500 prediction: [0, 0, 1, 1], actual: tensor([0, 0, 1, 1]) Loss: 0.01306679006665945 Epoch: 3000 prediction: [0, 0, 1, 1], actual: tensor([0, 0, 1, 1]) Loss: 0.009280549362301826 . . .
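A tiny standalone illustration of the second point (example values assumed here, not taken from the question): nn.CrossEntropyLoss applies log-softmax internally, so it expects raw logits and integer class targets.
import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
logits = torch.tensor([[2.0, -1.0], [0.5, 1.5]])   # raw outputs of the last Linear layer
targets = torch.tensor([0, 1])                     # class indices

print(criterion(logits, targets))                        # intended usage
print(criterion(torch.softmax(logits, dim=1), targets))  # softmax applied twice, as in the question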
Tensorflow multi-GPU MNIST classifier: low accuracy
I am stuck with multiple GPU MNIST classifier in Tensorflow. Code runs without errors, but accuracy is very poor (30%). I am new to Tensorflow so I do not know where is the problem ? GPU: 2x GTX 1080 Ti. I have found several tutorials for multiple GPU, but code is hard to follow. For this reason I am trying to develop MNIST CNN classifier from scratch. from __future__ import print_function from tensorflow.examples.tutorials.mnist import input_data import tensorflow as tf import datetime def average_gradients(tower_grads): average_grads = [] for grad_and_vars in zip(*tower_grads): # Note that each grad_and_vars looks like the following: # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) grads = [] for g, _ in grad_and_vars: # Add 0 dimension to the gradients to represent the tower. expanded_g = tf.expand_dims(g, 0) # Append on a 'tower' dimension which we will average over below. grads.append(expanded_g) # Average over the 'tower' dimension. grad = tf.concat(axis=0, values=grads) grad = tf.reduce_mean(grad, 0) # Keep in mind that the Variables are redundant because they are shared # across towers. So .. we will just return the first tower's pointer to # the Variable. v = grad_and_vars[0][1] grad_and_var = (grad, v) average_grads.append(grad_and_var) return average_grads with tf.device('/cpu:0'): x = tf.placeholder(tf.float32, [None, 784], name='x') x_img=tf.reshape(x, [-1, 28, 28, 1]) x_dict={} x_dict['x0'],x_dict['x1'] = tf.split(x_img,2) y_dict={} y = tf.placeholder(tf.float32, [None, 10], name='y') y_dict['y0'],y_dict['y1'] = tf.split(y,2) opt=tf.train.GradientDescentOptimizer(0.01) keep_prob = tf.placeholder(tf.float32) w0=tf.get_variable('w0',initializer=tf.truncated_normal([5, 5,1,32], stddev=0.1)) b0=tf.get_variable('b0',initializer=tf.zeros([32])) w1=tf.get_variable('w1',initializer=tf.truncated_normal([5,5,32,64], stddev=0.1)) b1=tf.get_variable('b1',initializer=tf.zeros([64])) w2=tf.get_variable('w2',initializer=tf.truncated_normal([7*7*64,1024], stddev=0.1)) b2=tf.get_variable('b2',initializer=tf.zeros([1024])) w3=tf.get_variable('w3',initializer=tf.truncated_normal([1024,10], stddev=0.1)) b3=tf.get_variable('b3',initializer=tf.zeros([10])) grads=[] def conv2d(xx, W): return tf.nn.conv2d(xx, W, strides=[1, 1, 1, 1], padding='SAME') def max_pool_2x2(xx): return tf.nn.max_pool(xx, ksize=[1, 2, 2, 1],strides=[1, 2, 2, 1], padding='SAME') def model_forward(xx): h_conv1=tf.nn.relu(conv2d(xx,w0)+b0); h_pool1=max_pool_2x2(h_conv1) h_conv2=tf.nn.relu(conv2d(h_pool1,w1)+b1); h_pool2=max_pool_2x2(h_conv2) h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64]) h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat,w2)+b2) h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob) y = tf.nn.sigmoid(tf.matmul(h_fc1_drop,w3)+b3) return y for i in range(0,2): with tf.device(('/gpu:{0}').format(i)): with tf.variable_scope(('scope_gpu_{0}').format(i)): yy=model_forward(x_dict[('x{0}').format(i)]) cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_dict[('y{0}').format(i)] * tf.log(yy), reduction_indices=[1])) grads.append(opt.compute_gradients(cross_entropy,tf.trainable_variables())) with tf.device('/cpu:0'): grad = average_gradients(grads) train_step = opt.apply_gradients(grad) yy=model_forward(x_dict['x0']) correct_prediction = tf.equal(tf.argmax(yy, 1), tf.argmax(y_dict['y0'], 1)) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy') def main(): mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess: 
sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter('C:\\tmp\\test\\', graph=tf.get_default_graph()) t1_1 = datetime.datetime.now() for step in range(0,10000): batch_x, batch_y = mnist.train.next_batch(100) sess.run(train_step, feed_dict={x: batch_x, y: batch_y, keep_prob: 0.5}) if (step % 200) == 0: print(step, sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels, keep_prob: 1})) t2_1 = datetime.datetime.now() print("Computation time: " + str(t2_1-t1_1)) if __name__ == "__main__": main()
The problems that I noticed: Your cross-entropy loss is wrong (see this question for details, in short you're computing binary cross-entropy). I dropped manual gradient computation in favor of tf.train.AdamOptimizer. I dropped the split of the input of x (it's not the right way to do distributed computation in tensorflow). The result model easily gets to 99% accuracy even on one GPU. from tensorflow.examples.tutorials.mnist import input_data import tensorflow as tf import datetime x = tf.placeholder(tf.float32, [None, 784], name='x') x_img = tf.reshape(x, [-1, 28, 28, 1]) y = tf.placeholder(tf.float32, [None, 10], name='y') keep_prob = tf.placeholder(tf.float32) stddev = 0.1 w0 = tf.get_variable('w0', initializer=tf.truncated_normal([5, 5, 1, 32], stddev=stddev)) b0 = tf.get_variable('b0', initializer=tf.zeros([32])) w1 = tf.get_variable('w1', initializer=tf.truncated_normal([5, 5, 32, 64], stddev=stddev)) b1 = tf.get_variable('b1', initializer=tf.zeros([64])) w2 = tf.get_variable('w2', initializer=tf.truncated_normal([7 * 7 * 64, 1024], stddev=stddev)) b2 = tf.get_variable('b2', initializer=tf.zeros([1024])) w3 = tf.get_variable('w3', initializer=tf.truncated_normal([1024, 10], stddev=stddev)) b3 = tf.get_variable('b3', initializer=tf.zeros([10])) def conv2d(xx, W): return tf.nn.conv2d(xx, W, strides=[1, 1, 1, 1], padding='SAME') def max_pool_2x2(xx): return tf.nn.max_pool(xx, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') def model_forward(xx): h_conv1 = tf.nn.relu(conv2d(xx, w0) + b0) h_pool1 = max_pool_2x2(h_conv1) h_conv2 = tf.nn.relu(conv2d(h_pool1, w1) + b1) h_pool2 = max_pool_2x2(h_conv2) h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64]) h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w2) + b2) h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob) y = tf.matmul(h_fc1_drop, w3) + b3 return y yy = model_forward(x_img) loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=yy, labels=y)) train_step = tf.train.AdamOptimizer().minimize(loss) correct_prediction = tf.equal(tf.argmax(yy, 1), tf.argmax(y, 1)) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy') def main(): mnist = input_data.read_data_sets("/home/maxim/p/data/mnist-tf", one_hot=True) with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess: sess.run(tf.global_variables_initializer()) t1_1 = datetime.datetime.now() for step in range(0, 10000): batch_x, batch_y = mnist.train.next_batch(100) sess.run(train_step, feed_dict={x: batch_x, y: batch_y, keep_prob: 0.5}) if (step % 200) == 0: print(step, sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels, keep_prob: 1})) t2_1 = datetime.datetime.now() print("Computation time: " + str(t2_1 - t1_1)) if __name__ == "__main__": main() Now, if you really want it, you can do data or model parallelism to utilize your GPU power (there is a great post about it, but sometimes it doesn't render correctly due to hosting problems).
Along with the points mentioned in the first two answers, take a look at the return average_grads statement in the average_gradients function: it returns during the first iteration of the outer for loop, so averaged gradients are produced for only the first variable (probably w0). Hence only w0 gets updated, and you get very low accuracy because the remaining variables stay at their initial values (random or zeros).
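A sketch of the corrected function, assuming the only intended change is moving the return outside both loops (the averaging logic is the same as in the question, just written more compactly):
def average_gradients(tower_grads):
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # average this variable's gradient over all towers
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(axis=0, values=grads), 0)
        # the variable is shared across towers, so keep the first tower's pointer
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads  # return only after every variable has been averaged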
This is because the model is not using the same weights & biases for inference on the CPU as on the other GPU devices. For example:
for i in range(0,2):
    with tf.device(('/gpu:{0}').format(i)):
        with tf.variable_scope(('scope_gpu_{0}').format(i)) as infer_scope:
            yy = model_forward(x_dict[('x{0}').format(i)])
            infer_scope.reuse_variables()
            cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_dict[('y{0}').format(i)] * tf.log(yy), reduction_indices=[1]))
            grads.append(opt.compute_gradients(cross_entropy, tf.trainable_variables()))
The reason you are getting low accuracy is that without reuse_variables(), every call to the model inference creates a new set of randomly initialized weights & biases in the graph, which is not what you want.
CNN for cifar10 dataset in Tensorflow
I am trying to replicate results obtained by a convolutional neural network for CIFAR10 using Tensorflow, however after some epochs (~60 epochs) my performance (accuracy) is around 10%, so I do not if the CNN is well trained? This code is based on Deep mnist for experts https://www.tensorflow.org/get_started/mnist/pros , however in Cifar10 it does not work import numpy as np import tensorflow as tf def unpickle(file): import cPickle fo = open(file, 'rb') dict = cPickle.load(fo) fo.close() return dict #unpacking training and test data b1 = unpickle("~/cifar-10-batches-py/data_batch_1") b2 = unpickle("~/cifar-10-batches-py/data_batch_2") b3 = unpickle("~/cifar-10-batches-py/data_batch_3") b4 = unpickle("~/cifar-10-batches-py/data_batch_4") b5 = unpickle("~/cifar-10-batches-py/data_batch_5") test = unpickle("~/cifar-10-batches-py/test_batch") #Preparing test data test_data = test['data'] test_label = test['labels'] #Preparing training data train_data = np.concatenate([b1['data'],b2['data'],b3['data'],b4['data'],b5['data']],axis=0) train_label = np.concatenate([b1['labels'],b2['labels'],b3['labels'],b4['labels'],b5['labels']],axis=0) #Reshaping data train_data = np.reshape(train_data,[50000,32,32,3]) test_data = np.reshape(test_data,[10000,32,32,3]) batch_size = 100 image_width = 32 image_height = 32 channels = 3 #Constructing Graph x = tf.placeholder(tf.float32, [None, image_width, image_height, channels])#Training Data y = tf.placeholder(tf.int32, [None]) one_hot = tf.one_hot(y,depth=10)#Converting in one hot vectors #Constructing CNN Layers def weight_variable(shape): initial = tf.truncated_normal(shape, stddev=0.1) return tf.Variable(initial) def bias_variable(shape): initial = tf.constant(0.1, shape=shape) return tf.Variable(initial) def conv2d(x, W): return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') def max_pool_2x2(x): return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') #Given an input tensor of shape [batch, in_height, in_width, in_channels] and a filter / kernel tensor of shape [filter_height, filter_width, in_channels, out_channels], taken from: http://textminingonline.com/dive-into-tensorflow-part-v-deep-mnist W_conv1 = weight_variable([7, 7, 3, 32]) b_conv1 = bias_variable([32]) h_conv1 = tf.nn.relu(conv2d(x, W_conv1) + b_conv1) h_pool1 = max_pool_2x2(h_conv1) W_conv2 = weight_variable([5, 5, 32, 32]) b_conv2 = bias_variable([32]) h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) h_pool2 = max_pool_2x2(h_conv2) W_conv3 = weight_variable([5, 5, 32, 64]) b_conv3 = bias_variable([64]) h_conv3 = tf.nn.relu(conv2d(h_pool2, W_conv3) + b_conv3) #Constructing MLP layers W_fc1 = weight_variable([8 * 8 * 64, 64]) b_fc1 = bias_variable([64]) h_pool3_flat = tf.reshape(h_conv3, [-1, 8*8*64]) h_fc1 = tf.nn.relu(tf.matmul(h_pool3_flat, W_fc1) + b_fc1) W_fc2 = weight_variable([64, 10]) b_fc2 = bias_variable([10]) y_conv = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2) #Computing Cost function cross_entropy = -tf.reduce_sum(one_hot*tf.log(tf.clip_by_value(y_conv,1e-10,1e20))) train_step = tf.train.MomentumOptimizer(learning_rate = 0.0001, momentum = 0.9).minimize(cross_entropy) correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(one_hot,1)) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) init = tf.initialize_all_variables() sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=16)) sess.run(init) epochs = 100 b_per = 0 row = [] for e in range(epochs): print( "epoch", e) avg_cost = 0 #foreach batch for j 
in range(int(train_data.shape[0]/batch_size)): subset=range((j*batch_size),((j+1)*batch_size)) data = train_data[subset,:,:,:] label = train_label[subset] _,c = sess.run([train_step,cross_entropy], feed_dict={x: data, y: label}) avg_cost += c / data.shape[0] #print(avg_cost) b_per = b_per + 1 if b_per%10==0 : row.append(sess.run(accuracy, feed_dict={x: test_data, y: test_label })) print(row[-1])
The data reshaping part is wrong. It should be:
# Reshaping data
train_data = train_data.reshape(50000, 3, 32, 32).transpose(0, 2, 3, 1).astype("uint8")
test_data = test_data.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("uint8")
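The reason is that each row of a CIFAR-10 python batch stores its 3072 pixel values channel-major (1024 red, then 1024 green, then 1024 blue), so the array has to be viewed as (N, 3, 32, 32) first and then transposed to the NHWC layout that tf.nn.conv2d expects by default. A quick shape check (a small illustrative sketch):
import numpy as np

row = np.arange(3072, dtype=np.uint8)             # one flattened CIFAR-10 image
img = row.reshape(3, 32, 32).transpose(1, 2, 0)   # channels-first -> (32, 32, 3)
print(img.shape)                                  # (32, 32, 3), ready for NHWC conv2d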
Why does CNN with constant initialization learn at all?
Usually, weights for neural networks are initialized randomly so that they receive different gradients and learn different weights. In theory, if all weights are initialized the same way, all nodes will have the same weights no matter how long you train. Thus the training shouldn't work at all. However, the code below gives 56% accuracy on MNIST after 7000 epochs. Why is that the case? Code #!/usr/bin/env python """MNIST with Tensorflow.""" from tensorflow.examples.tutorials.mnist import input_data import tensorflow as tf import os import numpy as np epochs = 20000 model_checkpoint_path = 'checkpoints/mnist_tf_model.ckpt' def weight_variable(shape): #initial = tf.truncated_normal(shape, stddev=0.01) initial = tf.constant(0.0, shape=shape) return tf.get_variable(initializer=initial, name='weights') def bias_variable(shape): initial = tf.constant(0.1, shape=shape) return tf.get_variable(initializer=initial, name='biases') def conv2d(x, W): return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') def max_pool_2x2(x): return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') def eval_network(sess, summary_writer, dataset, correct_prediction, epoch): correct_sum = 0 total_test = 0 training_summary = tf.get_default_graph().get_tensor_by_name("training_accuracy:0") loss_summary = tf.get_default_graph().get_tensor_by_name("loss:0") for i in range(dataset.labels.shape[0] / 1000): feed_dict = {x: dataset.images[i * 1000:(i + 1) * 1000], y_: dataset.labels[i * 1000:(i + 1) * 1000]} [test_correct, train_summ, loss_summ] = sess.run([correct_prediction, training_summary, loss_summary], feed_dict=feed_dict) summary_writer.add_summary(train_summ, epoch) summary_writer.add_summary(loss_summ, epoch) test_correct = correct_prediction.eval(feed_dict=feed_dict) correct_sum += sum(test_correct) total_test += len(test_correct) return float(correct_sum) / total_test def log_score(sess, summary_writer, filename, mnist, scoring, epoch): with open(filename, "a") as myfile: train = eval_network(sess, summary_writer, mnist.train, scoring, epoch) test = eval_network(sess, summary_writer, mnist.test, scoring, epoch) myfile.write("%i;%0.6f;%0.6f\n" % (epoch, train, test)) mnist = input_data.read_data_sets('MNIST_data', one_hot=True) with tf.Session() as sess: x = tf.placeholder(tf.float32, shape=[None, 784]) y_ = tf.placeholder(tf.float32, shape=[None, 10]) x_image = tf.reshape(x, [-1, 28, 28, 1]) with tf.variable_scope('conv1') as scope: W_conv1 = weight_variable([5, 5, 1, 32]) b_conv1 = bias_variable([32]) h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1, name='ReLU1') h_pool1 = max_pool_2x2(h_conv1) with tf.variable_scope('conv2') as scope: W_conv2 = weight_variable([5, 5, 32, 64]) b_conv2 = bias_variable([64]) h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2, name='ReLU2') h_pool2 = max_pool_2x2(h_conv2) with tf.variable_scope('fc1'): W_fc1 = weight_variable([7 * 7 * 64, 1024]) b_fc1 = bias_variable([1024]) h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64]) h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1) with tf.variable_scope('softmax'): W_fc2 = weight_variable([1024, 10]) b_fc2 = bias_variable([10]) y_conv = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2) cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv * 10**-7), reduction_indices=[1])) train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy) correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1)) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 
tf.scalar_summary("training_accuracy", accuracy, name="training_accuracy") tf.scalar_summary("loss", cross_entropy, name="loss") summary_writer = tf.train.SummaryWriter('summary_dir', sess.graph) sess.run(tf.initialize_all_variables()) for i in range(epochs): batch = mnist.train.next_batch(50) if i % 100 == 0: log_score(sess, summary_writer, 'validation-curve-accuracy.csv', mnist, correct_prediction, i) train_step.run(feed_dict={x: batch[0], y_: batch[1]}) log_score(sess, summary_writer, 'validation-curve-accuracy.csv', mnist, correct_prediction, epochs) Plots Nr 1 After adding 10**-7 to the tf.log(..) term, the NANs are gone: Nr 2 This is an old plot which did have a problem due to log(0) after 16k epochs. The loss is plotted here. The triangles are NANs. Here is the accuracy - due to the smoothing, it does not directly fall to ~10%.