Simple linear regression with TensorFlow

I am a beginner in TensorFlow and machine learning, and I wanted to try a simple linear regression example in TensorFlow.
However, the loss stops decreasing after about 3700 epochs, and I don't know what's wrong.
The model converges to W = 3.52 and b = 2.8865, so y = 3.52*x + 2.8865. For a test input of x = 11 this predicts y = 41.6065, which must be wrong, because the training data already has y = 48.712 at x = 10.
The code and the loss output are posted below.
# Goal: predict the house price in 2017 by the linear regression method
# Steps: 1. load the original data
#        2. define the placeholders and variables
#        3. linear regression method
#        4. launch the graph
from __future__ import print_function
import os
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# 1. load the original data
price = np.asarray([6.757, 12.358, 10.091, 11.618, 14.064,
                    16.926, 17.673, 22.271, 26.905, 34.742, 48.712])
year = np.asarray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
n_samples = price.shape[0]

# 2. define the placeholders and variables
x = tf.placeholder("float")
y_ = tf.placeholder("float")
W = tf.Variable(np.random.randn())
b = tf.Variable(np.random.randn())

# 3. linear regression method
y = tf.add(tf.multiply(x, W), b)
loss = tf.reduce_mean(tf.square(y - y_)) / (2 * n_samples)
training_step = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

# 4. launch the graph
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(10000):
        for (year_epoch, price_epoch) in zip(year, price):
            sess.run(training_step, feed_dict={x: year_epoch, y_: price_epoch})
        if (epoch + 1) % 50 == 0:
            loss_np = sess.run(loss, feed_dict={x: year, y_: price})
            print("Epoch: ", '%04d' % (epoch + 1), "loss = ", "{:.9f}".format(loss_np),
                  "W = ", sess.run(W), "b = ", sess.run(b))

    # print("Training finish")
    training_loss = sess.run(loss, feed_dict={x: year, y_: price})
    print("Training cost = ", training_loss, "W = ", sess.run(W), "b = ", sess.run(b), '\n')
And the loss is:
Epoch: 0050 loss = 1.231071353 W = 3.88227 b = 0.289058
Epoch: 0100 loss = 1.207471132 W = 3.83516 b = 0.630129
Epoch: 0150 loss = 1.189429402 W = 3.79423 b = 0.926415
Epoch: 0200 loss = 1.175611973 W = 3.75868 b = 1.1838
Epoch: 0250 loss = 1.165009260 W = 3.72779 b = 1.40738
Epoch: 0300 loss = 1.156855702 W = 3.70096 b = 1.60161
Epoch: 0350 loss = 1.150570631 W = 3.67766 b = 1.77033
Epoch: 0400 loss = 1.145712137 W = 3.65741 b = 1.9169
Epoch: 0450 loss = 1.141945601 W = 3.63982 b = 2.04422
Epoch: 0500 loss = 1.139016271 W = 3.62455 b = 2.15483
Epoch: 0550 loss = 1.136731029 W = 3.61127 b = 2.25091
Epoch: 0600 loss = 1.134940267 W = 3.59974 b = 2.33437
Epoch: 0650 loss = 1.133531928 W = 3.58973 b = 2.40688
Epoch: 0700 loss = 1.132419944 W = 3.58103 b = 2.46986
Epoch: 0750 loss = 1.131537557 W = 3.57347 b = 2.52458
Epoch: 0800 loss = 1.130834818 W = 3.5669 b = 2.57211
Epoch: 0850 loss = 1.130271792 W = 3.5612 b = 2.6134
Epoch: 0900 loss = 1.129818439 W = 3.55625 b = 2.64927
Epoch: 0950 loss = 1.129452229 W = 3.55194 b = 2.68042
Epoch: 1000 loss = 1.129154325 W = 3.5482 b = 2.70749
Epoch: 1050 loss = 1.128911495 W = 3.54496 b = 2.731
Epoch: 1100 loss = 1.128711581 W = 3.54213 b = 2.75143
Epoch: 1150 loss = 1.128546953 W = 3.53968 b = 2.76917
Epoch: 1200 loss = 1.128411174 W = 3.53755 b = 2.78458
Epoch: 1250 loss = 1.128297567 W = 3.53571 b = 2.79797
Epoch: 1300 loss = 1.128202677 W = 3.5341 b = 2.8096
Epoch: 1350 loss = 1.128123403 W = 3.5327 b = 2.81971
Epoch: 1400 loss = 1.128056765 W = 3.53149 b = 2.82849
Epoch: 1450 loss = 1.128000259 W = 3.53044 b = 2.83611
Epoch: 1500 loss = 1.127952814 W = 3.52952 b = 2.84274
Epoch: 1550 loss = 1.127912283 W = 3.52873 b = 2.84849
Epoch: 1600 loss = 1.127877355 W = 3.52804 b = 2.85349
Epoch: 1650 loss = 1.127847791 W = 3.52744 b = 2.85783
Epoch: 1700 loss = 1.127822518 W = 3.52692 b = 2.8616
Epoch: 1750 loss = 1.127801418 W = 3.52646 b = 2.86488
Epoch: 1800 loss = 1.127782702 W = 3.52607 b = 2.86773
Epoch: 1850 loss = 1.127766728 W = 3.52573 b = 2.8702
Epoch: 1900 loss = 1.127753139 W = 3.52543 b = 2.87234
Epoch: 1950 loss = 1.127740979 W = 3.52517 b = 2.87421
Epoch: 2000 loss = 1.127731323 W = 3.52495 b = 2.87584
Epoch: 2050 loss = 1.127722263 W = 3.52475 b = 2.87725
Epoch: 2100 loss = 1.127714872 W = 3.52459 b = 2.87847
Epoch: 2150 loss = 1.127707958 W = 3.52444 b = 2.87953
Epoch: 2200 loss = 1.127702117 W = 3.52431 b = 2.88045
Epoch: 2250 loss = 1.127697825 W = 3.5242 b = 2.88126
Epoch: 2300 loss = 1.127693415 W = 3.52411 b = 2.88195
Epoch: 2350 loss = 1.127689362 W = 3.52402 b = 2.88255
Epoch: 2400 loss = 1.127686620 W = 3.52395 b = 2.88307
Epoch: 2450 loss = 1.127683759 W = 3.52389 b = 2.88352
Epoch: 2500 loss = 1.127680898 W = 3.52383 b = 2.88391
Epoch: 2550 loss = 1.127679348 W = 3.52379 b = 2.88425
Epoch: 2600 loss = 1.127677798 W = 3.52374 b = 2.88456
Epoch: 2650 loss = 1.127675653 W = 3.52371 b = 2.88483
Epoch: 2700 loss = 1.127674222 W = 3.52368 b = 2.88507
Epoch: 2750 loss = 1.127673268 W = 3.52365 b = 2.88526
Epoch: 2800 loss = 1.127672315 W = 3.52362 b = 2.88543
Epoch: 2850 loss = 1.127671123 W = 3.5236 b = 2.88559
Epoch: 2900 loss = 1.127670288 W = 3.52358 b = 2.88572
Epoch: 2950 loss = 1.127670050 W = 3.52357 b = 2.88583
Epoch: 3000 loss = 1.127669215 W = 3.52356 b = 2.88592
Epoch: 3050 loss = 1.127668500 W = 3.52355 b = 2.88599
Epoch: 3100 loss = 1.127668381 W = 3.52354 b = 2.88606
Epoch: 3150 loss = 1.127667665 W = 3.52353 b = 2.88615
Epoch: 3200 loss = 1.127667546 W = 3.52352 b = 2.88621
Epoch: 3250 loss = 1.127667069 W = 3.52351 b = 2.88626
Epoch: 3300 loss = 1.127666950 W = 3.5235 b = 2.8863
Epoch: 3350 loss = 1.127666354 W = 3.5235 b = 2.88633
Epoch: 3400 loss = 1.127666593 W = 3.5235 b = 2.88637
Epoch: 3450 loss = 1.127666593 W = 3.52349 b = 2.8864
Epoch: 3500 loss = 1.127666235 W = 3.52349 b = 2.88644
Epoch: 3550 loss = 1.127665997 W = 3.52348 b = 2.88646
Epoch: 3600 loss = 1.127665639 W = 3.52348 b = 2.88648
Epoch: 3650 loss = 1.127665639 W = 3.52348 b = 2.88649
Epoch: 3700 loss = 1.127665997 W = 3.52348 b = 2.8865
... (epochs 3750 through 9950 all print the same values: loss = 1.127665997, W = 3.52348, b = 2.8865) ...
Epoch: 10000 loss = 1.127665997 W = 3.52348 b = 2.8865
Training cost = 1.12767 W = 3.52348 b = 2.8865

Your hypothesis that the predicted output lies on a straight line is not correct. Plot year against price and look at the shape of the data.
The linear hypothesis you have chosen will do its best to fit a straight line through as many input points as possible in order to reduce the cost. So when you test a point outside the training range, the prediction comes from that straight line, which is only the best compromise for the inputs you provided.
Now, you have mentioned two problems.
1. The cost is not going down: try reducing the learning rate; the cost will keep decreasing.
2. Your output for year = 11 is wrong: the reason is the one given above. You need to change the hypothesis. Include a square term and check again, for example y = ax^2 + bx + c. You will get a better fit with this hypothesis equation, as sketched below.
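A minimal sketch of that quadratic hypothesis in the same TF 1.x style, reusing the placeholders x and y_ from the question (the names a, b, c and the learning rate are illustrative, not taken from the question; since year^2 reaches 100 you may need a smaller learning rate or feature scaling to keep gradient descent stable):

a = tf.Variable(np.random.randn(), dtype=tf.float32)
b = tf.Variable(np.random.randn(), dtype=tf.float32)
c = tf.Variable(np.random.randn(), dtype=tf.float32)

y = a * tf.square(x) + b * x + c              # quadratic model: y = a*x^2 + b*x + c
loss = tf.reduce_mean(tf.square(y - y_))      # mean squared error
training_step = tf.train.GradientDescentOptimizer(0.001).minimize(loss)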

Related

Neural network cost not decreasing with gradient descent

I'm working on a small neural network in Python and I'm having issues figuring out why the cost doesn't go down. Any ideas or hints would be appreciated.
import numpy as np

X = np.array([[0, 0],
              [0, 1],
              [1, 0],
              [1, 1]])
Y = np.array([[0, 1, 1, 0]])

w1 = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
b1 = np.zeros((3, 1))
w2 = np.array([0.1, 0.2, 0.3]).reshape(1, 3)
b2 = np.zeros((1, 1))

cache = {}

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_prime(x):
    return sigmoid(x) * (1 - sigmoid(x))

def forward(X, w1, w2, b1, b2):
    Z1 = np.dot(w1, X) + b1
    A1 = sigmoid(Z1)
    Z2 = np.dot(w2, A1) + b2
    A2 = sigmoid(Z2)
    return {'x': X, 'Z1': Z1, 'A1': A1, 'Z2': Z2, 'A2': A2}

def backward(X, Z1, A1, Z2, A2, error_gradient, w1, w2, b1, b2, learning_rate=0.01):
    # LAYER 2
    dA2 = np.multiply(error_gradient, sigmoid_prime(Z2))
    dZ2 = np.dot(w2.T, dA2)
    # update w and b for layer 2
    dw2 = np.dot(dA2, A2.T)
    db2 = dA2
    w2 -= dw2 * learning_rate
    b2 -= db2 * learning_rate
    # LAYER 1
    dA1 = np.multiply(dZ2, sigmoid_prime(Z1))
    dZ1 = np.dot(w1.T, dA1)
    # update w and b for layer 1
    dw1 = np.dot(dA1, X.T)
    db1 = dA1
    w1 -= dw1 * learning_rate
    b1 -= db1 * learning_rate
    return {'x': X, 'dZ1': dZ1, 'dA1': dA1, 'dZ2': dZ2, 'dA2': dA2,
            'w2': w2, 'b2': b2, 'w1': w1, 'b1': b1}

def calculate_cost(y, y_guess):
    cost = np.power(y - y_guess, 2)
    return np.squeeze(cost)

def mse_prime(y, y_pred):
    return 2 * (y - y_pred)

def predict(X, w1, w2, b1, b2):
    return forward(X, w1, w2, b1, b2)

def train(X, Y, w1, w2, b1, b2, epochs=100, learning_rate=0.01):
    for epoch in range(epochs):
        cost = 0
        for i, val in enumerate(X):
            x = val.reshape(2, 1)
            out = predict(x, w1, w2, b1, b2)
            y_guess = out["A2"]
            #print(out)
            cost += calculate_cost(Y[0][i], y_guess)
            error_gradient = mse_prime(Y[0][i], y_guess)
            # print(error_gradient)
            back = backward(x, out["Z1"], out["A1"], out["Z2"], out["A2"], error_gradient, w1, w2, b1, b2)
            # update params
            w1 = back["w1"]
            b1 = back["b1"]
            w2 = back["w2"]
            b2 = back["b2"]
        print(f"epoch: {epoch + 1}/{epochs}, cost: {cost/X.shape[0]}")

train(X, Y, w1, w2, b1, b2, epochs=20)
Cost output
epoch: 1/20, cost: 0.25703296560961486
epoch: 2/20, cost: 0.25718506279033615
epoch: 3/20, cost: 0.25734002245320176
epoch: 4/20, cost: 0.25749789408142415
epoch: 5/20, cost: 0.25765872780276317
epoch: 6/20, cost: 0.25782257438803613
epoch: 7/20, cost: 0.25798948524907084
epoch: 8/20, cost: 0.2581595124360765
epoch: 9/20, cost: 0.2583327086344036
epoch: 10/20, cost: 0.25850912716066776
epoch: 11/20, cost: 0.2586888219582088
epoch: 12/20, cost: 0.25887184759185666
epoch: 13/20, cost: 0.2590582592419748
epoch: 14/20, cost: 0.2592481126977533
epoch: 15/20, cost: 0.2594414643497189
epoch: 16/20, cost: 0.25963837118143357
epoch: 17/20, cost: 0.2598388907603498
epoch: 18/20, cost: 0.2600430812277913
epoch: 19/20, cost: 0.2602510012880266
epoch: 20/20, cost: 0.26046271019640493
It turns out I had these two errors:
MSE derivative: the order of the subtraction matters (it should be prediction minus target).
dw2: the dot product must use the correct matrix (A1 instead of A2).
Here is the fully working code:
import numpy as np

np.random.seed(1)

X = np.array([[0, 0],
              [0, 1],
              [1, 0],
              [1, 1]])
#X = np.array([[0, 0]])
Y = np.array([[0, 1, 1, 0]])

w1 = np.random.randn(3, 2) * 0.1
b1 = np.zeros((3, 1))
w2 = np.random.randn(1, 3) * 0.1
b2 = np.zeros((1, 1))

cache = {}

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_prime(x):
    return sigmoid(x) * (1 - sigmoid(x))

def forward(X, w1, w2, b1, b2):
    Z1 = np.dot(w1, X) + b1
    A1 = sigmoid(Z1)
    Z2 = np.dot(w2, A1) + b2
    A2 = sigmoid(Z2)
    return {'x': X, 'Z1': Z1, 'A1': A1, 'Z2': Z2, 'A2': A2}

def backward(X, Z1, A1, Z2, A2, error_gradient, w1, w2, b1, b2, learning_rate=0.1):
    # LAYER 2
    dA2 = np.multiply(error_gradient, sigmoid_prime(Z2))
    dZ2 = np.dot(w2.T, dA2)
    # update w and b for layer 2
    dw2 = np.dot(dA2, A1.T)
    db2 = dA2
    w2 -= dw2 * learning_rate
    b2 -= db2 * learning_rate
    # LAYER 1
    dA1 = np.multiply(dZ2, sigmoid_prime(Z1))
    dZ1 = np.dot(w1.T, dA1)
    # update w and b for layer 1
    dw1 = np.dot(dA1, X.T)
    db1 = dA1
    w1 -= dw1 * learning_rate
    #print(db1 * learning_rate)
    b1 -= db1 * learning_rate
    return {'x': X, 'dZ1': dZ1, 'dA1': dA1, 'dZ2': dZ2, 'dA2': dA2,
            'w2': w2, 'b2': b2, 'w1': w1, 'b1': b1, 'dw2': dw2, 'db2': db2, 'dw1': dw1, 'db1': db1}

def calculate_cost(y, y_guess):
    cost = np.power(y - y_guess, 2)
    return np.squeeze(cost)

def mse_prime(y, y_pred):
    return 2 * (y_pred - y)

def predict(X, w1, w2, b1, b2):
    return forward(X, w1, w2, b1, b2)

def train(X, Y, w1, w2, b1, b2, epochs=100, learning_rate=0.01):
    for epoch in range(epochs):
        cost = 0
        for i, val in enumerate(X):
            x = val.reshape(2, 1)
            out = predict(x, w1, w2, b1, b2)
            y_guess = out["A2"]
            cost += calculate_cost(Y[0][i], y_guess)
            error_gradient = mse_prime(Y[0][i], y_guess)
            back = backward(x, out["Z1"], out["A1"], out["Z2"], out["A2"], error_gradient, w1, w2, b1, b2)
            # update params
            w1 = back["w1"]
            b1 = back["b1"]
            w2 = back["w2"]
            b2 = back["b2"]
        print(f"epoch: {epoch + 1}/{epochs}, cost: {cost/X.shape[0]}")

train(X, Y, w1, w2, b1, b2, epochs=10000, learning_rate=0.1)
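As a quick sanity check (a hedged sketch reusing the globals above, which backward updates in place), the trained network's outputs for the four XOR inputs should approach 0, 1, 1, 0:

for row in X:
    out = predict(row.reshape(2, 1), w1, w2, b1, b2)
    print(row, "->", out["A2"].item())   # prints something close to 0, 1, 1, 0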

Pytorch model doesn't learn identity function?

I wrote some models in PyTorch that were not able to learn anything even after many epochs. To debug the problem I made a simple model that should learn the identity function of its input. The difficulty is that this model also learns nothing despite training for 50k epochs:
import torch
import torch.nn as nn

torch.manual_seed(1)

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.input = nn.Linear(2, 4)
        self.hidden = nn.Linear(4, 4)
        self.output = nn.Linear(4, 2)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.input(x)
        x = self.dropout(x)
        x = self.relu(x)
        x = self.hidden(x)
        x = self.dropout(x)
        x = self.relu(x)
        x = self.output(x)
        x = self.softmax(x)
        return x

X = torch.tensor([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=torch.float)
net = Net()
criterion = nn.CrossEntropyLoss()
opt = torch.optim.Adam(net.parameters(), lr=0.001)
for i in range(100000):
    opt.zero_grad()
    y = net(X)
    loss = criterion(y, torch.argmax(X, dim=1))
    loss.backward()
    if i % 500 == 0:
        print("Epoch: ", i)
        print(torch.argmax(y, dim=1).detach().numpy().tolist())
        print("Loss: ", loss.item())
        print()
Output
Epoch: 52500
[0, 0, 1, 0]
Loss: 0.6554909944534302
Epoch: 53000
[0, 0, 0, 0]
Loss: 0.7004914283752441
Epoch: 53500
[0, 0, 0, 0]
Loss: 0.7156486511230469
Epoch: 54000
[0, 0, 0, 0]
Loss: 0.7171240448951721
Epoch: 54500
[0, 0, 0, 0]
Loss: 0.691678524017334
Epoch: 55000
[0, 0, 0, 0]
Loss: 0.7301554679870605
Epoch: 55500
[0, 0, 0, 0]
Loss: 0.728650689125061
What is wrong with my implementation?
There are a few mistakes:
Missing optimizer.step():
optimizer.step() updates the parameters using the backpropagated gradients (and any accumulated momentum); without it the weights never change.
Usage of softmax with CrossEntropyLoss:
PyTorch's CrossEntropyLoss criterion combines nn.LogSoftmax() and nn.NLLLoss() in a single class, i.e. it applies softmax and then takes the negative log. So in your case you are computing softmax(softmax(output)). The correct way is to use a linear output layer while training and apply softmax (or just take the argmax) for prediction.
High dropout value for a small network:
This results in underfitting.
Here's the corrected code:
import torch
import torch.nn as nn

torch.manual_seed(1)

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.input = nn.Linear(2, 4)
        self.hidden = nn.Linear(4, 4)
        self.output = nn.Linear(4, 2)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        # self.dropout = nn.Dropout(0.0)

    def forward(self, x):
        x = self.input(x)
        # x = self.dropout(x)
        x = self.relu(x)
        x = self.hidden(x)
        # x = self.dropout(x)
        x = self.relu(x)
        x = self.output(x)
        # x = self.softmax(x)
        return x

    def predict(self, x):
        with torch.no_grad():
            out = self.forward(x)
            return self.softmax(out)

X = torch.tensor([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=torch.float)
net = Net()
criterion = nn.CrossEntropyLoss()
opt = torch.optim.Adam(net.parameters(), lr=0.001)
for i in range(100000):
    opt.zero_grad()
    y = net(X)
    loss = criterion(y, torch.argmax(X, dim=1))
    loss.backward()
    # This was missing before
    opt.step()
    if i % 500 == 0:
        print("Epoch: ", i)
        pred = net.predict(X)
        print(f'prediction: {torch.argmax(pred, dim=1).detach().numpy().tolist()}, actual: {torch.argmax(X, dim=1)}')
        print("Loss: ", loss.item())
Output:
Epoch: 0
prediction: [0, 0, 0, 0], actual: tensor([0, 0, 1, 1])
Loss: 0.7042869329452515
Epoch: 500
prediction: [0, 0, 1, 1], actual: tensor([0, 0, 1, 1])
Loss: 0.1166711300611496
Epoch: 1000
prediction: [0, 0, 1, 1], actual: tensor([0, 0, 1, 1])
Loss: 0.05215628445148468
Epoch: 1500
prediction: [0, 0, 1, 1], actual: tensor([0, 0, 1, 1])
Loss: 0.02993333339691162
Epoch: 2000
prediction: [0, 0, 1, 1], actual: tensor([0, 0, 1, 1])
Loss: 0.01916157826781273
Epoch: 2500
prediction: [0, 0, 1, 1], actual: tensor([0, 0, 1, 1])
Loss: 0.01306679006665945
Epoch: 3000
prediction: [0, 0, 1, 1], actual: tensor([0, 0, 1, 1])
Loss: 0.009280549362301826
.
.
.
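To illustrate the second point above, here is a hedged, standalone sketch (not part of the answer): nn.CrossEntropyLoss expects raw logits, and feeding it already-softmaxed outputs compresses the scores and weakens the gradient signal.

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
logits = torch.tensor([[3.0, -2.0]])   # confident raw output for class 0
target = torch.tensor([0])

print(criterion(logits, target).item())                          # small loss, as expected
print(criterion(torch.softmax(logits, dim=1), target).item())    # larger loss: the double softmax flattens the scores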

Tensorflow multi-GPU MNIST classifier: low accuracy

I am stuck with a multi-GPU MNIST classifier in TensorFlow. The code runs without errors, but the accuracy is very poor (30%). I am new to TensorFlow, so I do not know where the problem is. GPUs: 2x GTX 1080 Ti.
I have found several tutorials for multiple GPUs, but their code is hard to follow. For this reason I am trying to develop an MNIST CNN classifier from scratch.
from __future__ import print_function
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import datetime

def average_gradients(tower_grads):
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        grads = []
        for g, _ in grad_and_vars:
            # Add 0 dimension to the gradients to represent the tower.
            expanded_g = tf.expand_dims(g, 0)
            # Append on a 'tower' dimension which we will average over below.
            grads.append(expanded_g)
        # Average over the 'tower' dimension.
        grad = tf.concat(axis=0, values=grads)
        grad = tf.reduce_mean(grad, 0)
        # Keep in mind that the Variables are redundant because they are shared
        # across towers. So .. we will just return the first tower's pointer to
        # the Variable.
        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
        return average_grads
with tf.device('/cpu:0'):
    x = tf.placeholder(tf.float32, [None, 784], name='x')
    x_img = tf.reshape(x, [-1, 28, 28, 1])
    x_dict = {}
    x_dict['x0'], x_dict['x1'] = tf.split(x_img, 2)
    y_dict = {}
    y = tf.placeholder(tf.float32, [None, 10], name='y')
    y_dict['y0'], y_dict['y1'] = tf.split(y, 2)
    opt = tf.train.GradientDescentOptimizer(0.01)
    keep_prob = tf.placeholder(tf.float32)
    w0 = tf.get_variable('w0', initializer=tf.truncated_normal([5, 5, 1, 32], stddev=0.1))
    b0 = tf.get_variable('b0', initializer=tf.zeros([32]))
    w1 = tf.get_variable('w1', initializer=tf.truncated_normal([5, 5, 32, 64], stddev=0.1))
    b1 = tf.get_variable('b1', initializer=tf.zeros([64]))
    w2 = tf.get_variable('w2', initializer=tf.truncated_normal([7*7*64, 1024], stddev=0.1))
    b2 = tf.get_variable('b2', initializer=tf.zeros([1024]))
    w3 = tf.get_variable('w3', initializer=tf.truncated_normal([1024, 10], stddev=0.1))
    b3 = tf.get_variable('b3', initializer=tf.zeros([10]))
    grads = []

def conv2d(xx, W):
    return tf.nn.conv2d(xx, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(xx):
    return tf.nn.max_pool(xx, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

def model_forward(xx):
    h_conv1 = tf.nn.relu(conv2d(xx, w0) + b0)
    h_pool1 = max_pool_2x2(h_conv1)
    h_conv2 = tf.nn.relu(conv2d(h_pool1, w1) + b1)
    h_pool2 = max_pool_2x2(h_conv2)
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w2) + b2)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    y = tf.nn.sigmoid(tf.matmul(h_fc1_drop, w3) + b3)
    return y

for i in range(0, 2):
    with tf.device(('/gpu:{0}').format(i)):
        with tf.variable_scope(('scope_gpu_{0}').format(i)):
            yy = model_forward(x_dict[('x{0}').format(i)])
            cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_dict[('y{0}').format(i)] * tf.log(yy), reduction_indices=[1]))
            grads.append(opt.compute_gradients(cross_entropy, tf.trainable_variables()))

with tf.device('/cpu:0'):
    grad = average_gradients(grads)
    train_step = opt.apply_gradients(grad)
    yy = model_forward(x_dict['x0'])
    correct_prediction = tf.equal(tf.argmax(yy, 1), tf.argmax(y_dict['y0'], 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')

def main():
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
    with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter('C:\\tmp\\test\\', graph=tf.get_default_graph())
        t1_1 = datetime.datetime.now()
        for step in range(0, 10000):
            batch_x, batch_y = mnist.train.next_batch(100)
            sess.run(train_step, feed_dict={x: batch_x, y: batch_y, keep_prob: 0.5})
            if (step % 200) == 0:
                print(step, sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels, keep_prob: 1}))
        t2_1 = datetime.datetime.now()
        print("Computation time: " + str(t2_1 - t1_1))

if __name__ == "__main__":
    main()
The problems that I noticed:
Your cross-entropy loss is wrong (see this question for details; in short, you're computing a binary cross-entropy).
I dropped the manual gradient computation in favor of tf.train.AdamOptimizer.
I dropped the splitting of the input x (it's not the right way to do distributed computation in TensorFlow).
The resulting model easily gets to 99% accuracy even on one GPU.
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import datetime

x = tf.placeholder(tf.float32, [None, 784], name='x')
x_img = tf.reshape(x, [-1, 28, 28, 1])
y = tf.placeholder(tf.float32, [None, 10], name='y')
keep_prob = tf.placeholder(tf.float32)

stddev = 0.1
w0 = tf.get_variable('w0', initializer=tf.truncated_normal([5, 5, 1, 32], stddev=stddev))
b0 = tf.get_variable('b0', initializer=tf.zeros([32]))
w1 = tf.get_variable('w1', initializer=tf.truncated_normal([5, 5, 32, 64], stddev=stddev))
b1 = tf.get_variable('b1', initializer=tf.zeros([64]))
w2 = tf.get_variable('w2', initializer=tf.truncated_normal([7 * 7 * 64, 1024], stddev=stddev))
b2 = tf.get_variable('b2', initializer=tf.zeros([1024]))
w3 = tf.get_variable('w3', initializer=tf.truncated_normal([1024, 10], stddev=stddev))
b3 = tf.get_variable('b3', initializer=tf.zeros([10]))

def conv2d(xx, W):
    return tf.nn.conv2d(xx, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(xx):
    return tf.nn.max_pool(xx, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

def model_forward(xx):
    h_conv1 = tf.nn.relu(conv2d(xx, w0) + b0)
    h_pool1 = max_pool_2x2(h_conv1)
    h_conv2 = tf.nn.relu(conv2d(h_pool1, w1) + b1)
    h_pool2 = max_pool_2x2(h_conv2)
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w2) + b2)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    y = tf.matmul(h_fc1_drop, w3) + b3
    return y

yy = model_forward(x_img)
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=yy, labels=y))
train_step = tf.train.AdamOptimizer().minimize(loss)
correct_prediction = tf.equal(tf.argmax(yy, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')

def main():
    mnist = input_data.read_data_sets("/home/maxim/p/data/mnist-tf", one_hot=True)
    with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())
        t1_1 = datetime.datetime.now()
        for step in range(0, 10000):
            batch_x, batch_y = mnist.train.next_batch(100)
            sess.run(train_step, feed_dict={x: batch_x, y: batch_y, keep_prob: 0.5})
            if (step % 200) == 0:
                print(step, sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels, keep_prob: 1}))
        t2_1 = datetime.datetime.now()
        print("Computation time: " + str(t2_1 - t1_1))

if __name__ == "__main__":
    main()
Now, if you really want to, you can do data or model parallelism to make full use of your GPUs (there is a great post about it, but sometimes it doesn't render correctly due to hosting problems).
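Since the first bullet above calls the original loss a binary cross-entropy, one hedged alternative (not part of the code above) is the multi-class softmax loss computed on the same logits yy and one-hot labels y:

# Hypothetical variant: multi-class softmax cross-entropy on the raw logits.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=yy))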
Along with the points mentioned in the first two answers, take a look at return average_grads in the average_gradients function: it returns during the first iteration of the first for loop, so the averaged gradients only cover the first variable (probably w0). Hence only w0 gets updated, and you get very low accuracy because the rest of the variables keep their initial values (random or zeros).
This is because the model is not using the same weights and biases for inference on the CPU and on the other GPU devices.
For example:
for i in range(0, 2):
    with tf.device(('/gpu:{0}').format(i)):
        with tf.variable_scope(('scope_gpu_{0}').format(i)) as infer_scope:
            yy = model_forward(x_dict[('x{0}').format(i)])
            infer_scope.reuse_variables()
            cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_dict[('y{0}').format(i)] * tf.log(yy), reduction_indices=[1]))
            grads.append(opt.compute_gradients(cross_entropy, tf.trainable_variables()))
The reason you are getting low accuracy is that without calling reuse_variables(), every call to the model inference adds a new set of randomly initialized weights and biases to the graph, which is not what you want.
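For comparison, the usual TF 1.x multi-tower pattern shares one set of variables across GPUs by reusing the enclosing variable scope after the first tower is built. This is a hedged sketch built around the question's own model_forward, x_dict, y_dict and opt, not the original poster's code:

grads = []
with tf.variable_scope(tf.get_variable_scope()):
    for i in range(2):
        with tf.device('/gpu:{0}'.format(i)):
            yy = model_forward(x_dict['x{0}'.format(i)])
            cross_entropy = tf.reduce_mean(
                -tf.reduce_sum(y_dict['y{0}'.format(i)] * tf.log(yy), reduction_indices=[1]))
            grads.append(opt.compute_gradients(cross_entropy, tf.trainable_variables()))
            # After the first tower is built, reuse the same variables for the next one.
            tf.get_variable_scope().reuse_variables()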

CNN for cifar10 dataset in Tensorflow

I am trying to replicate results obtained by a convolutional neural network for CIFAR-10 using TensorFlow, but after some epochs (~60) my accuracy is still around 10%, so I do not know whether the CNN is being trained correctly.
This code is based on Deep MNIST for Experts (https://www.tensorflow.org/get_started/mnist/pros); however, on CIFAR-10 it does not work.
import numpy as np
import tensorflow as tf

def unpickle(file):
    import cPickle
    fo = open(file, 'rb')
    dict = cPickle.load(fo)
    fo.close()
    return dict

# unpacking training and test data
b1 = unpickle("~/cifar-10-batches-py/data_batch_1")
b2 = unpickle("~/cifar-10-batches-py/data_batch_2")
b3 = unpickle("~/cifar-10-batches-py/data_batch_3")
b4 = unpickle("~/cifar-10-batches-py/data_batch_4")
b5 = unpickle("~/cifar-10-batches-py/data_batch_5")
test = unpickle("~/cifar-10-batches-py/test_batch")

# Preparing test data
test_data = test['data']
test_label = test['labels']

# Preparing training data
train_data = np.concatenate([b1['data'], b2['data'], b3['data'], b4['data'], b5['data']], axis=0)
train_label = np.concatenate([b1['labels'], b2['labels'], b3['labels'], b4['labels'], b5['labels']], axis=0)

# Reshaping data
train_data = np.reshape(train_data, [50000, 32, 32, 3])
test_data = np.reshape(test_data, [10000, 32, 32, 3])

batch_size = 100
image_width = 32
image_height = 32
channels = 3

# Constructing Graph
x = tf.placeholder(tf.float32, [None, image_width, image_height, channels])  # Training Data
y = tf.placeholder(tf.int32, [None])
one_hot = tf.one_hot(y, depth=10)  # Converting to one-hot vectors

# Constructing CNN Layers
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

# Given an input tensor of shape [batch, in_height, in_width, in_channels] and a filter / kernel
# tensor of shape [filter_height, filter_width, in_channels, out_channels],
# taken from: http://textminingonline.com/dive-into-tensorflow-part-v-deep-mnist
W_conv1 = weight_variable([7, 7, 3, 32])
b_conv1 = bias_variable([32])
h_conv1 = tf.nn.relu(conv2d(x, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

W_conv2 = weight_variable([5, 5, 32, 32])
b_conv2 = bias_variable([32])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

W_conv3 = weight_variable([5, 5, 32, 64])
b_conv3 = bias_variable([64])
h_conv3 = tf.nn.relu(conv2d(h_pool2, W_conv3) + b_conv3)

# Constructing MLP layers
W_fc1 = weight_variable([8 * 8 * 64, 64])
b_fc1 = bias_variable([64])
h_pool3_flat = tf.reshape(h_conv3, [-1, 8*8*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool3_flat, W_fc1) + b_fc1)

W_fc2 = weight_variable([64, 10])
b_fc2 = bias_variable([10])
y_conv = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2)

# Computing cost function
cross_entropy = -tf.reduce_sum(one_hot * tf.log(tf.clip_by_value(y_conv, 1e-10, 1e20)))
train_step = tf.train.MomentumOptimizer(learning_rate=0.0001, momentum=0.9).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(one_hot, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

init = tf.initialize_all_variables()
sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=16))
sess.run(init)

epochs = 100
b_per = 0
row = []
for e in range(epochs):
    print("epoch", e)
    avg_cost = 0
    # for each batch
    for j in range(int(train_data.shape[0] / batch_size)):
        subset = range((j * batch_size), ((j + 1) * batch_size))
        data = train_data[subset, :, :, :]
        label = train_label[subset]
        _, c = sess.run([train_step, cross_entropy], feed_dict={x: data, y: label})
        avg_cost += c / data.shape[0]
        # print(avg_cost)
        b_per = b_per + 1
        if b_per % 10 == 0:
            row.append(sess.run(accuracy, feed_dict={x: test_data, y: test_label}))
            print(row[-1])
The data reshaping part is wrong. It should be:
# Reshaping data
train_data = train_data.reshape(50000, 3, 32, 32).transpose(
    0, 2, 3, 1).astype("uint8")
test_data = test_data.reshape(10000, 3, 32, 32).transpose(
    0, 2, 3, 1).astype("uint8")
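For context, a hedged sketch not from the original answer: each CIFAR-10 record is 3072 values laid out channel-first (1024 red, then 1024 green, then 1024 blue), so the array has to be reshaped to (N, 3, 32, 32) and then transposed to (N, 32, 32, 3); reshaping straight to (N, 32, 32, 3) scrambles the channels.

import numpy as np

row = np.arange(3072)                               # one CIFAR-10 record: R block, G block, B block
wrong = row.reshape(32, 32, 3)                      # mixes values from different channels into one pixel
right = row.reshape(3, 32, 32).transpose(1, 2, 0)   # (height, width, channel)
print(wrong[0, 0], right[0, 0])                     # right[0, 0] is [0, 1024, 2048]: one pixel, three channels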

Why does CNN with constant initialization learn at all?

Usually, weights for neural networks are initialized randomly so that they receive different gradients and learn different weights. In theory, if all weights are initialized the same way, all nodes will have the same weights no matter how long you train. Thus the training shouldn't work at all.
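That symmetry argument can be made concrete with a small NumPy sketch (illustrative only, unrelated to the TensorFlow code below): with a constant initialization, every hidden unit receives the same gradient, so the units stay identical after every update.

import numpy as np

X = np.random.rand(4, 3)
y = np.random.rand(4, 1)
W1 = np.full((3, 5), 0.5)   # every hidden unit starts with identical weights
W2 = np.full((5, 1), 0.5)

for _ in range(100):
    h = np.tanh(X @ W1)                  # all hidden columns are identical
    pred = h @ W2
    grad_pred = 2 * (pred - y) / len(X)  # MSE gradient
    grad_W2 = h.T @ grad_pred
    grad_W1 = X.T @ ((grad_pred @ W2.T) * (1 - h ** 2))
    W2 -= 0.1 * grad_W2
    W1 -= 0.1 * grad_W1

# After training, every hidden unit still has exactly the same weights.
print(np.allclose(W1, W1[:, [0]]), np.allclose(W2, W2[0, 0]))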
However, the code below gives 56% accuracy on MNIST after 7000 epochs. Why is that the case?
Code
#!/usr/bin/env python
"""MNIST with Tensorflow."""
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import os
import numpy as np

epochs = 20000
model_checkpoint_path = 'checkpoints/mnist_tf_model.ckpt'

def weight_variable(shape):
    #initial = tf.truncated_normal(shape, stddev=0.01)
    initial = tf.constant(0.0, shape=shape)
    return tf.get_variable(initializer=initial, name='weights')

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.get_variable(initializer=initial, name='biases')

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
                          strides=[1, 2, 2, 1], padding='SAME')

def eval_network(sess, summary_writer, dataset, correct_prediction, epoch):
    correct_sum = 0
    total_test = 0
    training_summary = tf.get_default_graph().get_tensor_by_name("training_accuracy:0")
    loss_summary = tf.get_default_graph().get_tensor_by_name("loss:0")
    for i in range(dataset.labels.shape[0] / 1000):
        feed_dict = {x: dataset.images[i * 1000:(i + 1) * 1000],
                     y_: dataset.labels[i * 1000:(i + 1) * 1000]}
        [test_correct, train_summ, loss_summ] = sess.run([correct_prediction,
                                                          training_summary,
                                                          loss_summary],
                                                         feed_dict=feed_dict)
        summary_writer.add_summary(train_summ, epoch)
        summary_writer.add_summary(loss_summ, epoch)
        test_correct = correct_prediction.eval(feed_dict=feed_dict)
        correct_sum += sum(test_correct)
        total_test += len(test_correct)
    return float(correct_sum) / total_test

def log_score(sess, summary_writer, filename, mnist, scoring, epoch):
    with open(filename, "a") as myfile:
        train = eval_network(sess, summary_writer, mnist.train, scoring, epoch)
        test = eval_network(sess, summary_writer, mnist.test, scoring, epoch)
        myfile.write("%i;%0.6f;%0.6f\n" % (epoch, train, test))

mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

with tf.Session() as sess:
    x = tf.placeholder(tf.float32, shape=[None, 784])
    y_ = tf.placeholder(tf.float32, shape=[None, 10])
    x_image = tf.reshape(x, [-1, 28, 28, 1])

    with tf.variable_scope('conv1') as scope:
        W_conv1 = weight_variable([5, 5, 1, 32])
        b_conv1 = bias_variable([32])
        h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1, name='ReLU1')
        h_pool1 = max_pool_2x2(h_conv1)
    with tf.variable_scope('conv2') as scope:
        W_conv2 = weight_variable([5, 5, 32, 64])
        b_conv2 = bias_variable([64])
        h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2, name='ReLU2')
        h_pool2 = max_pool_2x2(h_conv2)
    with tf.variable_scope('fc1'):
        W_fc1 = weight_variable([7 * 7 * 64, 1024])
        b_fc1 = bias_variable([1024])
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
    with tf.variable_scope('softmax'):
        W_fc2 = weight_variable([1024, 10])
        b_fc2 = bias_variable([10])
        y_conv = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2)

    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv * 10**-7),
                                                  reduction_indices=[1]))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.scalar_summary("training_accuracy", accuracy, name="training_accuracy")
    tf.scalar_summary("loss", cross_entropy, name="loss")
    summary_writer = tf.train.SummaryWriter('summary_dir', sess.graph)

    sess.run(tf.initialize_all_variables())
    for i in range(epochs):
        batch = mnist.train.next_batch(50)
        if i % 100 == 0:
            log_score(sess, summary_writer,
                      'validation-curve-accuracy.csv',
                      mnist, correct_prediction, i)
        train_step.run(feed_dict={x: batch[0],
                                  y_: batch[1]})

    log_score(sess, summary_writer, 'validation-curve-accuracy.csv',
              mnist, correct_prediction, epochs)
Plots
Plot 1: after adding 10**-7 to the tf.log(..) term, the NaNs are gone.
Plot 2: an old plot which had a problem due to log(0) after 16k epochs.
(Loss plot: the triangles are NaNs.)
(Accuracy plot: due to the smoothing, it does not fall directly to ~10%.)
