Why aren't my weights being updated?

Why aren't my weights being updated? - machine-learning

I'm trying this very simple neural net which tells if a number is odd or even.
labels: [1, 0] means it's even. I'm using two output neuron because I'm using softmax function.
My code:
import tensorflow as tf
data_in = [
[1],
[2],
[3]
]
data_lbl = [
[0, 1],
[1, 0],
[0, 1]
]
# HP
learning_rate = 0.1
epochs = 10000
ip = tf.placeholder('float', [None, 1])
labels = tf.placeholder('float', [None, 2])
w1 = tf.Variable(tf.random_normal([1, 2]))
w2 = tf.Variable(tf.random_normal([2, 2]))
l1 = tf.matmul(ip, w1)
l2 = tf.matmul(l1, w2)
l2 = tf.nn.softmax(l2)
loss = tf.reduce_mean((labels - l2)**2)
train = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for epoch in range(epochs):
_, err = sess.run([train, loss], feed_dict={ip: data_in, labels: data_lbl})
print(err)
print(sess.run(l2, feed_dict={ip: [[2], [5], [7]]}))
# [it is, it's not]
# 1 = even
sess.close()
My error is not changing and I'm getting wrong answers. Suggestions?

You have multiple issues here, fixing those should at least give you something that learns something:
You don't have any nonlinearities in your network other than the final softmax. You need nonlinearities, as parity is not a linear function.
Your intermediate layers are quite small.
Your training samples are very limited.
You don't have biases.
In addition, parity is a concept that is very hard to learn so it generalizes to numbers not seen in the training set.

Related

Tensorflow multi-GPU MNIST classifier: low accuracy

I am stuck with multiple GPU MNIST classifier in Tensorflow. Code runs without errors, but accuracy is very poor (30%). I am new to Tensorflow so I do not know where is the problem ? GPU: 2x GTX 1080 Ti.
I have found several tutorials for multiple GPU, but code is hard to follow. For this reason I am trying to develop MNIST CNN classifier from scratch.
from __future__ import print_function
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import datetime
def average_gradients(tower_grads):
average_grads = []
for grad_and_vars in zip(*tower_grads):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
grads = []
for g, _ in grad_and_vars:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(axis=0, values=grads)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
# across towers. So .. we will just return the first tower's pointer to
# the Variable.
v = grad_and_vars[0][1]
grad_and_var = (grad, v)
average_grads.append(grad_and_var)
return average_grads
with tf.device('/cpu:0'):
x = tf.placeholder(tf.float32, [None, 784], name='x')
x_img=tf.reshape(x, [-1, 28, 28, 1])
x_dict={}
x_dict['x0'],x_dict['x1'] = tf.split(x_img,2)
y_dict={}
y = tf.placeholder(tf.float32, [None, 10], name='y')
y_dict['y0'],y_dict['y1'] = tf.split(y,2)
opt=tf.train.GradientDescentOptimizer(0.01)
keep_prob = tf.placeholder(tf.float32)
w0=tf.get_variable('w0',initializer=tf.truncated_normal([5, 5,1,32], stddev=0.1))
b0=tf.get_variable('b0',initializer=tf.zeros([32]))
w1=tf.get_variable('w1',initializer=tf.truncated_normal([5,5,32,64], stddev=0.1))
b1=tf.get_variable('b1',initializer=tf.zeros([64]))
w2=tf.get_variable('w2',initializer=tf.truncated_normal([7*7*64,1024], stddev=0.1))
b2=tf.get_variable('b2',initializer=tf.zeros([1024]))
w3=tf.get_variable('w3',initializer=tf.truncated_normal([1024,10], stddev=0.1))
b3=tf.get_variable('b3',initializer=tf.zeros([10]))
grads=[]
def conv2d(xx, W):
return tf.nn.conv2d(xx, W, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2x2(xx):
return tf.nn.max_pool(xx, ksize=[1, 2, 2, 1],strides=[1, 2, 2, 1], padding='SAME')
def model_forward(xx):
h_conv1=tf.nn.relu(conv2d(xx,w0)+b0);
h_pool1=max_pool_2x2(h_conv1)
h_conv2=tf.nn.relu(conv2d(h_pool1,w1)+b1);
h_pool2=max_pool_2x2(h_conv2)
h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat,w2)+b2)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
y = tf.nn.sigmoid(tf.matmul(h_fc1_drop,w3)+b3)
return y
for i in range(0,2):
with tf.device(('/gpu:{0}').format(i)):
with tf.variable_scope(('scope_gpu_{0}').format(i)):
yy=model_forward(x_dict[('x{0}').format(i)])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_dict[('y{0}').format(i)] * tf.log(yy), reduction_indices=[1]))
grads.append(opt.compute_gradients(cross_entropy,tf.trainable_variables()))
with tf.device('/cpu:0'):
grad = average_gradients(grads)
train_step = opt.apply_gradients(grad)
yy=model_forward(x_dict['x0'])
correct_prediction = tf.equal(tf.argmax(yy, 1), tf.argmax(y_dict['y0'], 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')
def main():
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
sess.run(tf.global_variables_initializer())
writer = tf.summary.FileWriter('C:\\tmp\\test\\', graph=tf.get_default_graph())
t1_1 = datetime.datetime.now()
for step in range(0,10000):
batch_x, batch_y = mnist.train.next_batch(100)
sess.run(train_step, feed_dict={x: batch_x, y: batch_y, keep_prob: 0.5})
if (step % 200) == 0:
print(step, sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels, keep_prob: 1}))
t2_1 = datetime.datetime.now()
print("Computation time: " + str(t2_1-t1_1))
if __name__ == "__main__":
main()

The problems that I noticed:
Your cross-entropy loss is wrong (see this question for details, in short you're computing binary cross-entropy).
I dropped manual gradient computation in favor of tf.train.AdamOptimizer.
I dropped the split of the input of x (it's not the right way to do distributed computation in tensorflow).
The result model easily gets to 99% accuracy even on one GPU.
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import datetime
x = tf.placeholder(tf.float32, [None, 784], name='x')
x_img = tf.reshape(x, [-1, 28, 28, 1])
y = tf.placeholder(tf.float32, [None, 10], name='y')
keep_prob = tf.placeholder(tf.float32)
stddev = 0.1
w0 = tf.get_variable('w0', initializer=tf.truncated_normal([5, 5, 1, 32], stddev=stddev))
b0 = tf.get_variable('b0', initializer=tf.zeros([32]))
w1 = tf.get_variable('w1', initializer=tf.truncated_normal([5, 5, 32, 64], stddev=stddev))
b1 = tf.get_variable('b1', initializer=tf.zeros([64]))
w2 = tf.get_variable('w2', initializer=tf.truncated_normal([7 * 7 * 64, 1024], stddev=stddev))
b2 = tf.get_variable('b2', initializer=tf.zeros([1024]))
w3 = tf.get_variable('w3', initializer=tf.truncated_normal([1024, 10], stddev=stddev))
b3 = tf.get_variable('b3', initializer=tf.zeros([10]))
def conv2d(xx, W):
return tf.nn.conv2d(xx, W, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2x2(xx):
return tf.nn.max_pool(xx, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
def model_forward(xx):
h_conv1 = tf.nn.relu(conv2d(xx, w0) + b0)
h_pool1 = max_pool_2x2(h_conv1)
h_conv2 = tf.nn.relu(conv2d(h_pool1, w1) + b1)
h_pool2 = max_pool_2x2(h_conv2)
h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w2) + b2)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
y = tf.matmul(h_fc1_drop, w3) + b3
return y
yy = model_forward(x_img)
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=yy, labels=y))
train_step = tf.train.AdamOptimizer().minimize(loss)
correct_prediction = tf.equal(tf.argmax(yy, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')
def main():
mnist = input_data.read_data_sets("/home/maxim/p/data/mnist-tf", one_hot=True)
with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
sess.run(tf.global_variables_initializer())
t1_1 = datetime.datetime.now()
for step in range(0, 10000):
batch_x, batch_y = mnist.train.next_batch(100)
sess.run(train_step, feed_dict={x: batch_x, y: batch_y, keep_prob: 0.5})
if (step % 200) == 0:
print(step, sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels, keep_prob: 1}))
t2_1 = datetime.datetime.now()
print("Computation time: " + str(t2_1 - t1_1))
if __name__ == "__main__":
main()
Now, if you really want it, you can do data or model parallelism to utilize your GPU power (there is a great post about it, but sometimes it doesn't render correctly due to hosting problems).

Along with the points mentioned in the first two answers, take a look at return average_grads in average_gradients function, it's returning from the 1st iteration of the first for loop, meaning the gradients will only apply to the first variable (probably w0). Hence only w0 is getting updated and so you are getting a very low accuracy since the rest of the variables stay to their original values (either random/zeros).

This is because the model is not using the same weights & biases for inference on CPU as well as on the other GPU devices.
For example:
for i in range(0,2):
with tf.device(('/gpu:{0}').format(i)):
with tf.variable_scope(('scope_gpu_{0}').format(i)) as infer_scope:
yy=model_forward(x_dict[('x{0}').format(i)])
infer_scope.reuse_variables()
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_dict[('y{0}').format(i)] * tf.log(yy), reduction_indices=[1]))
grads.append(opt.compute_gradients(cross_entropy,tf.trainable_variables()))
The reason you are getting low accuracy is that without specifying reuse_variables() and you try to call the model inference inside each epoch, the graph would create a new model with random weights & biases initialization, which is not what you favored.

Can't get higher accuracy than 50%~ on CIFAR10 dataset

I am currently trying to develop a CNN in TensorFlow for th Cifar10 dataset.
So far, I found the best setting for my CNN to be:
Conv1,patch 3x3,32 output
Max pooling 2x2
Conv2,patch 3x3,32 output
max pooling 2x2
Conv3, patch 3x3, 64 output
max pooling 2x2
Flat to array
Fully connected 1024 outputs
Softmax
Here is the code version:
W_conv1 = weight_variable([3, 3, 3, 32])
b_conv1 = bias_variable([32])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)
W_conv2 = weight_variable([3, 3, 32, 32])
b_conv2 = bias_variable([32])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)
W_conv3 = weight_variable([3, 3, 32, 64])
b_conv3 = bias_variable([64])
h_conv3 = tf.nn.relu(conv2d(h_pool2, W_conv3) + b_conv3)
h_pool3 = max_pool_2x2(h_conv3)
h_flat1 = tf.reshape(h_pool3, [-1, 4 * 4 * 64])
W_fc1 = weight_variable([4 * 4 * 64, 1024])
b_fc1 = bias_variable([1024])
h_fc1 = tf.nn.relu(tf.matmul(h_flat1, W_fc1) + b_fc1)
W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])
h_fc2 = tf.matmul(h_fc1, W_fc2) + b_fc2
I use RMSProp to minimize the cross entropy with a learning rate of 1e-4, batch size of 128, 500 epochs.I also normalized the images by subtracting the mean value, and shuffle the data at every epoch
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
train_step = tf.train.RMSPropOptimizer(1e-4).minimize(cross_entropy)
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
sess.run(tf.global_variables_initializer())
i=0
j=0
offset=128
loop=int(50000/offset)
for k in range(500):
i=0
j=0
datatotal,datalabel=unison_shuffled_copies(datatotal,datalabel)
for j in range(loop):
batch_xs = datatotal[i:i+offset]
batch_ys = datalabel[i:i+offset]
i=i+offset
train_step.run(feed_dict={x: batch_xs, y_: batch_ys})
print(k,"test accuracy %g"%accuracy.eval(feed_dict={x: testdata, y_: testlabels}))
I have tried many different optimizers ( adam, adadelta, SGD, RMS ) and tried adding or removing various layers, including FC, dropout, and convolutional.
One of my best result is with the previous setup, which at about the 20th epoch gets stuck at 42% accuracy on the test data.
I even tried implementing the very same network found on this website: http://nghiaho.com/?p=1913 without having the same results.
Also tried implementing other networks from other websites, with also poor results.
How can I improve my results?
EDIT: Changed max pooling to average pooling,got 55% accuracy on 25th epoch. Better, but not quite right.

Tensorflow: Visualizing trained weights for linear classifier on MNIST dataset

I have trained a linear classifier on the MNIST dataset with 92% accuracy. Then I fixed the weights and optimized the input image such that softmax probability for 8 was maximized. But the softmax loss doesn't decrease below 2.302 (-log(1/10)) which means that my training has been useless. What am I doing wrong?
Code for training the weights:
import tensorflow as tf
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
trX, trY, teX, teY = mnist.train.images, mnist.train.labels,
mnist.test.images, mnist.test.labels
X = tf.placeholder("float", [None, 784])
Y = tf.placeholder("float", [None, 10])
w = tf.Variable(tf.random_normal([784, 10], stddev=0.01))
b = tf.Variable(tf.zeros([10]))
o = tf.nn.sigmoid(tf.matmul(X, w)+b)
cost= tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=o, labels=Y))
train_op = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cost)
predict_op = tf.argmax(o, 1)
sess=tf.Session()
sess.run(tf.global_variables_initializer())
for i in range(100):
for start, end in zip(range(0, len(trX), 256), range(256, len(trX)+1, 256)):
sess.run(train_op, feed_dict={X: trX[start:end], Y: trY[start:end]})
print(i, np.mean(np.argmax(teY, axis=1) == sess.run(predict_op, feed_dict={X: teX})))
Code for training the image for fixed weights:
#Copy trained weights into W,B and pass them as placeholders to new model
W=sess.run(w)
B=sess.run(b)
X=tf.Variable(tf.random_normal([1, 784], stddev=0.01))
Y=tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 1, 0])
w=tf.placeholder("float")
b=tf.placeholder("float")
o = tf.nn.sigmoid(tf.matmul(X, w)+b)
cost= tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=o, labels=Y))
train_op = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cost)
predict_op = tf.argmax(o, 1)
sess.run(tf.global_variables_initializer())
for i in range(1000):
sess.run(train_op, feed_dict={w:W, b:B})
if i%50==0:
sess.run(cost, feed_dict={w:W, b:B})
print(i, sess.run(predict_op, feed_dict={w:W, b:B}))

You shouldn't call tf.sigmoid on the output of your net. softmax_cross_entropy_with_logits assumes your inputs are logits, i.e. unconstrained real numbers. Using
o = tf.matmul(X, w)+b
increases your accuracy to 92.8%.
With this modification, your second training works. The cost reaches 0 although the resulting image is anything but appealing.

Tensorflow shuffle batch fraction unexpected behavior

I am training a convolutional neural network and I got some unexpected behavior with the shuffle_batch fraction summary, or maybe I just do not understand it. Can someone pls explain it? The difference between those two graphs is that I exchanged the loss function.
With this loss function I get the line at 0.0
loss = tf.nn.l2_loss(expected_labels-labels)
While this one gives me a constant 1.0 (after hitting 1.0 the first time)
loss = tf.reduce_mean(tf.square(expected_labels - labels))
Can the change of loss function really cause that change? I am not sure what this means.
EDIT: Code as requested
The first part is for setting up the batching and the big picture.
filename_queue = tf.train.string_input_producer(filenames,
num_epochs=None)
label, image = read_and_decode_single_example(filename_queue=filename_queue)
image = tf.image.decode_jpeg(image.values[0], channels=3)
jpeg = tf.cast(image, tf.float32) / 255.
jpeg.set_shape([66,200,3])
images_batch, labels_batch = tf.train.shuffle_batch(
[jpeg, label], batch_size= FLAGS.batch_size,
num_threads=8,
capacity=60000,
min_after_dequeue=10000)
images_placeholder, labels_placeholder = placeholder_inputs(
FLAGS.batch_size)
label_estimations, W1_conv, h1_conv, current_images = e2e.inference(images_placeholder)
# Add to the Graph the Ops for loss calculation.
loss = e2e.loss(label_estimations, labels_placeholder)
# Decay once per epoch, using an exponential schedule starting at 0.01.
# Add to the Graph the Ops that calculate and apply gradients.
train_op = e2e.training(loss, FLAGS.learning_rate, FLAGS.batch_size)
Here come the methods for inference loss and train
def inference(images):
with tf.name_scope('conv1'):
W_conv1 = tf.Variable(tf.truncated_normal([5, 5, 3, FEATURE_MAPS_C1], stddev=STDDEV))
b_conv1 = tf.Variable(tf.constant(BIAS_INIT, shape=[FEATURE_MAPS_C1]))
h_conv1 = tf.nn.bias_add(
tf.nn.conv2d(images, W_conv1, strides=[1, 2, 2, 1], padding='VALID'), b_conv1)
with tf.name_scope('conv2'):
W_conv2 = tf.Variable(tf.truncated_normal([5, 5, FEATURE_MAPS_C1, 36], stddev=STDDEV))
b_conv2 = tf.Variable(tf.constant(BIAS_INIT, shape=[36]))
h_conv2 = tf.nn.conv2d(h_conv1, W_conv2, strides=[1, 2, 2, 1], padding='VALID') + b_conv2
with tf.name_scope('conv3'):
W_conv3 = tf.Variable(tf.truncated_normal([5, 5, 36, 48], stddev=STDDEV))
b_conv3 = tf.Variable(tf.constant(BIAS_INIT, shape=[48]))
h_conv3 = tf.nn.conv2d(h_conv2, W_conv3, strides=[1, 2, 2, 1], padding='VALID') + b_conv3
with tf.name_scope('conv4'):
W_conv4 = tf.Variable(tf.truncated_normal([3, 3, 48, 64], stddev=STDDEV))
b_conv4 = tf.Variable(tf.constant(BIAS_INIT, shape=[64]))
h_conv4 = tf.nn.conv2d(h_conv3, W_conv4, strides=[1, 1, 1, 1], padding='VALID') + b_conv4
with tf.name_scope('conv5'):
W_conv5 = tf.Variable(tf.truncated_normal([3, 3, 64, 64], stddev=STDDEV))
b_conv5 = tf.Variable(tf.constant(BIAS_INIT, shape=[64]))
h_conv5 = tf.nn.conv2d(h_conv4, W_conv5, strides=[1, 1, 1, 1], padding='VALID') + b_conv5
h_conv5_flat = tf.reshape(h_conv5, [-1, 1 * 18 * 64])
with tf.name_scope('fc1'):
W_fc1 = tf.Variable(tf.truncated_normal([1 * 18 * 64, 100], stddev=STDDEV))
b_fc1 = tf.Variable(tf.constant(BIAS_INIT, shape=[100]))
h_fc1 = tf.matmul(h_conv5_flat, W_fc1) + b_fc1
with tf.name_scope('fc2'):
W_fc2 = tf.Variable(tf.truncated_normal([100, 50], stddev=STDDEV))
b_fc2 = tf.Variable(tf.constant(BIAS_INIT, shape=[50]))
h_fc2 = tf.matmul(h_fc1, W_fc2) + b_fc2
with tf.name_scope('fc3'):
W_fc3 = tf.Variable(tf.truncated_normal([50, 10], stddev=STDDEV))
b_fc3 = tf.Variable(tf.constant(BIAS_INIT, shape=[10]))
h_fc3 = tf.matmul(h_fc2, W_fc3) + b_fc3
with tf.name_scope('fc4'):
W_fc4 = tf.Variable(tf.truncated_normal([10, 1], stddev=STDDEV))
b_fc4 = tf.Variable(tf.constant(BIAS_INIT, shape=[1]))
h_fc4 = tf.matmul(h_fc3, W_fc4) + b_fc4
return h_fc4
Here is the loss function, using l2 causes the issue.
def loss(label_estimations, labels):
n_labels = tf.reshape(label_estimations, [-1])
# Here are the two loss functions
#loss = tf.reduce_mean(tf.square(n_labels - labels))
loss = tf.nn.l2_loss(n_labels-labels)
return loss
Train method:
def training(loss, learning_rate, batch_size):
global_step = tf.Variable(0, name='global_step', trainable=False)
tf.scalar_summary('learning_rate',learning_rate)
tf.scalar_summary('Loss ('+loss.op.name+')', loss)
optimizer = tf.train.AdamOptimizer(learning_rate)
train_op = optimizer.minimize(loss, global_step=global_step)
return train_op
Plot for tf.reduce_sum(tf.square(n_labels - labels)/2)

As mentioned in TensorFlow's original guide https://www.tensorflow.org/programmers_guide/reading_data
How many threads do you need? the tf.train.shuffle_batch* functions add a summary to the graph that indicates how full the example queue is. If you have enough reading threads, that summary will stay above zero. You can view your summaries as training progresses using TensorBoard.
It seems better if the queue is never empty, i.e. the "fraction_full" stays non-zero. If not, you should allocate more threads to queue_runner

The only difference between your loss and l2 is scaling, thus you might need to play around with your learning rate / other hyperparameters to take this into account.
l2 loss in TF is defined as:
1/2 SUM_i^N (pred(x_i) - y_i)^2
while your cost is
1/N SUM_i^N (pred(x_i) - y_i)^2
Of course since you are using stochastic gradient approach, efficienty you are using an approximator of form
1/2 SUM_{(x_i, y_i) in batch} (pred(x_i) - y_i)^2 # l2
1/#batch SUM_{(x_i, y_i) in batch} (pred(x_i) - y_i)^2 # you
Thus you would have to multiply your cost by batch_size / 2 to get the original cost. Typically this is not a problem, but sometimes wrong scaling can put you in very degenerated parts of the error surface, and the optimizer will simply fail (especially such aggressive one like Adam).
Side note - you are aware that your model is a deep linear model? You do not have any non-linearities in the model. This is very specific network.

Cost-sensitive learning in Tensorflow

I am trying to set up a cost-sensitive binary classification learning in TensorFlow, which would put different penalties on false positives and false negatives. Does anyone know how to create a loss function from a set of penalty weights $(w_1, w_2, w_3, w_4)$ for (true positive, false positive, false negative, true negative).
I went over the standard cost functions offered, but can't figure out how to combine them to get something similar to the above.

Following #Cauchyzhou's answer, if you have the logits, and the sparse labels as well as a cost_matrix whose shape is [L, L], where L is the number of unique labels, you can simply use the function below to calculate the loss
def sparse_cost_sensitive_loss (logits, labels, cost_matrix):
batch_cost_matrix = tf.nn.embedding_lookup(cost_matrix, labels)
eps = 1e-6
probability = tf.clip_by_value(tf.nn.softmax(logits), eps, 1-eps)
cost_values = tf.log(1-probability)*batch_cost_matrix
loss = tf.reduce_mean(-tf.reduce_sum(cost_values, axis=1))
return loss

I am not aware of anyone who has built a cost sensitive neural network classifier but Alejandro Correa Bahnsen has published academic papers for cost sensitive logistic regression and cost sensitive decision trees and a very well documented python cost sensitive classification library named CostCla. CostCla is pretty easy to use if you are familiar with scikit-learn.
You should be able to use the Bayes minimum risk model in the library to minimize the cost of your neural network since it fits a cost model to output prediction probabilities of any classifier.
Note that CostCla is intended to work with potentially different costs for each sample. You give it a cost matrix for your training and test samples. However, you can just make all the rows in the cost matrix the same if that applies to your problem.
Here are a couple of additional academic papers on the subject:
The Foundations of Cost-Sensitive Learning
Optimal ROC Curve for a Combination of Classifiers

cost_matrix:
[[0,1,100],
[1,0,1],
[1,20,0]]
label:
[1,2]
y*:
[[0,1,0],
[0,0,1]]
y(prediction):
[[0.2,0.3,0.5],
[0.1,0.2,0.7]]
label,cost_matrix-->cost_embedding:
[[1,0,1],
[1,20,0]]
It obvious 0.3 in [0.2,0.3,0.5] refers to right lable probility of [0,1,0], so it should not contibute to loss.
0.7 in [0.1,0.2,0.7] is the same. In other words, the pos with value 1 in y* not contibute to loss.
So I have (1-y*):
[[1,0,1],
[1,1,0]]
Then the entropy is target*log(predict) + (1-target) * log(1-predict),and value 0 in y*,should use (1-target)*log(1-predict), so I use (1-predict) said (1-y)
1-y:
[[0.8,*0.7*,0.5],
[0.9,0.8,*0.3*]]
(italic num is useless)
the custom loss is
[[1,0,1], [1,20,0]] * log([[0.8,0.7,0.5],[0.9,0.8,0.3]]) *
[[1,0,1],[1,1,0]]
and you can see the (1-y*) can be drop here
so the loss is -tf.reduce_mean(cost_embedding*log(1-y))
,to make it applicable , should be:
-tf.reduce_mean(cost_embedding*log(tf.clip((1-y),1e-10)))
the demo is below
import tensorflow as tf
import numpy as np
hidden_units = 50
num_class = 3
class Model():
def __init__(self,name_scope,is_custom):
self.name_scope = name_scope
self.is_custom = is_custom
self.input_x = tf.placeholder(tf.float32,[None,hidden_units])
self.input_y = tf.placeholder(tf.int32,[None])
self.instantiate_weights()
self.logits = self.inference()
self.predictions = tf.argmax(self.logits,axis=1)
self.losses,self.train_op = self.opitmizer()
def instantiate_weights(self):
with tf.variable_scope(self.name_scope + 'FC'):
self.W = tf.get_variable('W',[hidden_units,num_class])
self.b = tf.get_variable('b',[num_class])
self.cost_matrix = tf.constant(
np.array([[0,1,100],[1,0,100],[20,5,0]]),
dtype = tf.float32
)
def inference(self):
return tf.matmul(self.input_x,self.W) + self.b
def opitmizer(self):
if not self.is_custom:
loss = tf.nn.sparse_softmax_cross_entropy_with_logits\
(labels=self.input_y,logits=self.logits)
else:
batch_cost_matrix = tf.nn.embedding_lookup(
self.cost_matrix,self.input_y
)
loss = - tf.log(1 - tf.nn.softmax(self.logits))\
* batch_cost_matrix
train_op = tf.train.AdamOptimizer().minimize(loss)
return loss,train_op
import random
batch_size = 128
norm_model = Model('norm',False)
custom_model = Model('cost',True)
split_point = int(0.9 * dataset_size)
train_set = datasets[:split_point]
test_set = datasets[split_point:]
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(100):
batch_index = random.sample(range(split_point),batch_size)
train_batch = train_set[batch_index]
train_labels = lables[batch_index]
_,eval_predict,eval_loss = sess.run([norm_model.train_op,
norm_model.predictions,norm_model.losses],
feed_dict={
norm_model.input_x:train_batch,
norm_model.input_y:train_labels
})
_,eval_predict1,eval_loss1 = sess.run([custom_model.train_op,
custom_model.predictions,custom_model.losses],
feed_dict={
custom_model.input_x:train_batch,
custom_model.input_y:train_labels
})
# print '默认',eval_predict,'\n自定义',eval_predict1
print np.sum(((eval_predict == train_labels)==True).astype(np.int)),\
np.sum(((eval_predict1 == train_labels)==True).astype(np.int))
if i%10 == 0:
print '默认测试',sess.run(norm_model.predictions,
feed_dict={
norm_model.input_x:test_set,
norm_model.input_y:lables[split_point:]
})
print '自定义测试',sess.run(custom_model.predictions,
feed_dict={
custom_model.input_x:test_set,
custom_model.input_y:lables[split_point:]
})

Here is other solution where you can use any tensorflow loss and make it cost sensitive using kwarg weights ... note that unlike most cases here you need to use cost as '1' instead of '0' when you want to keep loss as it is ...
Some advantages of this approach are:
it extends tf.losses.Loss and satisfies the call api
reduction kwarg of the original loss remains functional and the behaviour is propagated to CostSensitiveLoss
you can also pass your own extra weights to new loss instances. Note that internally generated weights are used by wrapped self.loss
import numpy as np
from keras.api._v2 import keras as tk
import tensorflow as tf
from keras.utils import losses_utils
import typing as t
class CostSensitiveLoss(tk.losses.Loss):
def __init__(
self,
cost_matrix: t.List, loss: tk.losses.Loss,
):
super().__init__(reduction=loss.reduction, name=loss.name)
self.loss = loss
self.cost_matrix = cost_matrix
self._cost_matrix = tf.constant(cost_matrix, dtype=tf.float32)
#classmethod
def from_config(cls, config):
config['loss'] = tk.losses.deserialize(config['loss'])
return cls(**config)
def get_config(self):
return {
'cost_matrix': self.cost_matrix,
'loss': tk.losses.serialize(self.loss),
'reduction': self.reduction, 'name': self.name
}
def call(self, y_true, y_pred):
# if y_true is one hot encoded then get integer indices
if y_true.ndim == 1:
y_true_index = y_true
elif y_true.ndim == 2:
y_true_index = tf.argmax(y_true, axis=1)
else:
raise Exception(f"`y_true.ndim` {y_true.ndim} not supported")
# get cost for batch
cost_for_batch = tf.nn.embedding_lookup(self._cost_matrix, y_true_index)
cost_for_batch *= y_pred
cost_for_batch = tf.reduce_sum(cost_for_batch, axis=1)
# get loss
return self.loss(y_true, y_pred, cost_for_batch)
if __name__ == '__main__':
# for debug purpose I have kept 'none' you can
# safely use other options like 'sum', 'auto'
_loss = tk.losses.MeanAbsoluteError(reduction='none')
# some cost matrices the first cost matrix is the case when you are
# not using cost sensitive weights
_cs_loss_1 = CostSensitiveLoss(
cost_matrix=[[1, 1, 1], [1, 1, 1], [1, 1, 1], ],
loss=_loss
)
_cs_loss_2 = CostSensitiveLoss(
cost_matrix=[[1, 2, 2], [4, 1, 4], [8, 8, 1], ],
loss=_loss
)
_cs_loss_3 = CostSensitiveLoss(
cost_matrix=[[1, 4, 8], [2, 1, 8], [2, 4, 1], ],
loss=_loss
)
_y_true = np.asarray(
[
[1, 0, 0],
[0, 1, 0],
[0, 0, 1],
[1, 0, 0],
[0, 1, 0],
[0, 0, 1],
[1, 0, 0],
[0, 1, 0],
[0, 0, 1],
]
)
_y_pred = np.asarray(
[
[0.8, 0.1, 0.1],
[0.1, 0.8, 0.1],
[0.1, 0.1, 0.8],
[0.1, 0.8, 0.1],
[0.1, 0.1, 0.8],
[0.8, 0.1, 0.1],
[0.1, 0.1, 0.8],
[0.8, 0.1, 0.1],
[0.1, 0.8, 0.1],
]
)
print("loss ........................")
print(_loss(_y_true, _y_pred).numpy())
print("cs_loss_1 ...................")
print(_cs_loss_1(_y_true, _y_pred).numpy())
print("cs_loss_2 ...................")
print(_cs_loss_2(_y_true, _y_pred).numpy())
print("cs_loss_3 ...................")
print(_cs_loss_3(_y_true, _y_pred).numpy())

Develop Reference

ios ruby-on-rails asp.net-mvc docker delphi jenkins grails google-sheets machine-learning dart

Why aren't my weights being updated? - machine-learning

Related

Tensorflow multi-GPU MNIST classifier: low accuracy

Can't get higher accuracy than 50%~ on CIFAR10 dataset

Tensorflow: Visualizing trained weights for linear classifier on MNIST dataset

Tensorflow shuffle batch fraction unexpected behavior

Cost-sensitive learning in Tensorflow

Categories

Resources