Compute accuracy on the TIMIT data set with an RNN model - machine-learning

I am trying to figure out how to compute the accuracy of an RNN model.
I have a batch size of 192, and each TIMIT sample has shape 490*129 (490 frames with 129 features per frame).
Currently, I compute the accuracy as follows:
for batch_x, batch_y, len_batch in val_loader:
    batch_x, batch_y, len_batch = batch_x.to(device), batch_y.to(device), len_batch.to(device)
    logits = model(batch_x)
    loss_val = model.loss(logits, batch_y, len_batch)
    preds1 = logits.argmax(dim=-1)
    temp1 = batch_y.argmax(dim=-1)
    acc1 = preds1.squeeze().eq(temp1).sum().item()
    acc_rate1 = 100 * acc1 / (192 * 490)
But the accuracy that I receive seems wrong.
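For reference, a per-frame accuracy that masks out padding and avoids hard-coding 192 * 490 could look roughly like this (a sketch, assuming logits has shape [batch, time, classes], batch_y is one-hot over the same leading dimensions, and len_batch holds the number of valid frames per utterance):
import torch

correct, total = 0, 0
for batch_x, batch_y, len_batch in val_loader:
    batch_x, batch_y, len_batch = batch_x.to(device), batch_y.to(device), len_batch.to(device)
    logits = model(batch_x)                            # [batch, time, classes]
    preds = logits.argmax(dim=-1)                      # [batch, time]
    targets = batch_y.argmax(dim=-1)                   # [batch, time]
    # mask[i, t] is True only for the first len_batch[i] frames of utterance i
    time_idx = torch.arange(preds.size(1), device=device).unsqueeze(0)
    mask = time_idx < len_batch.unsqueeze(1)
    correct += (preds.eq(targets) & mask).sum().item()
    total += mask.sum().item()
acc_rate = 100.0 * correct / total                     # accuracy over valid frames only
This counts only the frames inside each utterance's true length, so the rate is not diluted by padded frames or by a partially filled last batch.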
My training code:
model.train()
print('number of epochs: {}'.format(args.epochs))
best_test = 1e7
best_validation = 1e7
losses_diagonal = []
for ep in range(1, 2):
    init_time = datetime.now()
    processed = 0
    step = 1
    for batch_idx, (batch_x, batch_y, len_batch) in enumerate(train_loader):
        batch_x, batch_y, len_batch = batch_x.to(device), batch_y.to(device), len_batch.to(device)
        opt.zero_grad()
        logits = model(batch_x)
        loss = model.loss(logits, batch_y, len_batch)
        loss.backward()
        if args.clip > 0:
            nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        opt.step()
        processed += len(batch_x)
        step += 1
        losses_diagonal.append(loss)
        print(" batch_idx {}\tLoss: {:.2f} ".format(batch_idx, loss))
    print("Epoch {}, LR {:.5f} \tLoss: {:.2f} ".format(ep, opt.param_groups[0]['lr'], loss))
My test/validation code:
model.eval()
with torch.no_grad():
    for batch_x, batch_y, len_batch in test_loader:
        batch_x, batch_y, len_batch = batch_x.to(device), batch_y.to(device), len_batch.to(device)
        logits = model(batch_x)
        loss_test = model.loss(logits, batch_y, len_batch)
    for batch_x, batch_y, len_batch in val_loader:
        batch_x, batch_y, len_batch = batch_x.to(device), batch_y.to(device), len_batch.to(device)
        logits = model(batch_x)
        loss_val = model.loss(logits, batch_y, len_batch)
    for batch_x, batch_y, len_batch in val_loader:
        batch_x, batch_y, len_batch = batch_x.to(device), batch_y.to(device), len_batch.to(device)
        logits = model(batch_x)
        loss_val = model.loss(logits, batch_y, len_batch)
        preds1 = logits.argmax(dim=-1)
        temp1 = batch_y.argmax(dim=-1)
        acc1 = preds1.squeeze().eq(temp1).sum().item()
        acc_rate1 = 100 * acc1 / (192 * 490)
    print('acc = {}, acc_rate = {}'.format(acc1, acc_rate1))
    if loss_val < best_validation:
        best_validation = loss_val.item()
        best_test = loss_test.item()
    print()
    print("Val: Loss: {:.2f}\tBest: {:.2f}".format(loss_val, best_validation))
    print("Test: Loss: {:.2f}\tBest: {:.2f}".format(loss_test, best_test))
    print()
model.train()
I would appreciate some help with computing the accuracy of my RNN model, and I am happy to post more of my code if needed.

Related

Validation loss is not changing at all (PyTorch)

I'm using PyTorch for the first time to train a sentiment analysis model on top of a pre-trained BERT model.
This is my classifier:
class SentimentClassifier2(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier2, self).__init__()
        D_in, H, D_out = 768, 200, 3
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.4)
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Linear(H, D_out)
        )

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        output = self.drop(pooled_output)
        logits = self.classifier(output)
        return logits
This is my optimizer/loss function (I'm running only 20 epochs because training takes a while):
EPOCHS = 20

model2 = SentimentClassifier2(len(class_names))
model2 = model2.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=True)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)
Training & evaluation code:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)
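For context, these two functions would typically be driven by a per-epoch loop along the following lines (a sketch; the names val_data_loader, df_train and df_val are assumptions, since that part of the code is not shown):
for epoch in range(EPOCHS):
    train_acc, train_loss = train_epoch(
        model2, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train))
    val_acc, val_loss = eval_model(
        model2, val_data_loader, loss_fn, device, len(df_val))
    print(f'epoch{epoch + 1}:______________________')
    print(f'Train loss {train_loss} accuracy {train_acc}')
    print(f'Val loss {val_loss} accuracy {val_acc}')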
My problem: the loss on the validation samples is not changing at all!
epoch1:______________________
Train loss 1.0145157482929346 accuracy 0.4185746994848311
Val loss 1.002384223589083 accuracy 0.4151087371232354
epoch2:______________________
Train loss 1.015038197996413 accuracy 0.41871780194619346
Val loss 1.002384223589083 accuracy 0.4151087371232354
epoch3:______________________
Train loss 1.014710763787351 accuracy 0.4188609044075558
Val loss 1.002384223589083 accuracy 0.4151087371232354
epoch4:______________________
Train loss 1.0139196826735648 accuracy 0.41909940850982635
Val loss 1.002384223589083 accuracy 0.4151087371232354
I don't understand what the problem is ...!
I would be grateful if someone could help me ☹
Maybe you can try the following:
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.zero_grad()
optimizer.step()
scheduler.step()
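For comparison, the per-batch ordering usually used with AdamW and a warm-up scheduler in PyTorch clears the gradients before the backward pass (or right after the optimizer step), so that optimizer.step() still sees freshly computed gradients:
optimizer.zero_grad()                                        # clear gradients from the previous batch
loss.backward()                                              # compute gradients for this batch
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()                                             # apply the parameter update
scheduler.step()                                             # then advance the learning-rate schedule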

Predicting probabilities in a TensorFlow classifier

Hey, I am pretty new to TensorFlow. I am building a classification model that classifies into 0/1. Is there a way to predict the probability of the output being 1? Can predict_proba be used here? It is widely used with tflearn.dnn, but I can't find any reference for doing it in my case.
def main():
    train_x, test_x, train_y, test_y = load_csv_data()
    x_size = train_x.shape[1]
    y_size = train_y.shape[1]
    print(x_size)
    print(y_size)
    # variables
    X = tf.placeholder("float", shape=[None, x_size])
    y = tf.placeholder("float", shape=[None, y_size])
    weights_1 = initialize_weights((x_size, h_size))
    weights_2 = initialize_weights((h_size, y_size))
    # Forward propagation
    y_pred = forward_propagation(X, weights_1, weights_2)
    predict = tf.argmax(y_pred, dimension=1)
    # Backward propagation
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=y_pred))
    updates_sgd = tf.train.GradientDescentOptimizer(sgd_step).minimize(cost)
    # Start tensorflow session
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        steps = 1
        sess.run(init)
        x = np.arange(steps)
        test_acc = []
        train_acc = []
        print("Step, train accuracy, test accuracy")
        for step in range(steps):
            # Train with each example
            batch_size = len(train_x)
            avg_cost = 0
            print(batch_size)
            for i in range(len(train_x)):
                _, c = sess.run([updates_sgd, cost], feed_dict={X: train_x[i: i + 1], y: train_y[i: i + 1]})
                print(c)
                avg_cost += c / batch_size
            train_accuracy = np.mean(np.argmax(train_y, axis=1) ==
                                     sess.run(predict, feed_dict={X: train_x, y: train_y}))
            test_accuracy = np.mean(np.argmax(test_y, axis=1) ==
                                    sess.run(predict, feed_dict={X: test_x, y: test_y}))
            print(avg_cost)
            print("%d, %.2f%%, %.2f%%"
                  % (step + 1, 100. * train_accuracy, 100. * test_accuracy))
            test_acc.append(100. * test_accuracy)
            train_acc.append(100. * train_accuracy)
        predict = tf.argmax(y_pred, 1)
        test_data = load_test_data()
        print(test_data)
        pred = predict.eval(feed_dict={X: test_data})
        print(pred)
        for x in range(0, 100):
            print(pred[x])
        print(np.unique(pred))

main()
Here you take the argmax over the class scores:
predict = tf.argmax(y_pred, dimension=1)
Note that y_pred holds the raw logits (they are passed to softmax_cross_entropy_with_logits), so run them through tf.nn.softmax instead of the argmax to get class probabilities.
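A minimal sketch of that idea, reusing the names from the question (y_pred, X, sess and test_data as defined above):
# softmax turns the raw logits into per-class probabilities that sum to 1
probs = tf.nn.softmax(y_pred)

# inside the session, after training:
test_probs = sess.run(probs, feed_dict={X: test_data})
prob_of_one = test_probs[:, 1]   # probability of the output being class 1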

Huge loss value to NaN on regularization and dropout in a deep neural network

I'm taking the Deep Learning course on Udacity. One of the given tasks is to implement regularization and dropout in a multi-layer neural network.
After implementing them, my minibatch loss is insanely high at step 0, changes to infinity at step 1, and then becomes NaN for the rest of the output:
Offset at step 0: 0
Minibatch loss at step 0: 187359330304.000000
Minibatch accuracy: 10.2%
Validation accuracy: 10.0%
Offset at step 1: 128
Minibatch loss at step 1: inf
Minibatch accuracy: 14.1%
Validation accuracy: 10.0%
Offset at step 2: 256
Minibatch loss at step 2: nan
Minibatch accuracy: 7.8%
Validation accuracy: 10.0%
Offset at step 3: 384
Minibatch loss at step 3: nan
Minibatch accuracy: 11.7%
Validation accuracy: 10.0%
Here is all the relevant code. I'm confident it has nothing to do with the way I've done my optimization (since that is taken from the given task) or my regularization, so I'm not sure where else it could be. I've also played around with the number of nodes in the hidden layers (1024 > 300 > 60), but it does the same thing.
Here is my code:
batch_size = 128
num_nodes_1 = 768
num_nodes_2 = 1024
num_nodes_3 = 512
dropout_value = 0.5
beta = 0.01

graph = tf.Graph()
with graph.as_default():
    tf_train_data = tf.placeholder(tf.float32, shape=(batch_size, image_size*image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_data = tf.constant(valid_dataset)
    tf_test_data = tf.constant(test_dataset)

    def gen_weights_biases(input_size, output_size):
        weights = tf.Variable(tf.truncated_normal([input_size, output_size]))
        biases = tf.Variable(tf.zeros([output_size]))
        return weights, biases

    weights_1, biases_1 = gen_weights_biases(image_size*image_size, num_nodes_1)
    weights_2, biases_2 = gen_weights_biases(num_nodes_1, num_nodes_2)
    weights_3, biases_3 = gen_weights_biases(num_nodes_2, num_nodes_3)
    weights_4, biases_4 = gen_weights_biases(num_nodes_3, num_labels)

    logits_1 = tf.matmul(tf_train_data, weights_1) + biases_1
    h_layer_1 = tf.nn.relu(logits_1)
    h_layer_1 = tf.nn.dropout(h_layer_1, dropout_value)
    logits_2 = tf.matmul(h_layer_1, weights_2) + biases_2
    h_layer_2 = tf.nn.relu(logits_2)
    h_layer_2 = tf.nn.dropout(h_layer_2, dropout_value)
    logits_3 = tf.matmul(h_layer_2, weights_3) + biases_3
    h_layer_3 = tf.nn.relu(logits_3)
    h_layer_3 = tf.nn.dropout(h_layer_3, dropout_value)
    logits_4 = tf.matmul(h_layer_3, weights_4) + biases_4

    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits_4))
    regularization = tf.nn.l2_loss(logits_1) + tf.nn.l2_loss(logits_2) + tf.nn.l2_loss(logits_3) + tf.nn.l2_loss(logits_4)
    reg_loss = tf.reduce_mean(loss + regularization * beta)

    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(0.5, global_step, 750, 0.8)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(reg_loss, global_step=global_step)

    train_prediction = tf.nn.softmax(logits_4)

    def make_prediction(input_data):
        p_logits_1 = tf.matmul(input_data, weights_1) + biases_1
        p_layer_1 = tf.nn.relu(p_logits_1)
        p_logits_2 = tf.matmul(p_layer_1, weights_2) + biases_2
        p_layer_2 = tf.nn.relu(p_logits_2)
        p_logits_3 = tf.matmul(p_layer_2, weights_3) + biases_3
        p_layer_3 = tf.nn.relu(p_logits_3)
        p_logits_4 = tf.matmul(p_layer_3, weights_4) + biases_4
        return tf.nn.relu(p_logits_4)

    valid_prediction = make_prediction(tf_valid_data)
    test_prediction = make_prediction(tf_test_data)

num_steps = 10001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized \n")
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_data: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run([optimizer, reg_loss, train_prediction], feed_dict=feed_dict)
        if (step % 1 == 0):
            print("Offset at step %d: %d" % (step, offset))
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%% \n" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
Why is this happening, and how do I fix it?
The problem was the standard deviation of the weights. I'm not sure why this sorted it out, and if someone could explain I'd appreciate it. Anyway, the fix was:
def gen_weights_biases(input_size, output_size):
    weights = tf.Variable(tf.truncated_normal([input_size, output_size], stddev=math.sqrt(2.0/(input_size))))
    biases = tf.Variable(tf.zeros([output_size]))
    return weights, biases
The beta rate also had to be lowered to 0.0001
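A rough explanation (this is the standard He/Kaiming initialization argument, not something stated in the course materials): with stddev=1 the pre-activations of each layer have variance roughly equal to the number of inputs, so after a few layers the logits are enormous; the softmax cross-entropy and the L2 penalty on those logits then overflow to inf/NaN. Scaling the stddev by sqrt(2 / fan_in) keeps the activation magnitude roughly constant across ReLU layers. A small NumPy sketch of the effect:
import numpy as np

rng = np.random.default_rng(0)
fan_in = 784                                   # e.g. image_size * image_size for 28x28 inputs
x = rng.standard_normal((128, fan_in))

# stddev = 1: the variance of each pre-activation grows with fan_in
w_default = rng.standard_normal((fan_in, 1024))
print(np.var(x @ w_default))                   # roughly fan_in, i.e. ~784

# stddev = sqrt(2 / fan_in): the second moment of the ReLU outputs stays near 1
w_he = rng.standard_normal((fan_in, 1024)) * np.sqrt(2.0 / fan_in)
print(np.mean(np.maximum(x @ w_he, 0) ** 2))   # close to 1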

ValueError: could not broadcast input array from shape (2,224,224,3) into shape (2,224,224)

TensorFlow - how do I predict with a trained model on a different test dataset? I was working on image segmentation. The prediction output has different dimensions and is giving me a hard time. Any help is appreciated.
def main(argv=None):
    keep_probability = tf.placeholder(tf.float32, name="keep_probabilty")
    image = tf.placeholder(tf.float32, shape=[None, IMAGE_SIZE, IMAGE_SIZE, 3], name="input_image")
    annotation = tf.placeholder(tf.int32, shape=[None, IMAGE_SIZE, IMAGE_SIZE, 1], name="annotation")

    pred_annotation, logits = inference(image, keep_probability)
    tf.summary.image("input_image", image, max_outputs=2)
    tf.summary.image("ground_truth", tf.cast(annotation, tf.uint8), max_outputs=2)
    tf.summary.image("pred_annotation", tf.cast(pred_annotation, tf.uint8), max_outputs=2)
    loss = tf.reduce_mean((tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                          labels=tf.squeeze(annotation, squeeze_dims=[3]),
                                                                          name="entropy")))
    tf.summary.scalar("entropy", loss)

    trainable_var = tf.trainable_variables()
    if FLAGS.debug:
        for var in trainable_var:
            utils.add_to_regularization_and_summary(var)
    train_op = train(loss, trainable_var)

    print("Setting up summary op...")
    summary_op = tf.summary.merge_all()

    print("Setting up image reader...")
    train_records, valid_records, test_records = scene_parsing.read_dataset(FLAGS.data_dir)
    print(len(train_records))
    print(len(valid_records))
    print(len(test_records))

    print("Setting up dataset reader")
    image_options = {'resize': True, 'resize_size': IMAGE_SIZE}
    if FLAGS.mode == 'train':
        train_dataset_reader = dataset.BatchDatset(train_records, image_options)
        validation_dataset_reader = dataset.BatchDatset(valid_records, image_options)
        test_dataset_reader = dataset.BatchDatset(test_records, image_options)

    sess = tf.Session()

    print("Setting up Saver...")
    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(FLAGS.logs_dir, sess.graph)

    sess.run(tf.global_variables_initializer())
    ckpt = tf.train.get_checkpoint_state(FLAGS.logs_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        print("Model restored...")

    if FLAGS.mode == "train":
        for itr in xrange(MAX_ITERATION):
            train_images, train_annotations = train_dataset_reader.next_batch(FLAGS.batch_size)
            feed_dict = {image: train_images, annotation: train_annotations, keep_probability: 0.85}
            sess.run(train_op, feed_dict=feed_dict)
            if itr % 10 == 0:
                train_loss, summary_str = sess.run([loss, summary_op], feed_dict=feed_dict)
                print("Step: %d, Train_loss:%g" % (itr, train_loss))
                summary_writer.add_summary(summary_str, itr)
            if itr % 100 == 0:
                valid_images, valid_annotations = validation_dataset_reader.next_batch(FLAGS.batch_size)
                valid_loss = sess.run(loss, feed_dict={image: valid_images, annotation: valid_annotations,
                                                       keep_probability: 1.0})
                print("%s ---> Validation_loss: %g" % (datetime.datetime.now(), valid_loss))
                saver.save(sess, FLAGS.logs_dir + "model.ckpt", itr)

    elif FLAGS.mode == "predict":
        predict_dataset_reader = dataset.BatchDatset(train_records, image_options)
        test_images = predict_dataset_reader.get_random_batch(FLAGS.batch_size)
        pred = sess.run(pred_annotation, feed_dict={image: test_images,
                                                    keep_probability: 1.0})
        #test_annotations = np.squeeze(test_annotations, axis=3)
        pred = np.squeeze(pred, axis=3)
        for itr in range(FLAGS.batch_size):
            utils.save_image(test_images[itr].astype(np.uint8), FLAGS.logs_dir, name="inp_" + str(20+itr))
            utils.save_image(pred[itr].astype(np.uint8), FLAGS.logs_dir, name="pred_" + str(20+itr))
            print("Saved image: %d" % itr)

if __name__ == "__main__":
    tf.app.run()
The error occurs in "predict" mode; "train" mode works perfectly.

NaN loss when training a deep neural network in the TensorFlow tutorial

I am trying to train a neural network with more than one hidden layer on the notMNIST dataset. When I have one hidden layer it works fine, but when I add multiple hidden layers I start getting NaN for the loss. Here is the code I am using:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

batch_size = 128
num_hidden = 1024
num_hidden2 = 300
num_hidden3 = 50
SEED = 1234567
keep_prob = 0.5

graph1 = tf.Graph()
with graph1.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    weights1 = tf.Variable(tf.truncated_normal([image_size * image_size, num_hidden]))
    biases1 = tf.Variable(tf.zeros([num_hidden]))
    weights2 = tf.Variable(tf.truncated_normal([num_hidden, num_hidden2]))
    biases2 = tf.Variable(tf.zeros([num_hidden2]))
    weights3 = tf.Variable(tf.truncated_normal([num_hidden2, num_hidden3]))
    biases3 = tf.Variable(tf.zeros([num_hidden3]))
    weights4 = tf.Variable(tf.truncated_normal([num_hidden3, num_labels]))
    biases4 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    l1 = tf.matmul(tf_train_dataset, weights1) + biases1
    h1 = tf.nn.relu(l1)
    h1 = tf.nn.dropout(h1, 0.5, seed=SEED)
    l2 = tf.matmul(h1, weights2) + biases2
    h2 = tf.nn.relu(l2)
    h2 = tf.nn.dropout(h2, 0.5, seed=SEED)
    l3 = tf.matmul(h2, weights3) + biases3
    h3 = tf.nn.relu(l3)
    h3 = tf.nn.dropout(h3, 0.5, seed=SEED)
    logits = tf.matmul(h3, weights4) + biases4
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))

    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(weights1) + tf.nn.l2_loss(biases1) +
                    tf.nn.l2_loss(weights2) + tf.nn.l2_loss(biases2) +
                    tf.nn.l2_loss(weights3) + tf.nn.l2_loss(biases3) +
                    tf.nn.l2_loss(weights4) + tf.nn.l2_loss(biases4))
    # Add the regularization term to the loss.
    loss += 5e-4 * regularizers

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    v_l1 = tf.matmul(tf_valid_dataset, weights1) + biases1
    v_h1 = tf.nn.relu(v_l1)
    v_l2 = tf.matmul(v_h1, weights2) + biases2
    v_h2 = tf.nn.relu(v_l2)
    v_l3 = tf.matmul(v_h2, weights3) + biases3
    v_h3 = tf.nn.relu(v_l3)
    v_logits = tf.matmul(v_h3, weights4) + biases4
    valid_prediction = tf.nn.softmax(v_logits)

    t_l1 = tf.matmul(tf_test_dataset, weights1) + biases1
    t_h1 = tf.nn.relu(t_l1)
    t_l2 = tf.matmul(t_h1, weights2) + biases2
    t_h2 = tf.nn.relu(t_l2)
    t_l3 = tf.matmul(t_h2, weights3) + biases3
    t_h3 = tf.nn.relu(t_l3)
    t_logits = tf.matmul(t_h3, weights4) + biases4
    test_prediction = tf.nn.softmax(t_logits)

num_steps = 3001

with tf.Session(graph=graph1) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
This is the output I get
Initialized
Minibatch loss at step 0: 48759.078125
Minibatch accuracy: 10.2%
Validation accuracy: 10.0%
Minibatch loss at step 500: nan
Minibatch accuracy: 9.4%
Validation accuracy: 10.0%
Minibatch loss at step 1000: nan
Minibatch accuracy: 8.6%
Validation accuracy: 10.0%
Minibatch loss at step 1500: nan
Minibatch accuracy: 11.7%
Validation accuracy: 10.0%
Minibatch loss at step 2000: nan
Minibatch accuracy: 6.2%
Validation accuracy: 10.0%
Minibatch loss at step 2500: nan
Minibatch accuracy: 10.2%
Validation accuracy: 10.0%
Minibatch loss at step 3000: nan
Minibatch accuracy: 7.8%
Validation accuracy: 10.0%
Test accuracy: 10.0%
Try lowering the standard deviation of the weights. The default is 1.0. It worked for me.
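Applied to the variables in the question, that suggestion would look roughly like this (a sketch; the sqrt(2 / fan_in) scale is the usual He initialization for ReLU layers, while the answer itself only says to lower the stddev):
import math

# scale each weight matrix by sqrt(2 / fan_in) so the ReLU activations do not blow up
weights1 = tf.Variable(tf.truncated_normal(
    [image_size * image_size, num_hidden],
    stddev=math.sqrt(2.0 / (image_size * image_size))))
weights2 = tf.Variable(tf.truncated_normal(
    [num_hidden, num_hidden2], stddev=math.sqrt(2.0 / num_hidden)))
weights3 = tf.Variable(tf.truncated_normal(
    [num_hidden2, num_hidden3], stddev=math.sqrt(2.0 / num_hidden2)))
weights4 = tf.Variable(tf.truncated_normal(
    [num_hidden3, num_labels], stddev=math.sqrt(2.0 / num_hidden3)))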
