Classification using deep nets on CIFAR-10 dataset - machine-learning

I am trying to build a classifier using deep learning techniques and used cifar-10 dataset to build a classifier.I have tried to build a classifier with 1024 hidden nodes. Each image is of 32*32*3(R-G-B) size.I have loaded data from only 3/5 files of dataset as my computer has low processing power.
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import os
import sys
import tarfile
import random
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
from sklearn.preprocessing import MultiLabelBinarizer
folder='/home/cifar-10-batches-py/'
training_data=np.ndarray((30000,3072),dtype=np.float32)
training_labels=np.ndarray(30000,dtype=np.int32)
testing_data=np.ndarray((10000,3072),dtype=np.float32)
testing_labels=np.ndarray(10000,dtype=np.int32)
no_of_files=3
begin=0
end=10000
for i in range(no_of_files):
with open(folder+"data_batch_"+str(i+1),'rb') as f:
s=pickle.load(f,encoding='bytes')
training_data[begin:end]=s[b'data']
training_labels[begin:end]=s[b'labels']
begin=begin+10000
end=end+10000
test_path='/home/cifar-10-batches-py/test_batch'
with open(test_path,'rb') as d:
s9=pickle.load(d,encoding='bytes')
tdata=s9[b'data']
testing_data=tdata
tlabels=s9[b'labels']
testing_labels=tlabels
test_data=np.ndarray((5000,3072),dtype=np.float32)
test_labels=np.ndarray(5000,dtype=np.int32)
valid_data=np.ndarray((5000,3072),dtype=np.float32)
valid_labels=np.ndarray(5000,dtype=np.int32)
valid_data[:,:]=testing_data[:5000, :]
valid_labels[:]=testing_labels[:5000]
test_data[:,:]=testing_data[5000:, :]
test_labels[:]=testing_labels[5000:]
onehot_training_labels=np.eye(10)[training_labels.astype(int)]
onehot_test_labels=np.eye(10)[test_labels.astype(int)]
onehot_valid_labels=np.eye(10)[valid_labels.astype(int)]
image_size=32
num_labels=10
train_subset = 10000
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
/ predictions.shape[0])
batch_size = 128
relu_count = 1024 #hidden nodes count
graph = tf.Graph()
with graph.as_default():
tf_train_dataset = tf.placeholder(tf.float32,
shape=(batch_size, image_size * image_size*3))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_data)
tf_test_dataset = tf.constant(test_data)
beta_regul = tf.placeholder(tf.float32)
weights1 = tf.Variable(
tf.truncated_normal([image_size * image_size*3, relu_count]))
biases1 = tf.Variable(tf.zeros([relu_count]))
weights2 = tf.Variable(
tf.truncated_normal([relu_count, num_labels]))
biases2 = tf.Variable(tf.zeros([num_labels]))
preds = tf.matmul( tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1), weights2) + biases2
loss = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(logits=preds, labels=tf_train_labels))+ \
beta_regul * (tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2))
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
train_prediction = tf.nn.softmax(preds)
lay1_valid = tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1)
valid_prediction = tf.nn.softmax(tf.matmul(lay1_valid, weights2) + biases2)
lay1_test = tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1)
test_prediction = tf.nn.softmax(tf.matmul(lay1_test, weights2) + biases2)
num_steps = 5000
with tf.Session(graph=graph) as session:
tf.initialize_all_variables().run()
print("Initialized")
for step in range(num_steps):
offset = (step * batch_size) % (onehot_training_labels.shape[0] - batch_size)
batch_data = training_data[offset:(offset + batch_size), :]
batch_labels = onehot_training_labels[offset:(offset + batch_size), :]
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels,beta_regul : 1e-3}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), onehot_valid_labels))
print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), onehot_test_labels))
the output for this code is:
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/util/tf_should_use.py:170: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized
Minibatch loss at step 0: 117783.914062
Minibatch accuracy: 14.8%
Validation accuracy: 10.2%
Minibatch loss at step 500: 3632989892247552.000000
Minibatch accuracy: 12.5%
Validation accuracy: 10.1%
Minibatch loss at step 1000: 2203224941527040.000000
Minibatch accuracy: 6.2%
Validation accuracy: 9.9%
Minibatch loss at step 1500: 1336172110413824.000000
Minibatch accuracy: 10.9%
Validation accuracy: 9.8%
Minibatch loss at step 2000: 810328996708352.000000
Minibatch accuracy: 8.6%
Validation accuracy: 10.1%
Minibatch loss at step 2500: 491423044468736.000000
Minibatch accuracy: 9.4%
Validation accuracy: 10.1%
Minibatch loss at step 3000: 298025566076928.000000
Minibatch accuracy: 12.5%
Validation accuracy: 9.8%
Minibatch loss at step 3500: 180741635833856.000000
Minibatch accuracy: 10.9%
Validation accuracy: 9.8%
Minibatch loss at step 4000: 109611013111808.000000
Minibatch accuracy: 15.6%
Validation accuracy: 10.1%
Minibatch loss at step 4500: 66473376612352.000000
Minibatch accuracy: 3.9%
Validation accuracy: 9.9%
Test accuracy: 10.2%
where am i doing it wrong? I see the accuracy is very low.

As far as I can see, you are building a simple 2-layer FNN with Tensorflow. While it is okay, you won't get a really high accuracy. But if you try, you need to carefully tune hyperparameters - learning rate, regularization strength, decay rate, number of neurons in hidden layers.
You are using not all data, so it will surely descrease the quality of predictions. It could still work, but you should check classes distribution in train, val and test sets. It is possible that some classes have too little values in one of datasets. You need at least to stratify your selection.
Are you sure you have a deep understanding of deep learning? It could be a good idea to try cs231n course.

Related

Resuming Training PyTorch

I'm attempting to save and load best model through torch, where I've defined my training function as follows:
def train_model(model, train_loader, test_loader, device, learning_rate=1e-1, num_epochs=200):
# The training configurations were not carefully selected.
criterion = nn.CrossEntropyLoss()
model.to(device)
# It seems that SGD optimizer is better than Adam optimizer for ResNet18 training on CIFAR10.
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-4)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=500)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[65, 75], gamma=0.75, last_epoch=-1)
# optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
# Evaluation
model.eval()
eval_loss, eval_accuracy = evaluate_model(model=model, test_loader=test_loader, device=device, criterion=criterion)
print("Epoch: {:02d} Eval Loss: {:.3f} Eval Acc: {:.3f}".format(-1, eval_loss, eval_accuracy))
load_model = input('Load a model?')
for epoch in range(num_epochs):
if epoch//2 == 0:
write_checkpoint(model=model, epoch=epoch, scheduler=scheduler, optimizer=optimizer)
model, optimizer, epoch, scheduler = load_checkpoint(model=model, scheduler=scheduler, optimizer=optimizer)
for state in optimizer.state.values():
for k, v in state.items():
if isinstance(v, torch.Tensor):
state[k] = v.to(device)
# Training
model.train()
running_loss = 0
running_corrects = 0
for inputs, labels in train_loader:
inputs = torch.FloatTensor(inputs)
inputs = inputs.to(device)
labels = labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
train_loss = running_loss / len(train_loader.dataset)
train_accuracy = running_corrects / len(train_loader.dataset)
# Evaluation
model.eval()
eval_loss, eval_accuracy = evaluate_model(model=model, test_loader=test_loader, device=device, criterion=criterion)
# Set learning rate scheduler
scheduler.step()
print("Epoch: {:03d} Train Loss: {:.3f} Train Acc: {:.3f} Eval Loss: {:.3f} Eval Acc: {:.3f}".format(epoch, train_loss, train_accuracy, eval_loss, eval_accuracy))
return model
Where I'd like to be able to load a model, and start training from the epoch where model was saved.
So far I have methods to save model, optimizer,scheduler states and the epoch via
def write_checkpoint(model, optimizer, epoch, scheduler):
state = {'epoch': epoch + 1, 'state_dict': model.state_dict(),
'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), }
filename = '/content/model_'
torch.save(state, filename + f'CP_epoch{epoch + 1}.pth')
def load_checkpoint(model, optimizer, scheduler, filename='/content/checkpoint.pth'):
# Note: Input model & optimizer should be pre-defined. This routine only updates their states.
start_epoch = 0
if os.path.isfile(filename):
print("=> loading checkpoint '{}'".format(filename))
checkpoint = torch.load(filename)
start_epoch = checkpoint['epoch']
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
scheduler = checkpoint['scheduler']
print("=> loaded checkpoint '{}' (epoch {})"
.format(filename, checkpoint['epoch']))
else:
print("=> no checkpoint found at '{}'".format(filename))
return model, optimizer, start_epoch, scheduler
But I can't seem to come up with the logic of how I'd update the epoch to start at the correct one. Looking for hints or ideas on how to implement just that.
If I understand correctly you trying to resume training from last progress with correct epoch number.
Before calling train_model load the checkpoint values including start_epoch. Then use start_epoch as loop starting point,
for epoch in range(start_epoch, num_epochs):

Huge loss value to NaN on regularization and dropout in a deep neural network

I'm taking the Deep Learning course on Udacity. One of the tasks that is given is to implement regularization and dropout into a multi layer neural network.
After implementation, my minibatch loss in insanely high at step 0, changes to infinity at step 1, and then becomes non existent for the rest of the output
Offset at step 0: 0
Minibatch loss at step 0: 187359330304.000000
Minibatch accuracy: 10.2%
Validation accuracy: 10.0%
Offset at step 1: 128
Minibatch loss at step 1: inf
Minibatch accuracy: 14.1%
Validation accuracy: 10.0%
Offset at step 2: 256
Minibatch loss at step 2: nan
Minibatch accuracy: 7.8%
Validation accuracy: 10.0%
Offset at step 3: 384
Minibatch loss at step 3: nan
Minibatch accuracy: 11.7%
Validation accuracy: 10.0%
Here is all the relevant code. I'm confident it has nothing to do with the way I've done my optimization (since that is taken from the given task) or my
regularization so I'm not sure where else it could be. I've also played around with the number of nodes in the hidden layers (1024 > 300 > 60) but it does the same thing.
Here is my code (excuse the indentation, it's correct in my code):
batch_size = 128
num_nodes_1 = 768
num_nodes_2 = 1024
num_nodes_3 = 512
dropout_value = 0.5
beta = 0.01
graph = tf.Graph()
with graph.as_default():
tf_train_data = tf.placeholder(tf.float32, shape=(batch_size, image_size*image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_data = tf.constant(valid_dataset)
tf_test_data = tf.constant(test_dataset)
def gen_weights_biases(input_size, output_size):
weights = tf.Variable(tf.truncated_normal([input_size, output_size]))
biases = tf.Variable(tf.zeros([output_size]))
return weights, biases
weights_1, biases_1 = gen_weights_biases(image_size*image_size, num_nodes_1)
weights_2, biases_2 = gen_weights_biases(num_nodes_1, num_nodes_2)
weights_3, biases_3 = gen_weights_biases(num_nodes_2, num_nodes_3)
weights_4, biases_4 = gen_weights_biases(num_nodes_3, num_labels)
logits_1 = tf.matmul(tf_train_data, weights_1) + biases_1
h_layer_1 = tf.nn.relu(logits_1)
h_layer_1 = tf.nn.dropout(h_layer_1, dropout_value)
logits_2 = tf.matmul(h_layer_1, weights_2) + biases_2
h_layer_2 = tf.nn.relu(logits_2)
h_layer_2 = tf.nn.dropout(h_layer_2, dropout_value)
logits_3 = tf.matmul(h_layer_2, weights_3) + biases_3
h_layer_3 = tf.nn.relu(logits_3)
h_layer_3 = tf.nn.dropout(h_layer_3, dropout_value)
logits_4 = tf.matmul(h_layer_3, weights_4) + biases_4
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits_4))
regularization = tf.nn.l2_loss(logits_1) + tf.nn.l2_loss(logits_2) + tf.nn.l2_loss(logits_3) + tf.nn.l2_loss(logits_4)
reg_loss = tf.reduce_mean(loss + regularization * beta)
global_step = tf.Variable(0)
learning_rate = tf.train.exponential_decay(0.5, global_step, 750, 0.8)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(reg_loss, global_step=global_step)
train_prediction = tf.nn.softmax(logits_4)
def make_prediction(input_data):
p_logits_1 = tf.matmul(input_data, weights_1) + biases_1
p_layer_1 = tf.nn.relu(p_logits_1)
p_logits_2 = tf.matmul(p_layer_1, weights_2) + biases_2
p_layer_2 = tf.nn.relu(p_logits_2)
p_logits_3 = tf.matmul(p_layer_2, weights_3) + biases_3
p_layer_3 = tf.nn.relu(p_logits_3)
p_logits_4 = tf.matmul(p_layer_3, weights_4) + biases_4
return tf.nn.relu(p_logits_4)
valid_prediction = make_prediction(tf_valid_data)
test_prediction = make_prediction(tf_test_data)
num_steps = 10001
with tf.Session(graph=graph) as session:
tf.global_variables_initializer().run()
print("Initialized \n")
for step in range(num_steps):
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
feed_dict = {tf_train_data:batch_data, tf_train_labels:batch_labels}
_, l, predictions = session.run([optimizer, reg_loss, train_prediction], feed_dict=feed_dict)
if(step % 1 == 0):
print("Offset at step %d: %d" % (step, offset))
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%% \n" % accuracy(valid_prediction.eval(), valid_labels))
print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
Why is this happening, and how do I fix it?
The problem was the standard deviation for the weights. I'm not sure why this sorted it out and if someone could explain I'd appreciate it. Anyway the fix was:
def gen_weights_biases(input_size, output_size):
weights = tf.Variable(tf.truncated_normal([input_size, output_size], stddev=math.sqrt(2.0/(input_size))))
biases = tf.Variable(tf.zeros([output_size]))
return weights, biases
The beta rate also had to be lowered to 0.0001

Why is my 1-hidden layer neural network with ReLUs not getting more than 18% accuracy on the notMNIST dataset?

I'm trying to implement a 1-hidden layer neural network with rectified linear units and 1024 hidden nodes using Tensorflow.
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
/ predictions.shape[0])
batch_size = 128
graph = tf.Graph()
with graph.as_default():
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32,
shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
# Variables.
weights1 = tf.Variable(
tf.truncated_normal([image_size * image_size, 1024]))
biases1 = tf.Variable(tf.zeros([1024]))
weights2 = tf.Variable(
tf.truncated_normal([1024, num_labels]))
biases2 = tf.Variable(tf.zeros([num_labels]))
# Training computation.
logits = tf.matmul(tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1), weights2) + biases2
loss = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
# Optimizer.
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(logits)
valid_prediction = tf.nn.softmax(
tf.matmul(
tf.nn.relu(
tf.matmul(tf_valid_dataset, weights1)
+ biases1),
weights2) + biases2)
test_prediction = tf.nn.softmax(
tf.matmul(
tf.nn.relu(
tf.matmul(tf_test_dataset, weights1)
+ biases1),
weights2) + biases2)
num_steps = 3001
with tf.Session(graph=graph) as session:
tf.global_variables_initializer().run()
print("Initialized")
for step in range(num_steps):
# Pick an offset within the training data, which has been randomized.
# Note: we could use better randomization across epochs.
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
# Generate a minibatch.
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
# Prepare a dictionary telling the session where to feed the minibatch.
# The key of the dictionary is the placeholder node of the graph to be fed,
# and the value is the numpy array to feed to it.
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
Here is the output I get:
Initialized
Minibatch loss at step 0: 208.975021
Minibatch accuracy: 11.7%
Validation accuracy: 10.0%
Minibatch loss at step 500: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 10.2%
Minibatch loss at step 1000: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 14.6%
Minibatch loss at step 1500: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 10.2%
Minibatch loss at step 2000: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 17.7%
Minibatch loss at step 2500: 2.952326
Minibatch accuracy: 93.8%
Validation accuracy: 26.6%
Minibatch loss at step 3000: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 17.5%
Test accuracy: 18.1%
It looks like it's overfitting. It gets close to 100% accuracy on the training data, but it only gets around 20% accuracy on the validation and testing data.
Is this the proper way to implement a 1-hidden layer neural network with rectified linear units? If so, how can I increase the accuracy?
Here are some suggestions that may improve your accuracy:
First of all, your hidden layer, which is of size 1024, seems too large. This may cause overfitting. I would try to reduce it to around 50-100 or so, see whether it improves and continue from there.
In addition, regarding this line:
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
0.5 learning rate might be too high, try to reduce it (to 0.01, 0.001 or so) and see what happens. Finally, you can also try to use tf.train.AdamOptimizer instead of tf.train.GradientDescentOptimizer, as in many cases it performs better.

nan loss when training a deep neural network in tensorflow tutorial

I am trying to train a neural network with more than 1 hidden layer on the notMNIST. When I have one hidden layer it works fine, but when I add multiple hidden layers I start getting nan for the loss. Here is the code i am using
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range
batch_size = 128
num_hidden = 1024
num_hidden2 = 300
num_hidden3 = 50
SEED = 1234567
keep_prob = 0.5
graph1 = tf.Graph()
with graph1.as_default():
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32,
shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
# Variables.
weights1 = tf.Variable(tf.truncated_normal([image_size * image_size, num_hidden]))
biases1 = tf.Variable(tf.zeros([num_hidden]))
weights2 = tf.Variable(tf.truncated_normal([num_hidden, num_hidden2]))
biases2 = tf.Variable(tf.zeros([num_hidden2]))
weights3 = tf.Variable(tf.truncated_normal([num_hidden2, num_hidden3]))
biases3 = tf.Variable(tf.zeros([num_hidden3]))
weights4 = tf.Variable(tf.truncated_normal([num_hidden3, num_labels]))
biases4 = tf.Variable(tf.zeros([num_labels]))
# Training computation.
l1 = tf.matmul(tf_train_dataset, weights1) + biases1
h1 = tf.nn.relu(l1)
h1 = tf.nn.dropout(h1, 0.5, seed=SEED)
l2 = tf.matmul(h1, weights2) + biases2
h2 = tf.nn.relu(l2)
h2 = tf.nn.dropout(h2, 0.5, seed=SEED)
l3 = tf.matmul(h2, weights3) + biases3
h3 = tf.nn.relu(l3)
h3 = tf.nn.dropout(h3, 0.5, seed=SEED)
logits = tf.matmul(h3, weights4) + biases4
loss = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
# L2 regularization for the fully connected parameters.
regularizers = (tf.nn.l2_loss(weights1) + tf.nn.l2_loss(biases1) +
tf.nn.l2_loss(weights2) + tf.nn.l2_loss(biases2) +
tf.nn.l2_loss(weights3) + tf.nn.l2_loss(biases3) +
tf.nn.l2_loss(weights4) + tf.nn.l2_loss(biases4))
# Add the regularization term to the loss.
loss += 5e-4 * regularizers
# Optimizer.
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(logits)
v_l1 = tf.matmul(tf_valid_dataset, weights1) + biases1
v_h1 = tf.nn.relu(v_l1)
v_l2 = tf.matmul(v_h1, weights2) + biases2
v_h2 = tf.nn.relu(v_l2)
v_l3 = tf.matmul(v_h2, weights3) + biases3
v_h3 = tf.nn.relu(v_l3)
v_logits = tf.matmul(v_h3, weights4) + biases4
valid_prediction = tf.nn.softmax(v_logits)
t_l1 = tf.matmul(tf_test_dataset, weights1) + biases1
t_h1 = tf.nn.relu(t_l1)
t_l2 = tf.matmul(t_h1, weights2) + biases2
t_h2 = tf.nn.relu(t_l2)
t_l3 = tf.matmul(t_h2, weights3) + biases3
t_h3 = tf.nn.relu(t_l3)
t_logits = tf.matmul(t_h3, weights4) + biases4
test_prediction = tf.nn.softmax(t_logits)
num_steps = 3001
with tf.Session(graph=graph1) as session:
tf.initialize_all_variables().run()
print("Initialized")
for step in range(num_steps):
# Pick an offset within the training data, which has been randomized.
# Note: we could use better randomization across epochs.
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
# Generate a minibatch.
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
# Prepare a dictionary telling the session where to feed the minibatch.
# The key of the dictionary is the placeholder node of the graph to be fed,
# and the value is the numpy array to feed to it.
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
This is the output I get
Initialized
Minibatch loss at step 0: 48759.078125
Minibatch accuracy: 10.2%
Validation accuracy: 10.0%
Minibatch loss at step 500: nan
Minibatch accuracy: 9.4%
Validation accuracy: 10.0%
Minibatch loss at step 1000: nan
Minibatch accuracy: 8.6%
Validation accuracy: 10.0%
Minibatch loss at step 1500: nan
Minibatch accuracy: 11.7%
Validation accuracy: 10.0%
Minibatch loss at step 2000: nan
Minibatch accuracy: 6.2%
Validation accuracy: 10.0%
Minibatch loss at step 2500: nan
Minibatch accuracy: 10.2%
Validation accuracy: 10.0%
Minibatch loss at step 3000: nan
Minibatch accuracy: 7.8%
Validation accuracy: 10.0%
Test accuracy: 10.0%
try lowering the standard deviation of the weights. The default is set to 1. It worked for me.

Keras, sparse matrix issue

I am trying to feed a huge sparse matrix to Keras model. As the dataset doesn`t fit into RAM, the way around is to train the model on a data generated batch-by-batch by a generator.
To test this approach and make sure my solution works fine, I slightly modified a Kera`s simple MLP on the Reuters newswire topic classification task. So, the idea is to compare original and edited models. I just convert numpy.ndarray into scipy.sparse.csr.csr_matrix and feed it to the model.
But my model crashes at some point and I need a hand to figure out a reason.
Here is the original model and my additions below
from __future__ import print_function
import numpy as np
np.random.seed(1337) # for reproducibility
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
max_words = 1000
batch_size = 32
nb_epoch = 5
print('Loading data...')
(X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=max_words, test_split=0.2)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')
nb_classes = np.max(y_train)+1
print(nb_classes, 'classes')
print('Vectorizing sequence data...')
tokenizer = Tokenizer(nb_words=max_words)
X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)
print('Y_train shape:', Y_train.shape)
print('Y_test shape:', Y_test.shape)
print('Building model...')
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
history = model.fit(X_train, Y_train,
nb_epoch=nb_epoch, batch_size=batch_size,
verbose=1)#, validation_split=0.1)
#score = model.evaluate(X_test, Y_test,
# batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])
It outputs:
Loading data...
8982 train sequences
2246 test sequences
46 classes
Vectorizing sequence data...
X_train shape: (8982, 1000)
X_test shape: (2246, 1000)
Convert class vector to binary class matrix (for use with categorical_crossentropy)
Y_train shape: (8982, 46)
Y_test shape: (2246, 46)
Building model...
Epoch 1/5
8982/8982 [==============================] - 5s - loss: 1.3932 - acc: 0.6906
Epoch 2/5
8982/8982 [==============================] - 4s - loss: 0.7522 - acc: 0.8234
Epoch 3/5
8982/8982 [==============================] - 5s - loss: 0.5407 - acc: 0.8681
Epoch 4/5
8982/8982 [==============================] - 5s - loss: 0.4160 - acc: 0.8980
Epoch 5/5
8982/8982 [==============================] - 5s - loss: 0.3338 - acc: 0.9136
Test score: 1.01453569163
Test accuracy: 0.797417631398
Finally, here is my part
X_train_sparse = sparse.csr_matrix(X_train)
def batch_generator(X, y, batch_size):
n_batches_for_epoch = X.shape[0]//batch_size
for i in range(n_batches_for_epoch):
index_batch = range(X.shape[0])[batch_size*i:batch_size*(i+1)]
X_batch = X[index_batch,:].todense()
y_batch = y[index_batch,:]
yield(np.array(X_batch),y_batch)
model.fit_generator(generator=batch_generator(X_train_sparse, Y_train, batch_size),
nb_epoch=nb_epoch,
samples_per_epoch=X_train_sparse.shape[0])
The crash:
Exception Traceback (most recent call last)
<ipython-input-120-6722a4f77425> in <module>()
1 model.fit_generator(generator=batch_generator(X_trainSparse, Y_train, batch_size),
2 nb_epoch=nb_epoch,
----> 3 samples_per_epoch=X_trainSparse.shape[0])
/home/kk/miniconda2/envs/tensorflow/lib/python2.7/site-packages/keras/models.pyc in fit_generator(self, generator, samples_per_epoch, nb_epoch, verbose, callbacks, validation_data, nb_val_samples, class_weight, max_q_size, **kwargs)
648 nb_val_samples=nb_val_samples,
649 class_weight=class_weight,
--> 650 max_q_size=max_q_size)
651
652 def evaluate_generator(self, generator, val_samples, max_q_size=10, **kwargs):
/home/kk/miniconda2/envs/tensorflow/lib/python2.7/site-packages/keras/engine/training.pyc in fit_generator(self, generator, samples_per_epoch, nb_epoch, verbose, callbacks, validation_data, nb_val_samples, class_weight, max_q_size)
1356 raise Exception('output of generator should be a tuple '
1357 '(x, y, sample_weight) '
-> 1358 'or (x, y). Found: ' + str(generator_output))
1359 if len(generator_output) == 2:
1360 x, y = generator_output
Exception: output of generator should be a tuple (x, y, sample_weight) or (x, y). Found: None
I believe the problem is due to wrong setup of samples_per_epoch. I`d trully appreciate if someone could comment on this.
Here is my solution.
def batch_generator(X, y, batch_size):
number_of_batches = samples_per_epoch/batch_size
counter=0
shuffle_index = np.arange(np.shape(y)[0])
np.random.shuffle(shuffle_index)
X = X[shuffle_index, :]
y = y[shuffle_index]
while 1:
index_batch = shuffle_index[batch_size*counter:batch_size*(counter+1)]
X_batch = X[index_batch,:].todense()
y_batch = y[index_batch]
counter += 1
yield(np.array(X_batch),y_batch)
if (counter < number_of_batches):
np.random.shuffle(shuffle_index)
counter=0
In my case, X - sparse matrix, y - array.
If you can use Lasagne instead of Keras I've written a small MLP class with the following features:
supports both dense and sparse matrices
supports drop-out and hidden layer
Supports complete probability distribution instead of one-hot labels so supports multilabel training.
Supports scikit-learn like API (fit, predict, accuracy, etc.)
Is very easy to configure and modify

Resources