I'm new to pytorch and wrote a simple code as following to classify some inputs. The model input has 8*2 with batch size of 2 and the input layer in the model has 2 nodes. I don't know what is wrong!
X1=np.array([[2,1],[3,2],[-4,-1],[-1,-3],[2,-1],[3,-3],[-2,1],[-4,-2]])
Y1=np.array([0,0,0,0,1,1,1,1])
X=torch.tensor(X1)
Y=torch.tensor(Y1)
BATCH_SIZE=2
trainset= torch.utils.data.TensorDataset(X, Y)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE,
shuffle=True, num_workers=1)
from torch.nn.modules import flatten
learning_rate = 0.01
num_epochs = 20
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = MyModel()
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
## compute accuracy
def get_accuracy(logit, target, batch_size):
''' Obtain accuracy for training round '''
corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
accuracy = 100.0 * corrects/batch_size
return accuracy.item()
model = MyModel()
# Commented out IPython magic to ensure Python compatibility.
for epoch in range(num_epochs):
train_running_loss = 0.0
train_acc = 0.0
## training step
for inputs, labels in trainloader:
#inputs=torch.flatten(inputs)
inputs,labels=inputs.to(device), labels.to(device)
#inputs = inputs.to(device)
#labels = labels.to(device)
optimizer.zero_grad()
## forward + backprop + loss
print(inputs)
outputs = model.forward(inputs)
loss = criterion(outputs, labels)
loss.backward()
## update model params
optimizer.step()
train_running_loss += loss.detach().item()
train_acc += get_accuracy(outputs, labels, BATCH_SIZE)
#model.train()
model.eval()
print('Epoch: %d | Loss: %.4f | Train Accuracy: %.2f'%(epoch, train_running_loss / i, train_acc/i))
And my model is as below:
class MyModel(nn.Module):
def __init__(self):
super(MyModel, self).__init__()
self.d1 = nn.Linear(2,3)
self.d2 = nn.Linear(3,1)
self.init_weights()
def init_weights(self):
k1=torch.tensor([0.1,-0.72,0.94,-0.29,0.12,0.44])
k1=torch.unsqueeze(torch.unsqueeze(k1,0),0)
self.d1.weight.data=k1
k2=torch.tensor([1,-1.16,-0.26])
k2=torch.unsqueeze(torch.unsqueeze(k2,0),0)
self.d2.weight.data=k2
def forward(self, x):
x = self.d1(x)
x = F.tanh(x)
x = self.d2(x)
out = F.sigmoid(x)
return out
Then I got an error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-27-196d819d3ccd> in <module>
101 print(inputs)
102
--> 103 outputs = model.forward(inputs)
104 loss = criterion(outputs, labels)
105
2 frames
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/linear.py in forward(self, input)
112
113 def forward(self, input: Tensor) -> Tensor:
--> 114 return F.linear(input, self.weight, self.bias)
115
116 def extra_repr(self) -> str:
RuntimeError: t() expects a tensor with <= 2 dimensions, but self is 3D
I flatten the input but nothing changed. What should I do to fix it?
First of all, you don't need to invoke your model's forward pass by model.forward(x); using model(x) is good.
Second of all, what exactly are you trying to achieve via the init_weights method? You're unsqueezing k1 and k2 twice, giving them the shape of (1, 1, x) which is 3D which is what the error is telling you. torch.nn.Linear performs a matrix multiplication with a 2D matrix, so you can't use a 3D one. torch.nn.Linear already initializes the weights via Kaiming initialization [1] so I'm not sure what you're trying to achieve here.
Changing the init_weights method to:
def init_weights(self):
k1 = torch.tensor([0.1, -0.72, 0.94, -0.29, 0.12, 0.44])
k1 = k1.reshape(self.d1.weight.shape)
self.d1.weight.data = k1
k2 = torch.tensor([1, -1.16, -0.26])
k2 = k2.reshape(self.d2.weight.shape)
self.d2.weight.data = k2
and changing the type of inputs from Long to Float (i.e., model(inputs.float())) should solve your problem.
References
[1] https://github.com/pytorch/pytorch/blob/0dceaf07cd1236859953b6f85a61dc4411d10f87/torch/nn/modules/linear.py#L103
Related
I am attempting to train EEG data through a transformer network. The input dimensions are 50x16684x60 (seq x batch x features) and the output is 16684x2. Right now I am simply trying to run a basic transformer, and I keep getting an error telling me
RuntimeError: the feature number of src and tgt must be equal to d_model
Why would the source and target feature number ever be equal? Is it possible to run such a dataset through a transformer?
Here is my basic model:
input_size = 60 # seq x batch x features
hidden_size = 32
num_classes = 2
learning_rate = 0.001
batch_size = 64
num_epochs = 2
sequence_length = 50
num_layers = 2
dropout = 0.5
class Transformer(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(Transformer, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.transformer = nn.Transformer(60, 2)
self.fc = nn.Linear(hidden_size * sequence_length, num_classes)
def forward(self, x, y):
# Forward Propogation
out, _ = self.transformer(x,y)
out = out.reshape(out.shape[0], -1)
out = self.fc(out)
return out
model = Transformer(input_size, hidden_size, num_layers, num_classes)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
for index in tqdm(range(16684)):
X, y = (X_train[index], Y_train[index])
print(X.shape, y.shape)
output = model(X, y)
loss = criterion(output, y)
model.zero_grad()
loss.backward()
optimizer.step()
if index % 500 == 0:
print(f"Epoch {epoch}, Batch: {index}, Loss: {loss}")
You train the model to find some features by feeding it the input sequence and desired sequence. The backprop trains the net by computing the loss as a "difference" between src and target features.
If the features sizes aren't the same - the backprop can't find the accordance to some desired feature and the model can't be trained.
I want to run some neural net experiments with PyTorch, but a minimal test case is giving wrong answers. The test case sets up a simple neural network with two input variables and an output variable that is just the sum of the inputs, and tries learning it as a regression problem; I expect it to converge on zero mean squared error, but it actually converges on 0.165. It's probably because of the issue alluded to in the warning message; how can I fix it?
Code:
import torch
import torch.nn as nn
# data
Xs = []
ys = []
n = 10
for i in range(n):
i1 = i / n
for j in range(n):
j1 = j / n
Xs.append([i1, j1])
ys.append(i1 + j1)
# torch tensors
X_tensor = torch.tensor(Xs)
y_tensor = torch.tensor(ys)
# hyperparameters
in_features = len(Xs[0])
hidden_size = 100
out_features = 1
epochs = 500
# model
class Net(nn.Module):
def __init__(self, hidden_size):
super(Net, self).__init__()
self.L0 = nn.Linear(in_features, hidden_size)
self.N0 = nn.ReLU()
self.L1 = nn.Linear(hidden_size, 1)
def forward(self, x):
x = self.L0(x)
x = self.N0(x)
x = self.L1(x)
return x
model = Net(hidden_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
# train
print("training")
for epoch in range(1, epochs + 1):
# forward
output = model(X_tensor)
cost = criterion(output, y_tensor)
# backward
optimizer.zero_grad()
cost.backward()
optimizer.step()
# print progress
if epoch % (epochs // 10) == 0:
print(f"{epoch:6d} {cost.item():10f}")
print()
output = model(X_tensor)
cost = criterion(output, y_tensor)
print("mean squared error:", cost.item())
Output:
training
C:\Users\russe\Anaconda3\envs\torch2\lib\site-packages\torch\nn\modules\loss.py:445: UserWarning: Using a target size (torch.Size([100])) that is different to the input size (torch.Size([100, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.mse_loss(input, target, reduction=self.reduction)
50 0.167574
100 0.165108
150 0.165070
200 0.165052
250 0.165039
300 0.165028
350 0.165020
400 0.165013
450 0.165009
500 0.165006
mean squared error: 0.1650056540966034
And the message:
UserWarning: Using a target size (torch.Size([100])) that is different to the input size (torch.Size([100, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
You're going to be a bit more specific on which tensors (X, or Y), but we can can reshape our tensors by using the torch.view function.
For example:
Y_tensor = torch.tensor(Ys)
print(Y_tensor.shape)
>> torch.Size([5])
new_shape = (len(Ys), 1)
Y_tensor = Y_tensor.view(new_shape)
print(Y_tensor.shape)
>> torch.Size([5, 1])
However, I'm skeptical that this broadcasting behavior is why you're having accuracy issues.
I am trying to implement Bayesian CNN using Mc Dropout on Pytorch, the main idea is that by applying dropout at test time and running over many forward passes, you get predictions from a variety of different models. I need to obtain the uncertainty, does anyone have an idea of how I can do it Please
This is how I defined my CNN
'''
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(3, 6, 5)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(16 * 5 * 5, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
self.dropout = nn.Dropout(p=0.3)
nn.init.xavier_uniform_(self.conv1.weight)
nn.init.constant_(self.conv1.bias, 0.0)
nn.init.xavier_uniform_(self.conv2.weight)
nn.init.constant_(self.conv2.bias, 0.0)
nn.init.xavier_uniform_(self.fc1.weight)
nn.init.constant_(self.fc1.bias, 0.0)
nn.init.xavier_uniform_(self.fc2.weight)
nn.init.constant_(self.fc2.bias, 0.0)
nn.init.xavier_uniform_(self.fc3.weight)
nn.init.constant_(self.fc3.bias, 0.0)
def forward(self, x):
x = self.pool(F.relu(self.dropout(self.conv1(x)))) # recommended to add the relu
x = self.pool(F.relu(self.dropout(self.conv2(x)))) # recommended to add the relu
x = x.view(-1, 16 * 5 * 5)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(self.dropout(x)))
x = self.fc3(self.dropout(x)) # no activation function needed for the last layer
return x
model = Net().to(device)
train_accuracies=np.zeros(num_epochs)
test_accuracies=np.zeros(num_epochs)
dataiter = iter(trainloader)
images, labels = dataiter.next()
#initializing variables
loss_acc = []
class_acc_mcdo = []
start_train = True
#Defining the Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
def train():
loss_vals = []
acc_vals = []
for epoch in range(num_epochs): # loop over the dataset multiple times
n_correct = 0 # initialize number of correct predictions
acc = 0 # initialize accuracy of each epoch
somme = 0 # initialize somme of losses of each epoch
epoch_loss = []
for i, (images, labels) in enumerate(trainloader):
# origin shape: [4, 3, 32, 32] = 4, 3, 1024
# input_layer: 3 input channels, 6 output channels, 5 kernel size
images = images.to(device)
labels = labels.to(device)
# Forward pass
outputs = model.train()(images)
loss = criterion(outputs, labels)
# Backward and optimize
optimizer.zero_grad() # zero the parameter gradients
loss.backward()
epoch_loss.append(loss.item()) # add the loss to epoch_loss list
optimizer.step()
# max returns (value ,index)
_, predicted = torch.max(outputs, 1)
n_correct += (predicted == labels).sum().item()
# print statistics
if (i + 1) % 2000 == 0:
print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{n_total_steps}], Loss:
{loss.item():.4f}')
somme = (sum(epoch_loss)) / len(epoch_loss)
loss_vals.append(somme) # add the epoch's loss to loss_vals
print("Loss = {}".format(somme))
acc = 100 * n_correct / len(trainset)
acc_vals.append(acc) # add the epoch's Accuracy to acc_vals
print("Accuracy = {}".format(acc))
# SAVE
PATH = './cnn.pth'
torch.save(model.state_dict(), PATH)
loss_acc.append(loss_vals)
loss_acc.append(acc_vals)
return loss_acc
And here is the code of the mc dropout
'''
def enable_dropout(model):
""" Function to enable the dropout layers during test-time """
for m in model.modules():
if m.__class__.__name__.startswith('Dropout'):
m.train()
def test():
# set non-dropout layers to eval mode
model.eval()
# set dropout layers to train mode
enable_dropout(model)
test_loss = 0
correct = 0
n_samples = 0
n_class_correct = [0 for i in range(10)]
n_class_samples = [0 for i in range(10)]
T = 100
for images, labels in testloader:
images = images.to(device)
labels = labels.to(device)
with torch.no_grad():
output_list = []
# getting outputs for T forward passes
for i in range(T):
output_list.append(torch.unsqueeze(model(images), 0))
# calculating mean
output_mean = torch.cat(output_list, 0).mean(0)
test_loss += F.nll_loss(F.log_softmax(output_mean, dim=1), labels,
reduction='sum').data # sum up batch loss
_, predicted = torch.max(output_mean, 1) # get the index of the max log-probability
correct += (predicted == labels).sum().item() # sum up correct predictions
n_samples += labels.size(0)
for i in range(batch_size):
label = labels[i]
predi = predicted[i]
if (label == predi):
n_class_correct[label] += 1
n_class_samples[label] += 1
test_loss /= len(testloader.dataset)
# PRINT TO HTML PAGE
print('\n Average loss: {:.4f}, Accuracy: ({:.3f}%)\n'.format(
test_loss,
100. * correct / n_samples))
# Accuracy for each class
acc_classes = []
for i in range(10):
acc = 100.0 * n_class_correct[i] / n_class_samples[i]
print(f'Accuracy of {classes[i]}: {acc} %')
acc_classes.append(acc)
class_acc_mcdo.extend(acc_classes)
print('Finished Testing')
You can compute the statistics, such as the sample mean or the sample variance, of different stochastic forward passes at test time (i.e. with the test or validation data), when the dropout is enabled. These statistics can be used to represent uncertainty. For example, you can compute the entropy, which is a measure of uncertainty, from the sample mean.
I am new to pytorch. The following is the basic example of using nn module to train a simple one-layer model with some random data (from here)
import torch
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
model = torch.nn.Sequential(
torch.nn.Linear(D_in, H),
torch.nn.ReLU(),
torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
for t in range(500):
y_pred = model(x)
loss = loss_fn(y_pred, y)
print(t, loss.item())
optimizer.zero_grad()
loss.backward()
optimizer.step()
As far as I understand, the batch size is equal to 1 in the example, in other words, a single point (out of 64) is used to calculate gradients and update parameters. My question is: how to modify this example to train the model with the batch size greater than one?
In fact N is the batch size. So you just need to modify N currently its set to 64. So you have in every training batch 64 vectors with size / dim D_in.
I checked the link you posted, you can also take a look at the comments - there is some explanation too :)
# -*- coding: utf-8 -*-
import numpy as np
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)
# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)
learning_rate = 1e-6
for t in range(500):
# Forward pass: compute predicted y
h = x.dot(w1)
h_relu = np.maximum(h, 0)
y_pred = h_relu.dot(w2)
# Compute and print loss
loss = np.square(y_pred - y).sum()
print(t, loss)
# Backprop to compute gradients of w1 and w2 with respect to loss
grad_y_pred = 2.0 * (y_pred - y)
grad_w2 = h_relu.T.dot(grad_y_pred)
grad_h_relu = grad_y_pred.dot(w2.T)
grad_h = grad_h_relu.copy()
grad_h[h < 0] = 0
grad_w1 = x.T.dot(grad_h)
# Update weights
w1 -= learning_rate * grad_w1
w2 -= learning_rate * grad_w2
To include batch size in PyTorch basic examples, the easiest and cleanest way is to use PyTorch torch.utils.data.DataLoader and torch.utils.data.TensorDataset.
Dataset stores the samples and their corresponding labels, and DataLoader wraps an iterable around the Dataset to enable easy access to the samples.
DataLoader will take care of creating batches for you.
Building on your question, there is a complete code snippet, where we iterate over a dataset of 10000 examples for 2 epochs with a batch size of 64:
import torch
from torch.utils.data import DataLoader, TensorDataset
# Create the dataset with N_SAMPLES samples
N_SAMPLES, D_in, H, D_out = 10000, 1000, 100, 10
x = torch.randn(N_SAMPLES, D_in)
y = torch.randn(N_SAMPLES, D_out)
# Define the batch size and the number of epochs
BATCH_SIZE = 64
N_EPOCHS = 2
# Use torch.utils.data to create a DataLoader
# that will take care of creating batches
dataset = TensorDataset(x, y)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
# Define model, loss and optimizer
model = torch.nn.Sequential(
torch.nn.Linear(D_in, H),
torch.nn.ReLU(),
torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# Get the dataset size for printing (it is equal to N_SAMPLES)
dataset_size = len(dataloader.dataset)
# Loop over epochs
for epoch in range(N_EPOCHS):
print(f"Epoch {epoch + 1}\n-------------------------------")
# Loop over batches in an epoch using DataLoader
for id_batch, (x_batch, y_batch) in enumerate(dataloader):
y_batch_pred = model(x_batch)
loss = loss_fn(y_batch_pred, y_batch)
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Every 100 batches, print the loss for this batch
# as well as the number of examples processed so far
if id_batch % 100 == 0:
loss, current = loss.item(), (id_batch + 1)* len(x_batch)
print(f"loss: {loss:>7f} [{current:>5d}/{dataset_size:>5d}]")
The output should be something like:
Epoch 1
-------------------------------
loss: 643.433716 [ 64/10000]
loss: 648.195435 [ 6464/10000]
Epoch 2
-------------------------------
loss: 613.619873 [ 64/10000]
loss: 625.018555 [ 6464/10000]
I was reading the original paper on BN and the stack overflow question on How could I use Batch Normalization in TensorFlow? which provides a very useful piece of code to insert a batch normalization block to a Neural Network but does not provides enough guidance on how to actually use it during training, inference and when evaluating models.
For example, I would like to track the train error during training and test error to make sure I don't overfit. Its clear that the batch normalization block should be off during test, but when evaluating the error on the training set, should the batch normalization block be turned off too? My main questions are:
During inference and error evaluation, should the batch normalization block be turned off regardless of the data set?
Does that mean that the batch normalization block should only be on during the training step then?
To make it very clear, I will provide an extract (of simplified) code I have been using to run batch normalization with Tensor flow according to what is my understanding of what is the right thing to do:
## TRAIN
if phase_train is not None:
#DO BN
feed_dict_train = {x:X_train, y_:Y_train, phase_train: False}
feed_dict_cv = {x:X_cv, y_:Y_cv, phase_train: False}
feed_dict_test = {x:X_test, y_:Y_test, phase_train: False}
else:
#Don't do BN
feed_dict_train = {x:X_train, y_:Y_train}
feed_dict_cv = {x:X_cv, y_:Y_cv}
feed_dict_test = {x:X_test, y_:Y_test}
def get_batch_feed(X, Y, M, phase_train):
mini_batch_indices = np.random.randint(M,size=M)
Xminibatch = X[mini_batch_indices,:] # ( M x D^(0) )
Yminibatch = Y[mini_batch_indices,:] # ( M x D^(L) )
if phase_train is not None:
#DO BN
feed_dict = {x: Xminibatch, y_: Yminibatch, phase_train: True}
else:
#Don't do BN
feed_dict = {x: Xminibatch, y_: Yminibatch}
return feed_dict
with tf.Session() as sess:
sess.run( tf.initialize_all_variables() )
for iter_step in xrange(steps):
feed_dict_batch = get_batch_feed(X_train, Y_train, M, phase_train)
# Collect model statistics
if iter_step%report_error_freq == 0:
train_error = sess.run(fetches=l2_loss, feed_dict=feed_dict_train)
cv_error = sess.run(fetches=l2_loss, feed_dict=feed_dict_cv)
test_error = sess.run(fetches=l2_loss, feed_dict=feed_dict_test)
do_stuff_with_errors(train_error, cv_error, test_error)
# Run Train Step
sess.run(fetches=train_step, feed_dict=feed_dict_batch)
and the code I am using to produce batch normalization blocks is:
def standard_batch_norm(l, x, n_out, phase_train, scope='BN'):
"""
Batch normalization on feedforward maps.
Args:
x: Vector
n_out: integer, depth of input maps
phase_train: boolean tf.Varialbe, true indicates training phase
scope: string, variable scope
Return:
normed: batch-normalized maps
"""
with tf.variable_scope(scope+l):
#beta = tf.Variable(tf.constant(0.0, shape=[n_out], dtype=tf.float64 ), name='beta', trainable=True, dtype=tf.float64 )
#gamma = tf.Variable(tf.constant(1.0, shape=[n_out],dtype=tf.float64 ), name='gamma', trainable=True, dtype=tf.float64 )
init_beta = tf.constant(0.0, shape=[n_out], dtype=tf.float64)
init_gamma = tf.constant(1.0, shape=[n_out],dtype=tf.float64)
beta = tf.get_variable(name='beta'+l, dtype=tf.float64, initializer=init_beta, regularizer=None, trainable=True)
gamma = tf.get_variable(name='gamma'+l, dtype=tf.float64, initializer=init_gamma, regularizer=None, trainable=True)
batch_mean, batch_var = tf.nn.moments(x, [0], name='moments')
ema = tf.train.ExponentialMovingAverage(decay=0.5)
def mean_var_with_update():
ema_apply_op = ema.apply([batch_mean, batch_var])
with tf.control_dependencies([ema_apply_op]):
return tf.identity(batch_mean), tf.identity(batch_var)
mean, var = tf.cond(phase_train, mean_var_with_update, lambda: (ema.average(batch_mean), ema.average(batch_var)))
normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-3)
return normed
I found that there is 'official' batch_norm layer in tensorflow. Try it out:
https://github.com/tensorflow/tensorflow/blob/b826b79718e3e93148c3545e7aa3f90891744cc0/tensorflow/contrib/layers/python/layers/layers.py#L100
Most likely it is not mentioned in docs since it included in some RC or 'beta' version only.
I haven't inspected deep into this matter yet, but as far as I see from documentation you just use binary parameter is_training in this batch_norm layer, and set it to true only for training phase. Try it out.
UPDATE: Below is the code to load data, build a network with one hidden ReLU layer and L2 normalization and introduce batch normalization for both hidden and out layer. This runs fine and trains fine.
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
pickle_file = '/home/maxkhk/Documents/Udacity/DeepLearningCourse/SourceCode/tensorflow/examples/udacity/notMNIST.pickle'
with open(pickle_file, 'rb') as f:
save = pickle.load(f)
train_dataset = save['train_dataset']
train_labels = save['train_labels']
valid_dataset = save['valid_dataset']
valid_labels = save['valid_labels']
test_dataset = save['test_dataset']
test_labels = save['test_labels']
del save # hint to help gc free up memory
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
image_size = 28
num_labels = 10
def reformat(dataset, labels):
dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
# Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
/ predictions.shape[0])
#for NeuralNetwork model code is below
#We will use SGD for training to save our time. Code is from Assignment 2
#beta is the new parameter - controls level of regularization.
#Feel free to play with it - the best one I found is 0.001
#notice, we introduce L2 for both biases and weights of all layers
batch_size = 128
beta = 0.001
#building tensorflow graph
graph = tf.Graph()
with graph.as_default():
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32,
shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
#introduce batchnorm
tf_train_dataset_bn = tf.contrib.layers.batch_norm(tf_train_dataset)
#now let's build our new hidden layer
#that's how many hidden neurons we want
num_hidden_neurons = 1024
#its weights
hidden_weights = tf.Variable(
tf.truncated_normal([image_size * image_size, num_hidden_neurons]))
hidden_biases = tf.Variable(tf.zeros([num_hidden_neurons]))
#now the layer itself. It multiplies data by weights, adds biases
#and takes ReLU over result
hidden_layer = tf.nn.relu(tf.matmul(tf_train_dataset_bn, hidden_weights) + hidden_biases)
#adding the batch normalization layerhi()
hidden_layer_bn = tf.contrib.layers.batch_norm(hidden_layer)
#time to go for output linear layer
#out weights connect hidden neurons to output labels
#biases are added to output labels
out_weights = tf.Variable(
tf.truncated_normal([num_hidden_neurons, num_labels]))
out_biases = tf.Variable(tf.zeros([num_labels]))
#compute output
out_layer = tf.matmul(hidden_layer_bn,out_weights) + out_biases
#our real output is a softmax of prior result
#and we also compute its cross-entropy to get our loss
#Notice - we introduce our L2 here
loss = (tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
out_layer, tf_train_labels) +
beta*tf.nn.l2_loss(hidden_weights) +
beta*tf.nn.l2_loss(hidden_biases) +
beta*tf.nn.l2_loss(out_weights) +
beta*tf.nn.l2_loss(out_biases)))
#now we just minimize this loss to actually train the network
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
#nice, now let's calculate the predictions on each dataset for evaluating the
#performance so far
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(out_layer)
valid_relu = tf.nn.relu( tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)
valid_prediction = tf.nn.softmax( tf.matmul(valid_relu, out_weights) + out_biases)
test_relu = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights) + hidden_biases)
test_prediction = tf.nn.softmax(tf.matmul(test_relu, out_weights) + out_biases)
#now is the actual training on the ANN we built
#we will run it for some number of steps and evaluate the progress after
#every 500 steps
#number of steps we will train our ANN
num_steps = 3001
#actual training
with tf.Session(graph=graph) as session:
tf.initialize_all_variables().run()
print("Initialized")
for step in range(num_steps):
# Pick an offset within the training data, which has been randomized.
# Note: we could use better randomization across epochs.
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
# Generate a minibatch.
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
# Prepare a dictionary telling the session where to feed the minibatch.
# The key of the dictionary is the placeholder node of the graph to be fed,
# and the value is the numpy array to feed to it.
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))