Loss Not Decreasing pytorch - machine-learning

I am making a simple CNN without a pooling layer and without an optimizer (not allowed since it's a college assignment). I am using the SVHN dataset. I am using cross-entropy loss, but the loss keeps jumping between 2.305 and 2.306 over 10 epochs. Kindly help.
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        # self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64*32*32, 10)
        )

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.fc1(x)
        return x

net = Net()
criterion = nn.CrossEntropyLoss()

for epoch in range(10):  # loop over the dataset multiple times
    net.train()
    batch_loss_val = 0
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        # forward + backward
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        # optimizer.step()
        # print statistics
        running_loss += loss.item()
        if i % 50 == 0:  # print every 50 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 50 :.3f}')
            running_loss = 0.0
print('Finished Training')

Your weights are never updated: loss.backward() only computes the gradients, it does not apply them. Normally optimizer.step() would do that, but since an optimizer is not allowed for this assignment, you have to update the parameters manually after each backward pass (and also zero the gradients every iteration, otherwise they accumulate).
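A minimal sketch of such a manual gradient-descent update is shown below (the learning rate of 0.01 is an assumption, not from the original post; tune it for your setup):

lr = 0.01  # assumed learning rate

for epoch in range(10):
    net.train()
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader):
        net.zero_grad()                      # clear gradients left over from the previous step
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()                      # compute gradients only
        with torch.no_grad():                # manual SGD update in place of optimizer.step()
            for param in net.parameters():
                param -= lr * param.grad
        running_loss += loss.item()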

Related

Accuracy value goes up and down on the training process

After training the network I noticed that the accuracy goes up and down. Initially I thought it was caused by the learning rate, but it is set to quite a small value. Please check the attached screenshot.
[Screenshot: plot of training accuracy over time]
My network (in PyTorch) looks as follows:
class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.layer3 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.fc1 = nn.Linear(17*17*64, 512)
        self.fc2 = nn.Linear(512, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = out.view(out.size(0), -1)
        out = self.relu(self.fc1(out))
        out = self.fc2(out)
        out = torch.sigmoid(out)
        return out
I am using RMSprop as the optimizer and BCELoss as the criterion. The learning rate is set to 0.001.
Here is the training process:
epochs = 15
itr = 1
p_itr = 100
model.train()
total_loss = 0
loss_list = []
acc_list = []

for epoch in range(epochs):
    for samples, labels in train_loader:
        samples, labels = samples.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(samples)
        labels = labels.unsqueeze(-1)
        labels = labels.float()
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        scheduler.step()
        if itr % p_itr == 0:
            pred = torch.argmax(output, dim=1)
            correct = pred.eq(labels)
            acc = torch.mean(correct.float())
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, acc))
            loss_list.append(total_loss/p_itr)
            acc_list.append(acc)
            total_loss = 0
        itr += 1
My dataset is quite small - 2000 train and 1000 validation (binary classification 0/1). I wanted to do the 80/20 split but I was asked to keep it like that. I was thinking that the architecture might be too complex for such a small dataset.
Any hints on what may cause such jumps in the training process?
Your code here is wrong: pred = torch.argmax(output, dim=1)
That line is for multiclass classification with cross-entropy loss.
Your task is binary classification, so the pred values are wrong. Change it to:
if itr % p_itr == 0:
    pred = torch.round(output)
    ....
You can also try changing your optimizer (Adam, SGD, or RMSprop) to find the one that helps your model converge faster.
Also change the forward() function:
def forward(self, x):
    out = self.layer1(x)
    out = self.layer2(out)
    out = self.layer3(out)
    out = out.view(out.size(0), -1)
    out = self.relu(self.fc1(out))
    out = self.fc2(out)
    return self.sigmoid(out)  # your original forward works too, but this is cleaner
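For completeness, a minimal sketch of the corrected per-batch accuracy computation for this binary setup (assuming output is the sigmoid probability from the model and labels has already been unsqueezed and cast to float, as in the training loop above):

# inside the training loop, after output = model(samples)
pred = torch.round(output)              # probability >= 0.5 -> class 1, otherwise class 0
correct = pred.eq(labels)               # element-wise comparison with the float labels
acc = correct.float().mean().item()     # fraction of correct predictions in this batch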

tgt and src have to have equal features for a Transformer Network in Pytorch

I am attempting to train EEG data through a transformer network. The input dimensions are 50x16684x60 (seq x batch x features) and the output is 16684x2. Right now I am simply trying to run a basic transformer, and I keep getting an error telling me
RuntimeError: the feature number of src and tgt must be equal to d_model
Why would the source and target feature numbers ever need to be equal? Is it possible to run such a dataset through a transformer?
Here is my basic model:
input_size = 60  # seq x batch x features
hidden_size = 32
num_classes = 2
learning_rate = 0.001
batch_size = 64
num_epochs = 2
sequence_length = 50
num_layers = 2
dropout = 0.5

class Transformer(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(Transformer, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.transformer = nn.Transformer(60, 2)
        self.fc = nn.Linear(hidden_size * sequence_length, num_classes)

    def forward(self, x, y):
        # Forward propagation
        out, _ = self.transformer(x, y)
        out = out.reshape(out.shape[0], -1)
        out = self.fc(out)
        return out
model = Transformer(input_size, hidden_size, num_layers, num_classes)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    for index in tqdm(range(16684)):
        X, y = (X_train[index], Y_train[index])
        print(X.shape, y.shape)
        output = model(X, y)
        loss = criterion(output, y)
        model.zero_grad()
        loss.backward()
        optimizer.step()
        if index % 500 == 0:
            print(f"Epoch {epoch}, Batch: {index}, Loss: {loss}")
nn.Transformer is an encoder-decoder model: the encoder consumes src and the decoder consumes tgt while attending over the encoder output, so both sequences must live in the same embedding space. That is why the feature size of src and tgt must both equal d_model (60 in your constructor); your target has only 2 features, hence the error.
If the feature sizes don't match, the decoder has no common space to attend over and the model cannot be trained as constructed. For a classification task like this you would either project the target into the d_model space, or skip the decoder entirely and use only the encoder.
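A minimal sketch of the encoder-only approach under that assumption (class and variable names are illustrative; d_model=60 matches the EEG feature size and must be divisible by the number of attention heads):

import torch
import torch.nn as nn

class EEGClassifier(nn.Module):
    def __init__(self, n_features=60, seq_len=50, n_classes=2, n_heads=4, n_layers=2):
        super().__init__()
        encoder_layer = nn.TransformerEncoderLayer(d_model=n_features, nhead=n_heads)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.fc = nn.Linear(n_features * seq_len, n_classes)

    def forward(self, x):                   # x: (seq, batch, features)
        out = self.encoder(x)               # (seq, batch, features)
        out = out.permute(1, 0, 2)          # (batch, seq, features)
        out = out.reshape(out.size(0), -1)  # flatten the sequence dimension
        return self.fc(out)                 # (batch, n_classes) logits

model = EEGClassifier()
x = torch.randn(50, 8, 60)                  # seq x batch x features
logits = model(x)                           # shape (8, 2), usable with nn.CrossEntropyLoss

With this setup the labels can stay as class indices, and nn.CrossEntropyLoss would replace MSELoss.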

Why does the same PyTorch code (different implementation) give different loss?

I was tackling the Fashion MNIST data-set problem on Udacity. However, my implementation gives a drastically different loss compared to the solution shared by the Udacity team. I believe the only difference is the definition of the neural network; apart from that, everything is the same. I am not able to figure out the reason for such a drastic difference in loss.
Code 1: My solution:
import torch.nn as nn
from torch import optim

images, labels = next(iter(trainloader))

model = nn.Sequential(nn.Linear(784, 256),
                      nn.ReLU(),
                      nn.Linear(256, 128),
                      nn.ReLU(),
                      nn.Linear(128, 64),
                      nn.ReLU(),
                      nn.Linear(64, 10),
                      nn.LogSoftmax(dim=1))

# Flatten images
optimizer = optim.Adam(model.parameters(), lr=0.003)
criterion = nn.NLLLoss()

for i in range(10):
    running_loss = 0
    for images, labels in trainloader:
        images = images.view(images.shape[0], -1)
        output = model.forward(images)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    else:
        print(f"Training loss: {running_loss}")

# Loss is coming around 4000
Code 2: Official Solution:
from torch import nn, optim
import torch.nn.functional as F

class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 10)

    def forward(self, x):
        # make sure input tensor is flattened
        x = x.view(x.shape[0], -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.log_softmax(self.fc4(x), dim=1)
        return x

model = Classifier()
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

epochs = 5
for e in range(epochs):
    running_loss = 0
    for images, labels in trainloader:
        log_ps = model(images)
        loss = criterion(log_ps, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    else:
        print(f"Training loss: {running_loss}")

# Loss is coming around 200
Is there any explanation for the vast difference in loss ?
You forgot to zero out/clear the gradients in your implementation. That is, you are missing:
optimizer.zero_grad()
Without it, the gradients from every batch keep accumulating in the parameters' .grad buffers, so each optimizer.step() applies the sum of all gradients computed so far, which is why your running loss ends up so much larger than the reference solution's. In other words, simply do:
for i in range(10):
    running_loss = 0
    for images, labels in trainloader:
        images = images.view(images.shape[0], -1)
        output = model.forward(images)
        loss = criterion(output, labels)
        # missed this!
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    else:
        print(f"Training loss: {running_loss}")
and you are good to go!

How to compute the uncertainty of a Monte Carlo Dropout neural network with PyTorch?

I am trying to implement a Bayesian CNN using MC Dropout in PyTorch. The main idea is that by applying dropout at test time and running many forward passes, you get predictions from a variety of different models. I need to obtain the uncertainty; does anyone have an idea of how I can do it, please?
This is how I defined my CNN
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        self.dropout = nn.Dropout(p=0.3)

        nn.init.xavier_uniform_(self.conv1.weight)
        nn.init.constant_(self.conv1.bias, 0.0)
        nn.init.xavier_uniform_(self.conv2.weight)
        nn.init.constant_(self.conv2.bias, 0.0)
        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.constant_(self.fc1.bias, 0.0)
        nn.init.xavier_uniform_(self.fc2.weight)
        nn.init.constant_(self.fc2.bias, 0.0)
        nn.init.xavier_uniform_(self.fc3.weight)
        nn.init.constant_(self.fc3.bias, 0.0)

    def forward(self, x):
        x = self.pool(F.relu(self.dropout(self.conv1(x))))  # recommended to add the relu
        x = self.pool(F.relu(self.dropout(self.conv2(x))))  # recommended to add the relu
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(self.dropout(x)))
        x = self.fc3(self.dropout(x))  # no activation function needed for the last layer
        return x
model = Net().to(device)

train_accuracies = np.zeros(num_epochs)
test_accuracies = np.zeros(num_epochs)

dataiter = iter(trainloader)
images, labels = dataiter.next()

# initializing variables
loss_acc = []
class_acc_mcdo = []
start_train = True

# Defining the Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

def train():
    loss_vals = []
    acc_vals = []
    for epoch in range(num_epochs):  # loop over the dataset multiple times
        n_correct = 0  # initialize number of correct predictions
        acc = 0  # initialize accuracy of each epoch
        somme = 0  # initialize sum of losses of each epoch
        epoch_loss = []
        for i, (images, labels) in enumerate(trainloader):
            # origin shape: [4, 3, 32, 32] = 4, 3, 1024
            # input_layer: 3 input channels, 6 output channels, 5 kernel size
            images = images.to(device)
            labels = labels.to(device)
            # Forward pass
            outputs = model.train()(images)
            loss = criterion(outputs, labels)
            # Backward and optimize
            optimizer.zero_grad()  # zero the parameter gradients
            loss.backward()
            epoch_loss.append(loss.item())  # add the loss to epoch_loss list
            optimizer.step()
            # max returns (value, index)
            _, predicted = torch.max(outputs, 1)
            n_correct += (predicted == labels).sum().item()
            # print statistics
            if (i + 1) % 2000 == 0:
                print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{n_total_steps}], Loss: {loss.item():.4f}')
        somme = (sum(epoch_loss)) / len(epoch_loss)
        loss_vals.append(somme)  # add the epoch's loss to loss_vals
        print("Loss = {}".format(somme))
        acc = 100 * n_correct / len(trainset)
        acc_vals.append(acc)  # add the epoch's accuracy to acc_vals
        print("Accuracy = {}".format(acc))

    # SAVE
    PATH = './cnn.pth'
    torch.save(model.state_dict(), PATH)

    loss_acc.append(loss_vals)
    loss_acc.append(acc_vals)
    return loss_acc
And here is the code of the MC dropout:
def enable_dropout(model):
    """ Function to enable the dropout layers during test-time """
    for m in model.modules():
        if m.__class__.__name__.startswith('Dropout'):
            m.train()

def test():
    # set non-dropout layers to eval mode
    model.eval()
    # set dropout layers to train mode
    enable_dropout(model)

    test_loss = 0
    correct = 0
    n_samples = 0
    n_class_correct = [0 for i in range(10)]
    n_class_samples = [0 for i in range(10)]
    T = 100

    for images, labels in testloader:
        images = images.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            output_list = []
            # getting outputs for T forward passes
            for i in range(T):
                output_list.append(torch.unsqueeze(model(images), 0))

        # calculating mean
        output_mean = torch.cat(output_list, 0).mean(0)

        test_loss += F.nll_loss(F.log_softmax(output_mean, dim=1), labels,
                                reduction='sum').data  # sum up batch loss
        _, predicted = torch.max(output_mean, 1)  # get the index of the max log-probability
        correct += (predicted == labels).sum().item()  # sum up correct predictions
        n_samples += labels.size(0)

        for i in range(batch_size):
            label = labels[i]
            predi = predicted[i]
            if label == predi:
                n_class_correct[label] += 1
            n_class_samples[label] += 1

    test_loss /= len(testloader.dataset)

    # PRINT TO HTML PAGE
    print('\n Average loss: {:.4f}, Accuracy: ({:.3f}%)\n'.format(
        test_loss,
        100. * correct / n_samples))

    # Accuracy for each class
    acc_classes = []
    for i in range(10):
        acc = 100.0 * n_class_correct[i] / n_class_samples[i]
        print(f'Accuracy of {classes[i]}: {acc} %')
        acc_classes.append(acc)

    class_acc_mcdo.extend(acc_classes)
    print('Finished Testing')
You can compute the statistics, such as the sample mean or the sample variance, of different stochastic forward passes at test time (i.e. with the test or validation data), when the dropout is enabled. These statistics can be used to represent uncertainty. For example, you can compute the entropy, which is a measure of uncertainty, from the sample mean.
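As a concrete illustration, here is a minimal sketch of how those statistics could be computed from the T passes collected in test() above (assuming output_list holds raw logits, as in that function; the variable names are illustrative):

import torch
import torch.nn.functional as F

# outputs: (T, batch_size, n_classes) stacked stochastic forward passes
outputs = torch.cat(output_list, 0)
probs = F.softmax(outputs, dim=2)       # per-pass class probabilities

mean_probs = probs.mean(dim=0)          # predictive mean, shape (batch_size, n_classes)
var_probs = probs.var(dim=0)            # per-class variance across passes, one uncertainty measure

# predictive entropy of the averaged distribution, shape (batch_size,)
entropy = -(mean_probs * torch.log(mean_probs + 1e-12)).sum(dim=1)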

How to include batch size in pytorch basic example?

I am new to PyTorch. The following is the basic example of using the nn module to train a simple model with one hidden layer on some random data (from here):
import torch

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for t in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
As far as I understand, the batch size is equal to 1 in the example; in other words, a single point (out of 64) is used to calculate gradients and update parameters. My question is: how do I modify this example to train the model with a batch size greater than one?
In fact, N is the batch size, so you just need to modify N; currently it is set to 64. So every training batch contains 64 vectors of size/dim D_in.
I checked the link you posted; you can also take a look at the comments there - there is some explanation too :)
# -*- coding: utf-8 -*-
import numpy as np

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
To include batch size in PyTorch basic examples, the easiest and cleanest way is to use PyTorch torch.utils.data.DataLoader and torch.utils.data.TensorDataset.
Dataset stores the samples and their corresponding labels, and DataLoader wraps an iterable around the Dataset to enable easy access to the samples.
DataLoader will take care of creating batches for you.
Building on your question, here is a complete code snippet where we iterate over a dataset of 10000 examples for 2 epochs with a batch size of 64:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Create the dataset with N_SAMPLES samples
N_SAMPLES, D_in, H, D_out = 10000, 1000, 100, 10
x = torch.randn(N_SAMPLES, D_in)
y = torch.randn(N_SAMPLES, D_out)

# Define the batch size and the number of epochs
BATCH_SIZE = 64
N_EPOCHS = 2

# Use torch.utils.data to create a DataLoader
# that will take care of creating batches
dataset = TensorDataset(x, y)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Define model, loss and optimizer
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Get the dataset size for printing (it is equal to N_SAMPLES)
dataset_size = len(dataloader.dataset)

# Loop over epochs
for epoch in range(N_EPOCHS):
    print(f"Epoch {epoch + 1}\n-------------------------------")
    # Loop over batches in an epoch using DataLoader
    for id_batch, (x_batch, y_batch) in enumerate(dataloader):
        y_batch_pred = model(x_batch)
        loss = loss_fn(y_batch_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Every 100 batches, print the loss for this batch
        # as well as the number of examples processed so far
        if id_batch % 100 == 0:
            loss, current = loss.item(), (id_batch + 1) * len(x_batch)
            print(f"loss: {loss:>7f} [{current:>5d}/{dataset_size:>5d}]")
The output should be something like:
Epoch 1
-------------------------------
loss: 643.433716 [ 64/10000]
loss: 648.195435 [ 6464/10000]
Epoch 2
-------------------------------
loss: 613.619873 [ 64/10000]
loss: 625.018555 [ 6464/10000]
