Why does the same PyTorch code (different implementation) give different loss?

Why does the same PyTorch code (different implementation) give different loss? - machine-learning

I was tackling the Fashion MNIST data-set problem on Udacity. However my implementation of code is giving drastically different loss as compared to the solution shared by the Udacity team. I believe the only difference in my answer is the definition of the Neural Network and apart from that everything is the same. I am not able to figure out the reason for such a drastic difference in Loss.
Code 1: My solution:
import torch.nn as nn
from torch import optim
# Pull a single batch from the loader (the loop below re-binds both names).
images, labels = next(iter(trainloader))
# Fully connected network 784 -> 256 -> 128 -> 64 -> 10 that ends in
# log-probabilities, matching the NLLLoss criterion used below.
model = nn.Sequential(nn.Linear(784,256),
                      nn.ReLU(),
                      nn.Linear(256,128),
                      nn.ReLU(),
                      nn.Linear(128,64),
                      nn.ReLU(),
                      nn.Linear(64,10),
                      nn.LogSoftmax(dim=1))
# Flatten images
optimizer = optim.Adam(model.parameters(),lr=0.003)
criterion = nn.NLLLoss()
for i in range(10):
    # Sum of the per-batch losses for this epoch.
    running_loss = 0
    for images,labels in trainloader:
        # One row per sample, as required by the first Linear layer.
        images = images.view(images.shape[0], -1)
        output = model.forward(images)
        loss = criterion(output,labels)
        # NOTE: there is no optimizer.zero_grad() in this loop; this is the
        # code exactly as posted in the question.
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    else:
        # for/else: executed once the batch loop finishes without `break`.
        print(f"Training loss: {running_loss}")
# Loss is coming around 4000
Code 2: Official Solution:
from torch import nn, optim
import torch.nn.functional as F
class Classifier(nn.Module):
    """Fully connected classifier: 784 -> 256 -> 128 -> 64 -> 10.

    ReLU is applied after the first three layers; the last layer is followed
    by ``log_softmax`` over ``dim=1``, so ``forward`` returns log-probabilities.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 10)

    def forward(self, x):
        """Flatten ``x`` to ``(x.shape[0], -1)`` and return log-probabilities."""
        # make sure input tensor is flattened
        x = x.view(x.shape[0], -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.log_softmax(self.fc4(x), dim=1)
        return x
model = Classifier()
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)
epochs = 5
for e in range(epochs):
    # Sum of the per-batch losses for this epoch.
    running_loss = 0
    for images, labels in trainloader:
        log_ps = model(images)
        loss = criterion(log_ps, labels)
        # Clear the gradients left over from the previous batch before
        # computing new ones.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    else:
        # for/else: executed once the batch loop finishes without `break`.
        print(f"Training loss: {running_loss}")
# Loss is coming around 200
Is there any explanation for the vast difference in loss ?

You forgot to zero out/clear the gradients in your implementation. That is, you are missing:
optimizer.zero_grad()
In other words simply do:
for i in range(10):
    # Sum of the per-batch losses for this epoch.
    running_loss = 0
    for images,labels in trainloader:
        # One row per sample before the first Linear layer.
        images = images.view(images.shape[0], -1)
        output = model.forward(images)
        loss = criterion(output,labels)
        # missed this!
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    else:
        # for/else: executed once the batch loop finishes without `break`.
        print(f"Training loss: {running_loss}")
and you are good to go!

Related

Loss Not Decreasing pytorch

I am making a simple CNN without a pooling layer and without an optimizer (not allowed since it's a college assignment). I am using the SVHN dataset. I am using cross-entropy loss, but the loss keeps jumping between 2.305 and 2.306 over 10 epochs. Kindly help.
class Net(nn.Module):
    """Small CNN: two 3x3 convolutions followed by flatten + one linear layer.

    Both convolutions use ``padding=1`` with a 3x3 kernel, and the linear
    layer takes ``64*32*32`` inputs and produces 10 outputs (raw scores, no
    softmax).
    """

    def __init__(self):
        super(Net,self).__init__()
        self.conv1 = nn.Conv2d(3, 32,kernel_size=3, padding=1)
        # self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64*32*32, 10)
        )

    def forward(self, x):
        """Apply conv1 -> ReLU -> conv2 -> ReLU -> flatten -> linear."""
        x = (F.relu(self.conv1(x)))
        x = (F.relu(self.conv2(x)))
        x = self.fc1(x)
        return x
net = Net()
criterion = nn.CrossEntropyLoss()
for epoch in range(10): # loop over the dataset multiple times
    net.train()
    batch_loss_val=0
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        # forward + backward
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        # NOTE: the update step below is commented out and nothing else in
        # this loop modifies the parameters; this is the code as posted.
        # optimizer.step()
        # print statistics
        running_loss += loss.item()
        if i % 50 == 0: # print every 50 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 50 :.3f}')
            running_loss = 0.0
print('Finished Training')

You need to enable optimizer.step() to update the weights. loss.backward() is only computing the gradients.

validation loss is not changing at all (pytorch)

I'm using PyTorch for the first time to train my sentiment analysis model using BERT's pre-trained model.
this is my classifier
class SentimentClassifier2(nn.Module):
    """Pre-trained BERT encoder followed by dropout and a two-layer classifier head.

    Args:
        n_classes: Number of output classes; sets the width of the final
            linear layer.
    """

    def __init__(self, n_classes):
        super(SentimentClassifier2, self).__init__()
        # The output width follows `n_classes` instead of a hard-coded 3, so
        # the argument is no longer silently ignored.
        D_in, H, D_out = 768, 200, n_classes
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.4)
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Linear(H, D_out)
        )

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores (logits) for the given batch."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element is used here.
        _, pooled_output = self.bert( input_ids=input_ids, attention_mask=attention_mask, return_dict = False)
        output = self.drop(pooled_output)
        logits = self.classifier(output)
        return logits
this is my optimizer/loss function (I'm doing only 20 epochs cause it takes a while to train )
# Number of passes over the training data.
EPOCHS = 20
model2 = SentimentClassifier2(len(class_names))
model2= model2.to(device)
# NOTE(review): the optimizer is built from `model.parameters()`, while the
# network created above is named `model2` — confirm which object is intended;
# if `model` is a different network, this optimizer does not hold `model2`'s
# parameters.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=True)
# Total number of batches across all epochs.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=0,
num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)
training & evaluation code
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one training pass over ``data_loader``, updating ``model`` in place.

    Every batch is indexed with the keys ``"input_ids"``, ``"attention_mask"``
    and ``"targets"``; each value is moved to ``device`` before use.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches (mappings with the keys above).
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: correct predictions divided by
        ``n_examples``, and the mean of the per-batch losses.
    """
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Predicted class = index of the largest score in each row.
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        # Clear gradients only after the update so the next batch starts clean.
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Every batch is indexed with the keys ``"input_ids"``, ``"attention_mask"``
    and ``"targets"``; each value is moved to ``device`` before use.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches (mappings with the keys above).
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: correct predictions divided by
        ``n_examples``, and the mean of the per-batch losses.
    """
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            # Predicted class = index of the largest score in each row.
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)
my problem: the loss for the validation samples is not changing at all !!!
epoch1:______________________
Train loss 1.0145157482929346 accuracy 0.4185746994848311
Val loss 1.002384223589083 accuracy 0.4151087371232354
epoch2:______________________
Train loss 1.015038197996413 accuracy 0.41871780194619346
Val loss 1.002384223589083 accuracy 0.4151087371232354
epoch3:______________________
Train loss 1.014710763787351 accuracy 0.4188609044075558
Val loss 1.002384223589083 accuracy 0.4151087371232354
epoch4:______________________
Train loss 1.0139196826735648 accuracy 0.41909940850982635
Val loss 1.002384223589083 accuracy 0.4151087371232354
I don't understand what the problem is ...!
I would be grateful if someone could help me ☹

Maybe you can try the following:
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
# NOTE(review): zero_grad() is called here after backward() but before step(),
# so the gradients just computed are cleared before the optimizer uses them —
# confirm this ordering is really what is intended.
optimizer.zero_grad()
optimizer.step()
scheduler.step()

Why is my pytorch classification model not learning?

I have created a simple PyTorch classification model with a sample dataset generated using sklearn's make_classification. Even after training for thousands of epochs, the accuracy of the model hovers between 30 and 40 percent. During training, the loss value fluctuates widely. I am wondering why this model is not learning, and whether it is due to some logical error in the code.
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Synthetic classification data: 15 features, 5 classes, 4 informative features.
X,y = make_classification(n_features=15,n_classes=5,n_informative=4)
# Device that the model and every batch are moved to.
DEVICE = torch.device('cuda')
# Number of training epochs.
epochs = 5000
class CustomDataset(Dataset):
    """Dataset over a pair of NumPy arrays, exposed as tensors.

    Args:
        X: NumPy array of samples, indexed along its first axis.
        y: NumPy array of labels, indexed along its first axis.
    """

    def __init__(self,X,y):
        # from_numpy keeps the dtype of the source arrays.
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)

    def __len__(self):
        """Return the number of samples (length of ``X``)."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        X = self.X[index]
        y = self.y[index]
        return (X,y)
class Model(nn.Module):
    """Two-layer fully connected network: 15 -> 10 -> 5, ReLU after each layer."""

    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(15,10)
        self.l2 = nn.Linear(10,5)
        self.relu = nn.ReLU()

    def forward(self,x):
        """Return the ReLU-activated output of the second linear layer."""
        x = self.l1(x)
        x = self.relu(x)
        x = self.l2(x)
        # NOTE: ReLU is also applied to the output layer, so the returned
        # scores are never negative; this is the code as posted.
        x = self.relu(x)
        return x
# The model is cast to double precision before being moved to DEVICE
# (presumably to match the dtype of the NumPy-backed inputs — confirm).
model = Model().double().to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_function = nn.CrossEntropyLoss()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
train_data = CustomDataset(X_train,y_train)
test_data = CustomDataset(X_test,y_test)
trainloader = DataLoader(train_data, batch_size=32, shuffle=True)
testloader = DataLoader(test_data, batch_size=32, shuffle=True)
for i in range(epochs):
    for (x,y) in trainloader:
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        optimizer.zero_grad()
        output = model(x)
        loss = loss_function(output,y)
        loss.backward()
        optimizer.step()
    # Report the loss of the last batch every 200 epochs.
    if i%200==0:
        print("epoch: ",i," Loss: ",loss.item())
correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for x, y in testloader:
        # calculate outputs by running x through the network
        outputs = model(x.to(DEVICE)).to(DEVICE)
        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += y.size(0)
        correct += (predicted == y.to(DEVICE)).sum().item()
print(f'Accuracy of the network on the test data: {100 * correct // total} %')
EDIT
I tried to over-fit my model with only 10 samples (batch_size=5) X,y = make_classification(n_samples=10,n_features=15,n_classes=5,n_informative=4) but now the accuracy decreased to 15-20%. I then normalize the input data between the values 0 and 1 which pushed the accuracy a bit higher but not over 50 percentage. Any idea why this might be happening?

You should not be using ReLU activation on your output layer. Usually softmax activation is used for multi class classification on the final layer, or the logits are fed to the loss function directly without explicitly adding a softmax activation layer.
Try removing the ReLU activation from the final layer.

Accuracy value goes up and down on the training process

After training the network I noticed that accuracy goes up and down. Initially I thought it is caused by the learning rate, but it is set to quite small value. Please check the screenshot attached.
Plot Accuracy Screenshot
My network (in Pytorch) looks as follow:
class Network(nn.Module):
    """Three conv/ReLU/max-pool stages followed by two linear layers and a sigmoid.

    ``fc1`` takes ``17*17*64`` inputs, so the flattened output of ``layer3``
    must have that many features per sample. ``forward`` returns one value in
    (0, 1) per sample.
    """

    def __init__(self):
        super(Network,self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3,16,kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(16,32, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.layer3 = nn.Sequential(
            nn.Conv2d(32,64, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.fc1 = nn.Linear(17*17*64,512)
        self.fc2 = nn.Linear(512,1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self,x):
        """Run the three conv stages, flatten, and return sigmoid(fc2(relu(fc1(.))))."""
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        # Flatten everything except the batch dimension.
        out = out.view(out.size(0),-1)
        out = self.relu(self.fc1(out))
        out = self.fc2(out)
        out = torch.sigmoid(out)
        return out
I am using RMSprop as optimizer and BCELoss as criterion. The learning rate is set to 0.001
Here is the training process:
epochs = 15
# Global batch counter, used to decide when to log.
itr = 1
# Log every `p_itr` batches.
p_itr = 100
model.train()
# Loss accumulated since the last log line.
total_loss = 0
loss_list = []
acc_list = []
for epoch in range(epochs):
    for samples, labels in train_loader:
        samples, labels = samples.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(samples)
        # Add a trailing dimension and cast to float before the loss.
        labels = labels.unsqueeze(-1)
        labels = labels.float()
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        scheduler.step()
        if itr%p_itr == 0:
            # NOTE: the prediction is taken with argmax over dim=1 of the
            # model output; this is the code as posted.
            pred = torch.argmax(output, dim=1)
            correct = pred.eq(labels)
            acc = torch.mean(correct.float())
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, acc))
            loss_list.append(total_loss/p_itr)
            acc_list.append(acc)
            total_loss = 0
        itr += 1
My dataset is quite small - 2000 train and 1000 validation (binary classification 0/1). I wanted to do the 80/20 split but I was asked to keep it like that. I was thinking that the architecture might be too complex for such a small dataset.
Any hints as to what may cause such jumps in the training process?

This line in your code is wrong: pred = torch.argmax(output, dim=1)
That line is meant for multiclass classification with cross-entropy loss. With a single output column, argmax over dim=1 always returns 0.
Your task is binary classification, so the pred values are wrong. Change it to:
if itr%p_itr == 0:
# Round each model output to the nearest integer to get the predicted label.
pred = torch.round(output)
....
You can also try different optimizers (Adam, SGD, or RMSprop) to find the one that helps your model converge faster.
Also change the forward() function:
def forward(self,x):
    """Run the three conv stages, flatten, and return sigmoid(fc2(relu(fc1(.))))."""
    out = self.layer1(x)
    out = self.layer2(out)
    out = self.layer3(out)
    # Flatten everything except the batch dimension.
    out = out.view(out.size(0),-1)
    out = self.relu(self.fc1(out))
    out = self.fc2(out)
    return self.sigmoid(out)  # the original forward works too; this is just cleaner

How to include batch size in pytorch basic example?

I am new to pytorch. The following is the basic example of using nn module to train a simple one-layer model with some random data (from here)
import torch
# Number of samples, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10
# Random inputs and targets; all N rows are passed to the model at once.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
# Sum (not mean) of squared errors over all elements.
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
for t in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    # Clear old gradients, backpropagate, then update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
As far as I understand, the batch size is equal to 1 in the example, in other words, a single point (out of 64) is used to calculate gradients and update parameters. My question is: how to modify this example to train the model with the batch size greater than one?

In fact, N is the batch size, so you just need to modify N; it is currently set to 64. Every training batch therefore contains 64 vectors of size D_in.
I checked the link you posted, you can also take a look at the comments - there is some explanation too :)
# -*- coding: utf-8 -*-
import numpy as np
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)
# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)
learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)
    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)
    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    # ReLU passes no gradient where its input was negative.
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)
    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

To include batch size in PyTorch basic examples, the easiest and cleanest way is to use PyTorch torch.utils.data.DataLoader and torch.utils.data.TensorDataset.
Dataset stores the samples and their corresponding labels, and DataLoader wraps an iterable around the Dataset to enable easy access to the samples.
DataLoader will take care of creating batches for you.
Building on your question, here is a complete code snippet in which we iterate over a dataset of 10000 examples for 2 epochs with a batch size of 64:
import torch
from torch.utils.data import DataLoader, TensorDataset
# Create the dataset with N_SAMPLES samples
N_SAMPLES, D_in, H, D_out = 10000, 1000, 100, 10
x = torch.randn(N_SAMPLES, D_in)
y = torch.randn(N_SAMPLES, D_out)
# Define the batch size and the number of epochs
BATCH_SIZE = 64
N_EPOCHS = 2
# Use torch.utils.data to create a DataLoader
# that will take care of creating batches
dataset = TensorDataset(x, y)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
# Define model, loss and optimizer
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# Get the dataset size for printing (it is equal to N_SAMPLES)
dataset_size = len(dataloader.dataset)
# Loop over epochs
for epoch in range(N_EPOCHS):
    print(f"Epoch {epoch + 1}\n-------------------------------")
    # Loop over batches in an epoch using DataLoader
    for id_batch, (x_batch, y_batch) in enumerate(dataloader):
        y_batch_pred = model(x_batch)
        loss = loss_fn(y_batch_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Every 100 batches, print the loss for this batch
        # as well as the number of examples processed so far
        if id_batch % 100 == 0:
            # `loss` is rebound to a plain float here, for printing only.
            loss, current = loss.item(), (id_batch + 1)* len(x_batch)
            print(f"loss: {loss:>7f} [{current:>5d}/{dataset_size:>5d}]")
The output should be something like:
Epoch 1
-------------------------------
loss: 643.433716 [ 64/10000]
loss: 648.195435 [ 6464/10000]
Epoch 2
-------------------------------
loss: 613.619873 [ 64/10000]
loss: 625.018555 [ 6464/10000]

Develop Reference

ios ruby-on-rails asp.net-mvc docker delphi jenkins grails google-sheets machine-learning dart

Why does the same PyTorch code (different implementation) give different loss? - machine-learning

Related

Loss Not Decreasing pytorch

validation loss is not changing at all (pytorch)

Why is my pytorch classification model not learning?

Accuracy value goes up and down on the training process

How to include batch size in pytorch basic example?

Categories

Resources