Accuracy changes drastically on changing from CPU to GPU - machine-learning

So I was working on the SIIM Melanoma classification data on Kaggle. While training a number of networks, I found that when I trained them on CPU, the accuracy seemed appropriate, around 0.75. On switching to GPU, the accuracy would oscillate around 0.5. What should I do about this? Here's a code snippet of the training loop. The model finally trained was resnext50.
import cv2
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

device = "cpu"
epochs = 3
#model = torch.load("model.pt")
model.cpu()
#model.cuda()

print("======== Training for ", epochs, "epochs=============")
for epoch in range(epochs):
    total_loss = 0
    model.train()
    print("Training.......")
    print("======== EPOCH #", epoch, "=================")
    tmp_acc = 0
    for i, batch in enumerate(train_loader):
        img, label = batch["images"], batch["labels"]
        #img = img.permute(0,3,1,2)
        #img = torch.Tensor(img)
        label = label.type(torch.FloatTensor)
        img, label = img.to(device), label.to(device)
        model.zero_grad()
        op = model(img)
        label_cpu = label.cpu().numpy()
        op = F.sigmoid(op)
        output = op.detach().cpu().numpy()
        tmp_acc += accuracy_score(output, label_cpu)
        loss = criterion(op, label)
        total_loss = loss.item()
        loss.backward()
        adam.step()
        if i % 10 == 0 and i > 0:
            print("STEP: ", i, "of steps ", len(train_loader))
            print("Current loss: ", total_loss/i)
            print("Training Accuracy ", tmp_acc/i)
    avg_loss = total_loss/len(train_loader)
    print("The loss after ", epoch, " epochs is ", avg_loss)
    print("OP", op)
    print("Label", label_cpu)
    torch.save(model.state_dict(), "/kaggle/working/model.pt")

Related

Constantly separated validation & training losses

I've worked with autoencoders for some weeks now, but I seem to have hit a wall when it comes to my understanding of losses overall. The issue I'm facing is that when I try to add BatchNorm and Dropout layers to my model, I get losses which aren't converging and awful reconstructions. A typical loss plot is something like this:
The loss I use is MSE with an L1 regularization term, and it looks something like this:
def L1_loss_fcn(model_children, true_data, reconstructed_data, reg_param=0.1, validate=False):
    mse = nn.MSELoss()
    mse_loss = mse(reconstructed_data, true_data)
    l1_loss = 0
    values = true_data
    if validate == False:
        for i in range(len(model_children)):
            values = F.relu((model_children[i](values)))
            l1_loss += torch.sum(torch.abs(values))
        loss = mse_loss + reg_param * l1_loss
        return loss, mse_loss, l1_loss
    else:
        return mse_loss
with my training loop written as:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
train_run_loss = 0
val_run_loss = 0

for epoch in range(epochs):
    print(f"Epoch {epoch + 1} of {epochs}")
    # TRAINING
    model.train()
    for data in tqdm(train_dl):
        x, _ = data
        reconstructions = model(x)
        optimizer.zero_grad()
        train_loss, mse_loss, l1_loss = L1_loss_fcn(model_children=model_children, true_data=x,
                                                    reg_param=regular_param,
                                                    reconstructed_data=reconstructions,
                                                    validate=False)
        train_loss.backward()
        optimizer.step()
        train_run_loss += train_loss.item()

    # VALIDATING
    model.eval()
    with torch.no_grad():
        for data in tqdm(test_dl):
            x, _ = data
            reconstructions = model(x)
            val_loss = L1_loss_fcn(model_children=model_children, true_data=x,
                                   reg_param=regular_param,
                                   reconstructed_data=reconstructions,
                                   validate=True)
            val_run_loss += val_loss.item()

    epoch_loss_train = train_run_loss / len(train_dl)
    epoch_loss_val = val_run_loss / len(test_dl)
where I've tried different hyper-parameter values without luck. My model looks something like this,
encoder = nn.Sequential(nn.Linear(), nn.Dropout(p=0.5), nn.LeakyReLU(), nn.BatchNorm1d(),
nn.Linear(), nn.Dropout(p=0.4), nn.LeakyReLU(), nn.BatchNorm1d(),
nn.Linear(), nn.Dropout(p=0.3), nn.LeakyReLU(), nn.BatchNorm1d(),
nn.Linear(), nn.Dropout(p=0.2), nn.LeakyReLU(), nn.BatchNorm1d(),
)
decoder = nn.Sequential(nn.Linear(), nn.Dropout(p=0.2), nn.LeakyReLU(),
nn.Linear(), nn.Dropout(p=0.3), nn.LeakyReLU(),
nn.Linear(), nn.Dropout(p=0.4), nn.LeakyReLU(),
nn.Linear(), nn.Dropout(p=0.5), nn.ReLU(),
)
What I expect to find is converging train and validation losses, and thereby much better reconstructions overall, but I'm afraid I'm missing something quite fundamental. Some help would be greatly appreciated!
You are not comparing apples to apples; your code reads:
l1_loss = 0
values = true_data
if validate == False:
    for i in range(len(model_children)):
        values = F.relu((model_children[i](values)))
        l1_loss += torch.sum(torch.abs(values))
    loss = mse_loss + reg_param * l1_loss
    return loss, mse_loss, l1_loss
else:
    return mse_loss
So your validation loss is just the MSE, but your training loss is MSE + regularisation, so obviously your train loss will be higher. You should log just the train MSE, without the regulariser, if you want to compare them.
Also, do not start with regularisation: always start with a model with no regularisation at all and get training to converge. Remove all extra losses and remove your dropouts; these things only hurt your ability to fit the training data (though they might improve generalisation). Once training converges, reintroduce them one at a time.
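A minimal sketch of that logging change, reusing the names from the training loop above (train_dl, model, optimizer, regular_param and L1_loss_fcn are the question's own; only the value that gets accumulated changes): backpropagate the regularised loss, but accumulate the plain MSE so the train curve is comparable with the validation curve.

for data in tqdm(train_dl):
    x, _ = data
    reconstructions = model(x)
    optimizer.zero_grad()
    train_loss, mse_loss, l1_loss = L1_loss_fcn(model_children=model_children,
                                                true_data=x,
                                                reconstructed_data=reconstructions,
                                                reg_param=regular_param,
                                                validate=False)
    train_loss.backward()               # optimise MSE + L1 penalty
    optimizer.step()
    train_run_loss += mse_loss.item()   # but log only the MSE term, comparable to val_loss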

TQDM does not update epochs

for epoch in range(args.num_epochs):
    model.train()
    # print(f"Epoch {epoch}")
    with tqdm(total=len(input_tensor_catted), unit="ba") as pbar:
        pbar.set_description(f"Epoch {epoch}")
        pbar.update(1)
        # for step, batch in enumerate(train_dataloader):
        for step in range(len(input_tensor_catted) // args.batch_size):
            indices = torch.multinomial(torch.ones(len(input_tensor_catted)) / len(input_tensor_catted), args.batch_size, replacement=True)
            clean_inputs = input_tensor_catted[indices, :]
            clean_conditioning = original_cost_tensor_catted[indices, :].to(clean_inputs.device)
            # clean_inputs = batch["input"]
            noise_samples = torch.randn(clean_inputs.shape).to(clean_inputs.device)
            bsz = clean_inputs.shape[0]
            timesteps = torch.randint(0, noise_scheduler.timesteps, (bsz,), device=clean_inputs.device).long()

            # add noise onto the clean images according to the noise magnitude at each timestep
            # (this is the forward diffusion process)
            noisy_images = noise_scheduler.training_step(clean_inputs, noise_samples, timesteps)

            if step % args.gradient_accumulation_steps != 0:
                with accelerator.no_sync(model):
                    # from noisy images, predict epsilon
                    output = model(noisy_images, timesteps, clean_conditioning)
                    # predict the noise residual
                    loss = F.mse_loss(output, noise_samples)
                    loss = loss / args.gradient_accumulation_steps
                    accelerator.backward(loss)
            else:
                output = model(noisy_images, timesteps, clean_conditioning)
                # predict the noise residual
                loss = F.mse_loss(output, noise_samples)
                loss = loss / args.gradient_accumulation_steps
                accelerator.backward(loss)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            pbar.update(1)
            pbar.set_postfix(loss=loss.detach().item(), lr=optimizer.param_groups[0]["lr"])
    optimizer.step()
This is my code.
This is an example of what is printed to the console:
The problem is that the top counter (the one alongside Epoch 1) only updates from 1/10000 to 10/10000 and then stops, even when the epoch number is greater than 10.
You can do something like this every epoch:
for epoch in range(10):
    trainBar = tqdm.tqdm(trainData)
    valBar = tqdm.tqdm(valData)
    for batch, data in enumerate(trainBar):
        ...  # training step
    for batch, data in enumerate(valBar):
        ...  # validation step
This will create a new tqdm bar each epoch, so you don't have to worry about resetting it.
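If you also want each fresh per-epoch bar to show the epoch number and a running loss, as the original code attempted with set_description/set_postfix, a sketch along these lines should work (train_dataloader and train_step are placeholders for your own loader and training logic, not names from the answer):

import tqdm

for epoch in range(args.num_epochs):
    model.train()
    # A new bar per epoch; its total is the number of batches, so it fills up completely.
    trainBar = tqdm.tqdm(train_dataloader, desc=f"Epoch {epoch}", unit="batch")
    for step, batch in enumerate(trainBar):
        loss = train_step(batch)  # placeholder for the forward/backward/optimizer logic
        trainBar.set_postfix(loss=loss.item(), lr=optimizer.param_groups[0]["lr"])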

Accuracy value goes up and down on the training process

After training the network I noticed that the accuracy goes up and down. Initially I thought it was caused by the learning rate, but it is set to quite a small value. Please check the screenshot attached.
Plot Accuracy Screenshot
My network (in Pytorch) looks as follow:
class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.layer3 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.fc1 = nn.Linear(17*17*64, 512)
        self.fc2 = nn.Linear(512, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = out.view(out.size(0), -1)
        out = self.relu(self.fc1(out))
        out = self.fc2(out)
        out = torch.sigmoid(out)
        return out
I am using RMSprop as optimizer and BCELoss as criterion. The learning rate is set to 0.001
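For reference, that criterion/optimizer setup corresponds to something like the following sketch (model is assumed to be the Network defined above and device to be set already; the scheduler that appears in the loop below is not shown in the question, so it is left out here):

import torch
import torch.nn as nn

model = Network().to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)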
Here is the training process:
epochs = 15
itr = 1
p_itr = 100
model.train()
total_loss = 0
loss_list = []
acc_list = []

for epoch in range(epochs):
    for samples, labels in train_loader:
        samples, labels = samples.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(samples)
        labels = labels.unsqueeze(-1)
        labels = labels.float()
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        scheduler.step()
        if itr % p_itr == 0:
            pred = torch.argmax(output, dim=1)
            correct = pred.eq(labels)
            acc = torch.mean(correct.float())
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, acc))
            loss_list.append(total_loss/p_itr)
            acc_list.append(acc)
            total_loss = 0
        itr += 1
My dataset is quite small: 2000 training and 1000 validation images (binary classification, 0/1). I wanted to do an 80/20 split but I was asked to keep it like that. I was thinking that the architecture might be too complex for such a small dataset.
Any hints as to what may cause such jumps in the training process?
Your code here is wrong: pred = torch.argmax(output, dim=1)
This line is used for multiclass classification with cross-entropy loss.
Your task is binary classification, so the pred values are wrong. Change it to:
if itr % p_itr == 0:
    pred = torch.round(output)
    ....
You can also try a different optimizer (Adam, SGD, or RMSprop) to find the one that helps your model converge faster.
Also change the forward() function:
def forward(self, x):
    out = self.layer1(x)
    out = self.layer2(out)
    out = self.layer3(out)
    out = out.view(out.size(0), -1)
    out = self.relu(self.fc1(out))
    out = self.fc2(out)
    return self.sigmoid(out)  # your original forward is fine too, but this is cleaner
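To make the rounding fix concrete, here is a minimal sketch of the per-batch binary accuracy (variable names such as output, labels, itr, p_itr and total_loss follow the question's training loop; treat this as an illustration rather than the original poster's code):

if itr % p_itr == 0:
    with torch.no_grad():
        pred = torch.round(output)               # 0/1 predictions from sigmoid probabilities
        correct = pred.eq(labels.view_as(pred))
        acc = correct.float().mean().item()
    print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(
        epoch + 1, epochs, itr, total_loss / p_itr, acc))
    loss_list.append(total_loss / p_itr)
    acc_list.append(acc)
    total_loss = 0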

Why does the same PyTorch code (different implementation) give different loss?

I was tackling the Fashion MNIST dataset problem on Udacity. However, my implementation gives a drastically different loss compared to the solution shared by the Udacity team. I believe the only difference in my code is the definition of the neural network; apart from that, everything is the same. I am not able to figure out the reason for such a drastic difference in loss.
Code 1: My solution:
import torch.nn as nn
from torch import optim

images, labels = next(iter(trainloader))

model = nn.Sequential(nn.Linear(784, 256),
                      nn.ReLU(),
                      nn.Linear(256, 128),
                      nn.ReLU(),
                      nn.Linear(128, 64),
                      nn.ReLU(),
                      nn.Linear(64, 10),
                      nn.LogSoftmax(dim=1))

# Flatten images
optimizer = optim.Adam(model.parameters(), lr=0.003)
criterion = nn.NLLLoss()

for i in range(10):
    running_loss = 0
    for images, labels in trainloader:
        images = images.view(images.shape[0], -1)
        output = model.forward(images)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    else:
        print(f"Training loss: {running_loss}")

# Loss is coming around 4000
Code 2: Official Solution:
from torch import nn, optim
import torch.nn.functional as F

class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 10)

    def forward(self, x):
        # make sure input tensor is flattened
        x = x.view(x.shape[0], -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.log_softmax(self.fc4(x), dim=1)
        return x

model = Classifier()
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

epochs = 5
for e in range(epochs):
    running_loss = 0
    for images, labels in trainloader:
        log_ps = model(images)
        loss = criterion(log_ps, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    else:
        print(f"Training loss: {running_loss}")

# Loss is coming around 200
Is there any explanation for the vast difference in loss?
You forgot to zero out/clear the gradients in your implementation. That is, you are missing:
optimizer.zero_grad()
In other words simply do:
for i in range(10):
    running_loss = 0
    for images, labels in trainloader:
        images = images.view(images.shape[0], -1)
        output = model.forward(images)
        loss = criterion(output, labels)
        # missed this!
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    else:
        print(f"Training loss: {running_loss}")
and you are good to go!
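As a small standalone illustration of why this matters (not part of either solution): PyTorch accumulates gradients in .grad across backward() calls, so without zero_grad() every update uses a mixture of gradients from all previous batches, which is why the two otherwise equivalent scripts train so differently.

import torch

w = torch.tensor([1.0], requires_grad=True)

loss = (3 * w).sum()
loss.backward()
print(w.grad)    # tensor([3.])

loss = (3 * w).sum()
loss.backward()  # gradients accumulate rather than being replaced
print(w.grad)    # tensor([6.])

w.grad.zero_()   # this is what optimizer.zero_grad() does for every parameter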

Training Loss When Resuming From a Checkpoint Explodes

I am trying to implement a function in my algorithm which allows me to resume training from a checkpoint. The problem is that when I resume training, my loss explodes by many orders of magnitude, from the order of 0.001 to 1000. I suspect the problem may be that when training is resumed, the learning rate is not being set properly.
Here is my training function:
def train_gray(epoch, data_loader, device, model, criterion, optimizer, i, path):
    train_loss = 0.0
    for data in data_loader:
        img, _ = data
        img = img.to(device)
        stand_dev = 0.0392
        noisy_img = add_noise(img, stand_dev, device)
        output = model(noisy_img, stand_dev)
        output = output[:, 0:1, :, :]
        loss = criterion(output, img)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()*img.size(0)
    train_loss = train_loss/len(data_loader)
    print('Epoch: {} Complete \tTraining Loss: {:.6f}'.format(
        epoch,
        train_loss
    ))
    return train_loss
And here is my main function that initialises my variables, loads a checkpoint, calls my training function, and saves a checkpoint after an epoch of training:
def main():
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    path = "/home/bledc/my_remote_folder/denoiser/models/{}_sigma_10_session2".format(current_time)
    os.mkdir(path)

    width = 256
    # height = 256
    num_epochs = 25
    batch_size = 4
    learning_rate = 0.0001

    data_loader = load_dataset(batch_size, width)
    model = UNetWithResnet50Encoder().to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=1e-5)

    ############################################################################################
    # UNCOMMENT CODE BELOW TO RESUME TRAINING FROM A MODEL
    model_path = "/home/bledc/my_remote_folder/denoiser/models/resnet_sigma_10/model_epoch_10.pt"
    save_point = torch.load(model_path)
    model.load_state_dict(save_point['model_state_dict'])
    optimizer.load_state_dict(save_point['optimizer_state_dict'])
    epoch = save_point['epoch']
    train_loss = save_point['train_loss']
    model.train()
    ############################################################################################

    for i in range(epoch, num_epochs+1):
        train_loss = train_gray(i, data_loader, device, model, criterion, optimizer, i, path)
        checkpoint(i, train_loss, model, optimizer, path)
    print("end")
Lastly, here is my function to save checkpoints:
def checkpoint(epoch, train_loss, model, optimizer, path):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss': train_loss
    }, path+"/model_epoch_{}.pt".format(epoch))
    print("Epoch saved")
If my problem is that I am not saving my learning rate, how would I do this?
Any help would be greatly appreciated,
Clement
Update: I'm fairly certain that the problem lies in my pretrained model. I am saving the optimiser every epoch, but the optimiser only holds information for the trainable layers. I hope to solve this soon and post a more thorough answer when I figure out how to save and load the entire model.
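For reference, a minimal sketch of the "save and load the entire model" idea mentioned in the update (the file names are illustrative, and whether this alone fixes the loss explosion is not confirmed in this thread): torch.save can pickle the whole model object, and the optimizer state dict already carries the current learning rate inside its param_groups.

import torch

# Save the whole model object (architecture + all weights, including frozen layers)
# instead of only its state_dict.
torch.save(model, path + "/model_full_epoch_{}.pt".format(epoch))
torch.save(optimizer.state_dict(), path + "/optimizer_epoch_{}.pt".format(epoch))

# Resuming:
model = torch.load(path + "/model_full_epoch_10.pt")
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
optimizer.load_state_dict(torch.load(path + "/optimizer_epoch_10.pt"))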
