PyTorch model loss and accuracy remain constant

I need to create a model that takes as input a 351x351x11 Tensor and gives as output a 351x351x11 Tensor (it is an Autoencoder). The two tensors are made of 0s and 1s.
This is the model:
class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        self.down_layers = nn.ModuleList()
        self.up_layers = nn.ModuleList()
        self.n_layers = 1
        self.down_layers.append(nn.Conv3d(5, 1, (3, 3, 1)))
        self.up_layers.append(nn.ConvTranspose3d(1, 5, (3, 3, 1)))
        for d_l in self.down_layers:
            torch.nn.init.normal_(d_l.weight, mean=0.5, std=0.7)
        for u_l in self.up_layers:
            torch.nn.init.normal_(u_l.weight, mean=0.5, std=0.7)

    def encode(self, x):
        # Encoder
        for i in range(len(self.down_layers)):
            x = self.down_layers[i](x)
            x = torch.sigmoid(x)
        return x

    def forward(self, x):
        # Decoder
        x = self.encode(x)
        for i in range(len(self.up_layers)):
            x = self.up_layers[i](x)
            x = torch.sigmoid(x)
            if i == (len(self.up_layers) - 1):
                x = torch.round(x)
        return x
This is the training function:
max_e, max_p = 351, 11  # tensor dimensions
DEVICE = get_device()   # device is cpu
EPOCHS = 100
BATCHSIZE = 5

try:
    print("Start model", flush=True)
    # Generate the model.
    model = AutoEncoder().to(DEVICE)
    lr = 0.09
    optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)
    # Training of the model.
    for epoch in range(EPOCHS):
        model.train()
        # I have to create 25 data loaders for 50000 training samples
        # (each of 2000 samples) to avoid memory congestion.
        for i in range(25):
            train_loader, X_train_shape = get_dataset(i)
            N_TRAIN_EXAMPLES = X_train_shape
            for batch_idx, (data, target) in enumerate(train_loader):
                if batch_idx * BATCHSIZE >= N_TRAIN_EXAMPLES:
                    break
                data, target = data[None, ...].to(DEVICE, dtype=torch.float), target[None, ...].to(DEVICE, dtype=torch.float)
                optimizer.zero_grad()
                output = model(data)
                loss = torch.nn.BCELoss()
                loss = loss(output, target)
                loss.backward()
                optimizer.step()
            # remove train data loader from memory
            del train_loader

        print("VALIDATION", flush=True)
        # Validation of the model.
        model.eval()
        correct = 0
        tot = 0
        with torch.no_grad():
            # Same as the training: 10 data loaders for 20000 samples.
            for i in range(25, 35):
                valid_loader, X_valid_shape = get_dataset(i)
                N_VALID_EXAMPLES = X_valid_shape
                for batch_idx, (data, target) in enumerate(valid_loader):
                    # Limiting validation data.
                    if batch_idx * BATCHSIZE >= N_VALID_EXAMPLES:
                        break
                    data, target = data[None, ...].to(DEVICE, dtype=torch.float), target[None, ...].to(DEVICE, dtype=torch.float)
                    output = model(data)
                    # count the number of 1s and 0s predicted correctly
                    newCorrect = output(target.view_as(output)).sum().item()
                    correct += newCorrect
                    tot += max_e * max_e * max_p * BATCHSIZE
                del valid_loader

        accuracy = correct * 100 / tot
        print('Epoch: {} Loss: {} Accuracy: {} %'.format(epoch, loss.data, accuracy), flush=True)
The function that returns the data loader is:
def get_dataset(i):
    X_train = []
    Y_train = []
    for j in range(i * 2000, (i + 1) * 2000):
        t = torch.load("/home/ubuntu/data/home/ubuntu/deeplogic/el_dataset/x/scene{}.pt".format(j))
        X_train.append(t)
        t = torch.load("/home/ubuntu/data/home/ubuntu/deeplogic/el_dataset/y/scene{}.pt".format(j))
        Y_train.append(t)
    train_x = torch.from_numpy(np.array(X_train)).float()
    train_y = torch.from_numpy(np.array(Y_train)).float()
    batch_size = 1
    train = torch.utils.data.TensorDataset(train_x, train_y)
    # data loader
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    return train_loader, len(X_train)
The prints I got are:
Epoch: 1 Loss: 99.80729675292969 Accuracy: 0.19852701903983955 %
Epoch: 2 Loss: 99.80729675292969 Accuracy: 0.19852701903983955 %
Epoch: 3 Loss: 99.80729675292969 Accuracy: 0.19852701903983955 %
Epoch: 4 Loss: 99.80729675292969 Accuracy: 0.19852701903983955 %

x = torch.round(x) prevents you from updating your model because it's non-differentiable. More importantly, rounding is counterproductive with BCELoss, which expects probabilities in (0, 1), not hard 0/1 values. You should move the rounding to the validation step only. Also, the newCorrect in your validation loop never compares the output with the target values; the eq() call is missing. (The corrected lines are below.)
# in the validation loop
preds = torch.round(output)
newCorrect = preds.eq(target.view_as(preds)).sum().item()
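For reference, a minimal sketch of the corrected forward pass with the rounding dropped, so the output stays differentiable and BCELoss receives probabilities (the rest of the class is unchanged):

def forward(self, x):
    x = self.encode(x)
    for i in range(len(self.up_layers)):
        x = self.up_layers[i](x)
        x = torch.sigmoid(x)
    # no torch.round here: rounding is non-differentiable and would kill
    # the gradients; round only when computing validation accuracy
    return x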

Related

Loss not decreasing in PyTorch

I am making a simple CNN without a pooling layer or an optimizer (they're not allowed, since it's a college assignment). I am using the SVHN dataset with cross-entropy loss, but the loss keeps jumping between 2.305 and 2.306 over 10 epochs. Kindly help.
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        # self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64*32*32, 10)
        )

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.fc1(x)
        return x
net = Net()
criterion = nn.CrossEntropyLoss()

for epoch in range(10):  # loop over the dataset multiple times
    net.train()
    batch_loss_val = 0
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        # forward + backward
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        # optimizer.step()
        # print statistics
        running_loss += loss.item()
        if i % 50 == 0:  # print every 50 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 50 :.3f}')
            running_loss = 0.0
print('Finished Training')
You need to enable optimizer.step() to update the weights; loss.backward() only computes the gradients, it does not apply them. You also need to define the optimizer and call optimizer.zero_grad() before each backward pass, otherwise gradients accumulate across batches.
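A minimal sketch of the fixed loop, assuming plain SGD is acceptable (the learning rate is a placeholder):

optimizer = torch.optim.SGD(net.parameters(), lr=0.01)  # placeholder lr

for epoch in range(10):
    net.train()
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data
        optimizer.zero_grad()           # clear gradients from the previous batch
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()                 # compute gradients
        optimizer.step()                # apply them to the weights
        running_loss += loss.item()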

Resuming Training PyTorch

I'm attempting to save and load the best model through torch, where I've defined my training function as follows:
def train_model(model, train_loader, test_loader, device, learning_rate=1e-1, num_epochs=200):
    # The training configurations were not carefully selected.
    criterion = nn.CrossEntropyLoss()
    model.to(device)
    # It seems that SGD optimizer is better than Adam optimizer for ResNet18 training on CIFAR10.
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-4)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=500)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[65, 75], gamma=0.75, last_epoch=-1)
    # optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)

    # Evaluation
    model.eval()
    eval_loss, eval_accuracy = evaluate_model(model=model, test_loader=test_loader, device=device, criterion=criterion)
    print("Epoch: {:02d} Eval Loss: {:.3f} Eval Acc: {:.3f}".format(-1, eval_loss, eval_accuracy))

    load_model = input('Load a model?')
    for epoch in range(num_epochs):
        if epoch // 2 == 0:
            write_checkpoint(model=model, epoch=epoch, scheduler=scheduler, optimizer=optimizer)
            model, optimizer, epoch, scheduler = load_checkpoint(model=model, scheduler=scheduler, optimizer=optimizer)
            for state in optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.to(device)

        # Training
        model.train()
        running_loss = 0
        running_corrects = 0
        for inputs, labels in train_loader:
            inputs = torch.FloatTensor(inputs)
            inputs = inputs.to(device)
            labels = labels.to(device)
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # statistics
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)
        train_loss = running_loss / len(train_loader.dataset)
        train_accuracy = running_corrects / len(train_loader.dataset)

        # Evaluation
        model.eval()
        eval_loss, eval_accuracy = evaluate_model(model=model, test_loader=test_loader, device=device, criterion=criterion)
        # Set learning rate scheduler
        scheduler.step()
        print("Epoch: {:03d} Train Loss: {:.3f} Train Acc: {:.3f} Eval Loss: {:.3f} Eval Acc: {:.3f}".format(epoch, train_loss, train_accuracy, eval_loss, eval_accuracy))
    return model
I'd like to be able to load a model and resume training from the epoch at which it was saved.
So far I have methods to save the model, optimizer, and scheduler states along with the epoch:
def write_checkpoint(model, optimizer, epoch, scheduler):
    state = {'epoch': epoch + 1, 'state_dict': model.state_dict(),
             'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), }
    filename = '/content/model_'
    torch.save(state, filename + f'CP_epoch{epoch + 1}.pth')

def load_checkpoint(model, optimizer, scheduler, filename='/content/checkpoint.pth'):
    # Note: Input model & optimizer should be pre-defined. This routine only updates their states.
    start_epoch = 0
    if os.path.isfile(filename):
        print("=> loading checkpoint '{}'".format(filename))
        checkpoint = torch.load(filename)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler = checkpoint['scheduler']
        print("=> loaded checkpoint '{}' (epoch {})".format(filename, checkpoint['epoch']))
    else:
        print("=> no checkpoint found at '{}'".format(filename))
    return model, optimizer, start_epoch, scheduler
But I can't seem to come up with the logic of how I'd update the epoch to start at the correct one. Looking for hints or ideas on how to implement just that.
If I understand correctly, you're trying to resume training from the last saved progress with the correct epoch number.
Before calling train_model, load the checkpoint values, including start_epoch. Then use start_epoch as the loop's starting point:
for epoch in range(start_epoch, num_epochs):
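For instance, a minimal sketch of the overall flow reusing the question's own helpers (the checkpoint path and the per-epoch call are placeholders):

# Restore the states once, before the loop; load_checkpoint returns the saved
# epoch + 1 as start_epoch (and 0 when no checkpoint file exists).
model, optimizer, start_epoch, scheduler = load_checkpoint(
    model=model, optimizer=optimizer, scheduler=scheduler,
    filename='/content/checkpoint.pth')  # assumed checkpoint path

for epoch in range(start_epoch, num_epochs):
    # ... the training/validation body from train_model goes here ...
    write_checkpoint(model=model, optimizer=optimizer, epoch=epoch, scheduler=scheduler)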

tgt and src have to have equal features for a Transformer network in PyTorch

I am attempting to train EEG data through a transformer network. The input dimensions are 50x16684x60 (seq x batch x features) and the output is 16684x2. Right now I am simply trying to run a basic transformer, and I keep getting an error telling me
RuntimeError: the feature number of src and tgt must be equal to d_model
Why would the source and target feature number ever be equal? Is it possible to run such a dataset through a transformer?
Here is my basic model:
input_size = 60  # seq x batch x features
hidden_size = 32
num_classes = 2
learning_rate = 0.001
batch_size = 64
num_epochs = 2
sequence_length = 50
num_layers = 2
dropout = 0.5

class Transformer(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(Transformer, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.transformer = nn.Transformer(60, 2)
        self.fc = nn.Linear(hidden_size * sequence_length, num_classes)

    def forward(self, x, y):
        # Forward Propagation
        out, _ = self.transformer(x, y)
        out = out.reshape(out.shape[0], -1)
        out = self.fc(out)
        return out

model = Transformer(input_size, hidden_size, num_layers, num_classes)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    for index in tqdm(range(16684)):
        X, y = (X_train[index], Y_train[index])
        print(X.shape, y.shape)
        output = model(X, y)
        loss = criterion(output, y)
        model.zero_grad()
        loss.backward()
        optimizer.step()
        if index % 500 == 0:
            print(f"Epoch {epoch}, Batch: {index}, Loss: {loss}")
nn.Transformer is a sequence-to-sequence module: you feed it the input sequence (src) and the desired output sequence (tgt), and the decoder attends to the encoder's output, so both sequences must live in the same d_model-dimensional feature space.
If the feature sizes don't match d_model, the module cannot relate the source features to the target features and raises the error you see; a 2-dimensional classification label is not a valid tgt sequence for it.
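If the goal is sequence classification rather than sequence-to-sequence generation, an encoder-only setup avoids needing a tgt sequence at all. A minimal sketch under that assumption (the sizes reuse the question's hyperparameters; this is an alternative architecture, not a fix to the posted model):

import torch
import torch.nn as nn

class EEGClassifier(nn.Module):
    def __init__(self, input_size=60, num_classes=2, num_layers=2, dropout=0.5):
        super().__init__()
        # nhead must divide d_model; 4 divides 60
        encoder_layer = nn.TransformerEncoderLayer(d_model=input_size, nhead=4, dropout=dropout)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(input_size, num_classes)

    def forward(self, x):
        out = self.encoder(x)     # x: (seq, batch, features)
        out = out.mean(dim=0)     # pool over the sequence -> (batch, features)
        return self.fc(out)       # -> (batch, num_classes)

x = torch.randn(50, 8, 60)        # (seq, batch, features), like the question's data
logits = EEGClassifier()(x)       # -> shape (8, 2)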

Facing this error while classifying images containing 10 classes in PyTorch with ResNet50

This is the code I am implementing: I am using a subset of the CalTech256 dataset to classify images of 10 different kinds of animals. We will go over the dataset preparation, data augmentation and then steps to build the classifier.
def train_and_validate(model, loss_criterion, optimizer, epochs=25):
    '''
    Function to train and validate
    Parameters
        :param model: Model to train and validate
        :param loss_criterion: Loss Criterion to minimize
        :param optimizer: Optimizer for computing gradients
        :param epochs: Number of epochs (default=25)
    Returns
        model: Trained Model with best validation accuracy
        history: (dict object): Having training loss, accuracy and validation loss, accuracy
    '''
    start = time.time()
    history = []
    best_acc = 0.0

    for epoch in range(epochs):
        epoch_start = time.time()
        print("Epoch: {}/{}".format(epoch+1, epochs))
        # Set to training mode
        model.train()
        # Loss and Accuracy within the epoch
        train_loss = 0.0
        train_acc = 0.0
        valid_loss = 0.0
        valid_acc = 0.0

        for i, (inputs, labels) in enumerate(train_data_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            # Clean existing gradients
            optimizer.zero_grad()
            # Forward pass - compute outputs on input data using the model
            outputs = model(inputs)
            # Compute loss
            loss = loss_criterion(outputs, labels)
            # Backpropagate the gradients
            loss.backward()
            # Update the parameters
            optimizer.step()
            # Compute the total loss for the batch and add it to train_loss
            train_loss += loss.item() * inputs.size(0)
            # Compute the accuracy
            ret, predictions = torch.max(outputs.data, 1)
            correct_counts = predictions.eq(labels.data.view_as(predictions))
            # Convert correct_counts to float and then compute the mean
            acc = torch.mean(correct_counts.type(torch.FloatTensor))
            # Compute total accuracy in the whole batch and add to train_acc
            train_acc += acc.item() * inputs.size(0)
            #print("Batch number: {:03d}, Training: Loss: {:.4f}, Accuracy: {:.4f}".format(i, loss.item(), acc.item()))

        # Validation - No gradient tracking needed
        with torch.no_grad():
            # Set to evaluation mode
            model.eval()
            # Validation loop
            for j, (inputs, labels) in enumerate(valid_data_loader):
                inputs = inputs.to(device)
                labels = labels.to(device)
                # Forward pass - compute outputs on input data using the model
                outputs = model(inputs)
                # Compute loss
                loss = loss_criterion(outputs, labels)
                # Compute the total loss for the batch and add it to valid_loss
                valid_loss += loss.item() * inputs.size(0)
                # Calculate validation accuracy
                ret, predictions = torch.max(outputs.data, 1)
                correct_counts = predictions.eq(labels.data.view_as(predictions))
                # Convert correct_counts to float and then compute the mean
                acc = torch.mean(correct_counts.type(torch.FloatTensor))
                # Compute total accuracy in the whole batch and add to valid_acc
                valid_acc += acc.item() * inputs.size(0)
                #print("Validation Batch number: {:03d}, Validation: Loss: {:.4f}, Accuracy: {:.4f}".format(j, loss.item(), acc.item()))

        # Find average training loss and training accuracy
        avg_train_loss = train_loss/train_data_size
        avg_train_acc = train_acc/train_data_size
        # Find average validation loss and validation accuracy
        avg_valid_loss = valid_loss/valid_data_size
        avg_valid_acc = valid_acc/valid_data_size
        history.append([avg_train_loss, avg_valid_loss, avg_train_acc, avg_valid_acc])
        epoch_end = time.time()
        print("Epoch : {:03d}, Training: Loss: {:.4f}, Accuracy: {:.4f}%, \n\t\tValidation : Loss : {:.4f}, Accuracy: {:.4f}%, Time: {:.4f}s".format(epoch, avg_train_loss, avg_train_acc*100, avg_valid_loss, avg_valid_acc*100, epoch_end-epoch_start))
        # Save if the model has best accuracy till now
        torch.save(model, dataset+'_model_'+str(epoch)+'.pt')
    return model, history
# Load pretrained ResNet50 Model
resnet50 = models.resnet50(pretrained=True)
#resnet50 = resnet50.to('cuda:0')

# Freeze model parameters
for param in resnet50.parameters():
    param.requires_grad = False
# Change the final layer of ResNet50 Model for Transfer Learning
fc_inputs = resnet50.fc.in_features
resnet50.fc = nn.Sequential(
    nn.Linear(fc_inputs, 256),
    nn.ReLU(),
    nn.Dropout(0.4),
    nn.Linear(256, num_classes),  # Since 10 possible outputs
    nn.LogSoftmax(dim=1)          # For using NLLLoss()
)
# Convert model to be used on GPU
# resnet50 = resnet50.to('cuda:0')
Error is this:
RuntimeError                              Traceback (most recent call last)
in ()
      6 # Train the model for 25 epochs
      7 num_epochs = 30
----> 8 trained_model, history = train_and_validate(resnet50, loss_func, optimizer, num_epochs)
      9
     10 torch.save(history, dataset+'_history.pt')

in train_and_validate(model, loss_criterion, optimizer, epochs)
     43
     44         # Compute loss
---> 45         loss = loss_criterion(outputs, labels)
     46
     47         # Backpropagate the gradients

~\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
    539             result = self._slow_forward(*input, **kwargs)
    540         else:
--> 541             result = self.forward(*input, **kwargs)
    542         for hook in self._forward_hooks.values():
    543             hook_result = hook(self, input, result)

~\Anaconda3\lib\site-packages\torch\nn\modules\loss.py in forward(self, input, target)
    202
    203     def forward(self, input, target):
--> 204         return F.nll_loss(input, target, weight=self.weight, ignore_index=self.ignore_index, reduction=self.reduction)
    205
    206

~\Anaconda3\lib\site-packages\torch\nn\functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
   1836                          .format(input.size(0), target.size(0)))
   1837     if dim == 2:
-> 1838         ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
   1839     elif dim == 4:
   1840         ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)

RuntimeError: Assertion `cur_target >= 0 && cur_target < n_classes' failed. at C:\Users\builder\AppData\Local\Temp\pip-req-build-0i480kur\aten\src\THNN/generic/ClassNLLCriterion.c:97
This happens when there are either incorrect labels in your dataset or the labels are 1-indexed (instead of 0-indexed). As the error message says, cur_target must be smaller than the total number of classes (10). To verify the issue, check the maximum and minimum label in your dataset. If the data is indeed 1-indexed, just subtract one from all annotations and you should be fine.
Note: another possible reason is that some -1 labels exist in the data. Some (especially older) datasets use -1 to mark a wrong/dubious label. If you find such labels, just discard them.
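A quick way to run that check, as a minimal sketch assuming the dataset yields (image, label) pairs:

# Scan the labels once; with 10 classes, NLLLoss expects values in [0, 9].
labels = [label for _, label in train_dataset]  # train_dataset is assumed
print("min label:", min(labels), "max label:", max(labels))

# If the labels turn out to be 1-indexed, shift them down by one, and
# discard any -1 entries used to mark dubious samples.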

System hangs after first epoch of training in PyTorch

So, I was trying to train a ResNet model in PyTorch using the ImageNet example in the GitHub repository.
Here's what my train method looks like (it is almost identical to the one in the example):
def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    args = get_args()

    # switch to train mode
    model.train()
    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        print(i)
        # data loading time
        data_time.update(time.time() - end)
        if cuda:
            target = target.cuda(async=True)
            input_var = torch.autograd.Variable(input).cuda()
        else:
            input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.item(), input.size(0))
        top1.update(prec1.item(), input.size(0))
        # top5.update(prec5.item(), input.size(0))

        # compute gradient and do optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print to console and write logs to tensorboard
        if i % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec#1 {top1.val:.3f} ({top1.avg:.3f})\t'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1, top5=top5))
            niter = epoch * len(train_loader) + i
            # writer.add_scalar('Train/Loss', losses.val, niter)
            # writer.add_scalar('Train/Prec#1', top1.val, niter)
            # writer.add_scalar('Train/Prec#5', top5.val, niter)
System Information:
GPU: Nvidia Titan XP
Memory: 32 GB
PyTorch: 0.4.0
When I run this code, training starts with epoch 0
Epoch: [0][0/108] Time 5.644 (5.644) Data 1.929 (1.929) Loss 6.9052 (6.9052) Prec#1 0.000 (0.000)
And then the remote server automatically disconnects. It happened five times.
And this is the data loader:
#Load the Data --> TRAIN
traindir = 'train'
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
train_dataset = datasets.ImageFolder(traindir, transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
normalize,
]))
train_loader = torch.utils.data.DataLoader(
train_dataset,
batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers,
pin_memory=cuda
)
# Load the data --> Validation
valdir = 'valid'
valid_loader = torch.utils.data.DataLoader(
datasets.ImageFolder(valdir, transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,
])),
batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers,
pin_memory=cuda
)
if args.evaluate:
validate(valid_loader, model, criterion, epoch=0)
return
# Start
for epoch in range(args.start_epoch, args.epochs):
adjust_learning_rate(optimizer, epoch)
# train for epoch
train(train_loader, model, criterion, optimizer, epoch)
# evaluate on valid
prec1 = validate(valid_loader, model, criterion, epoch)
# remember best prec1 and save checkpoint
is_best = prec1 > best_prec1
best_prec1 = max(prec1, best_prec1)
save_checkpoint({
'epoch': epoch + 1,
'arch': args.arch,
'state_dict': model.state_dict(),
'best_prec1': best_prec1,
'optimizer': optimizer.state_dict()
}, is_best)
With these parameters for the loader:
args.num_workers = 4
args.batch_size = 32
pin_memory = torch.cuda.is_available()
Is there something wrong in my approach?
This seems to be a bug in PyTorch's DataLoader.
Try args.num_workers = 0.
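Applied to the question's training loader, the workaround is a one-line change (a sketch; everything else stays the same):

# num_workers=0 loads batches in the main process, ruling out hangs caused
# by worker subprocesses in older DataLoader versions.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=args.batch_size, shuffle=True,
    num_workers=0,
    pin_memory=cuda,
)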
