TQDM does not update epochs - tqdm

# Noise-prediction training loop: every step draws a random batch, noises it at random
# timesteps, and regresses the model output onto the noise that was added.
# NOTE(review): indentation was lost when this snippet was pasted, so the nesting of the
# statements below (in particular where each pbar.update call sits) cannot be confirmed.
for epoch in range(args.num_epochs):
model.train()
# print(f"Epoch {epoch}")
# NOTE(review): the bar's total is the number of rows in input_tensor_catted, while the
# step loop below runs len(input_tensor_catted) // args.batch_size times; if the bar is
# advanced once per step it can never reach its total - confirm the intended unit.
with tqdm(total=len(input_tensor_catted), unit="ba") as pbar:
pbar.set_description(f"Epoch {epoch}")
# NOTE(review): this advances the bar by one before any batch has been processed.
pbar.update(1)
# for step, batch in enumerate(train_dataloader):
for step in range(len(input_tensor_catted) // args.batch_size):
# Draw batch_size row indices uniformly at random, with replacement.
indices = torch.multinomial(torch.ones(len(input_tensor_catted)) / len(input_tensor_catted), args.batch_size, replacement=True)
clean_inputs = input_tensor_catted[indices, :]
clean_conditioning = original_cost_tensor_catted[indices, :].to(clean_inputs.device)
# clean_inputs = batch["input"]
noise_samples = torch.randn(clean_inputs.shape).to(clean_inputs.device)
bsz = clean_inputs.shape[0]
# One random integer timestep in [0, noise_scheduler.timesteps) per sample.
timesteps = torch.randint(0, noise_scheduler.timesteps, (bsz,), device=clean_inputs.device).long()
# add noise onto the clean images according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_images = noise_scheduler.training_step(clean_inputs, noise_samples, timesteps)
# Steps that are not a multiple of gradient_accumulation_steps only accumulate
# gradients, inside accelerator.no_sync(model).
if step % args.gradient_accumulation_steps != 0:
with accelerator.no_sync(model):
# from noisy images, predict epsilon
output = model(noisy_images, timesteps, clean_conditioning)
# MSE between the model output and the noise that was added
loss = F.mse_loss(output, noise_samples)
# Scale the loss by the number of accumulation steps
loss = loss / args.gradient_accumulation_steps
accelerator.backward(loss)
# Steps that are a multiple of gradient_accumulation_steps also clip, update and reset.
# NOTE(review): step 0 satisfies this condition, so the first optimizer update happens
# after a single micro-batch.
else:
output = model(noisy_images, timesteps, clean_conditioning)
# MSE between the model output and the noise that was added
loss = F.mse_loss(output, noise_samples)
loss = loss / args.gradient_accumulation_steps
accelerator.backward(loss)
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
pbar.update(1)
pbar.set_postfix(loss=loss.detach().item(), lr=optimizer.param_groups[0]["lr"])
# NOTE(review): a second optimizer.step() in addition to the one in the else-branch, with
# no zero_grad() after it; with the indentation lost it is unclear how often it runs.
optimizer.step()
This is my code.
This is an example of what is printed to the console:
The point is, the top counter (parallel with Epoch 1) only updates from 1/10000 to 10/10000 and then always stops, even when the epoch number is greater than 10.

You can do something like this every epoch
# Sketch (placeholders, not runnable as-is): wrap the data in new tqdm bars for every epoch.
for epoch in range(10):
trainBar = tqdm.tqdm(trainData)
valBar = tqdm.tqdm(valData)
# Iterating over the tqdm-wrapped data advances its bar; no manual update() call is used.
for batch, data in enumerate(trainBar):
do smthing
for batch, data in enumerate(valBar):
do smthing
This will create a new tqdm bar each epoch and you don't have to worry about resetting it.

Related

"RuntimeError: Expected target size" error for the nn.CrossEntropyLoss() function

I am trying to train a GPT-2 model to take in a tokenized/padded input and predict the output. My batch size is 32. My max length is 343. I believe that the 768 comes from the model. I cannot get the loss function to work properly though. The training loop keeps throwing me errors like this:
RuntimeError: Expected target size [32, 768], got [32, 343]
# Create a TensorDataset from input_ids and output_ids
dataset = TensorDataset(input_tensors, output_tensors)
#Constants
batch_size = 32
num_epochs = 20
# Create a DataLoader from the dataset
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Set the device to run on
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Define the model architecture
# NOTE(review): the shapes reported with this code show model(...)[0] as [32, 343, 768],
# i.e. the last dimension is 768 rather than a vocabulary size. The bare GPT2Model
# presumably returns hidden states; a variant with a language-modelling head (e.g.
# transformers.GPT2LMHeadModel) is probably what is needed here - confirm.
model = transformers.GPT2Model.from_pretrained('gpt2').to(device)
# Define the loss function
# Targets equal to 0 are ignored by the loss (presumably the padding id - confirm).
loss_function = nn.CrossEntropyLoss(ignore_index=0, reduction='mean')
# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Set the model to training mode
model.train()
print(f"input_tensors.shape before the loop: {input_tensors.shape}")
print(f"output_tensors.shape before the loop: {output_tensors.shape}")
# Loop over the number of epochs
for epoch in range(num_epochs):
# Initialize the epoch loss
epoch_loss = 0
# Loop over the data in the dataloader
# NOTE: the loop variables rebind the names of the full-dataset tensors used above.
for input_tensors, output_tensors in dataloader:
# Send the input and target tensors to the device
input_tensors = input_tensors.to(device)
# Cast the targets to 64-bit integers before moving them to the device
output_tensors = output_tensors.type(torch.LongTensor)
output_tensors = output_tensors.to(device)
# Zero gradients
optimizer.zero_grad()
# Begin Forward pass
logits = model(input_tensors)[0]
print(f"logits.shape: {logits.shape}")
print(f"input_tensors.shape: {input_tensors.shape}")
print(f"output_tensors.shape: {output_tensors.shape}")
# Compute the loss
# NOTE(review): nn.CrossEntropyLoss expects the class dimension at dim 1. With a
# (batch, seq, C) output and (batch, seq) targets it reads seq (343) as the class count
# and expects targets of size [32, 768], which matches the reported error. Flattening
# to logits.view(-1, C) and output_tensors.view(-1) is the usual remedy - confirm.
loss = loss_function(logits, output_tensors)
# Backward pass
loss.backward()
# Update the model parameters
optimizer.step()
# Add the loss to the epoch loss
epoch_loss += loss.item()
# Print the epoch loss (a sum of per-batch losses, not an average)
print(f'Epoch {epoch+1}: Loss = {epoch_loss}')
And the sizes of the tensors:
input_tensors.shape == torch.Size([2625, 343]) before the loop
output_tensors.shape == torch.Size([2625, 343]) before the loop
logits.shape == torch.Size([32, 343, 768])
input_tensors.shape == torch.Size([32, 343])
output_tensors.shape == torch.Size([32, 343])
I have tried squeezing/unsqueezing and reshaping both the logits and output_tensors. I think that's the right next step, but I can't figure out exactly what to change.

Training Loss When Resuming From a Checkpoint Explodes

I am trying to implement a function in my algorithm which allows me to resume training from a checkpoint. The problem is that when I resume training, my loss explodes by many orders of magnitude, from the order of 0.001 to the order of 1000. I suspect that the problem may be that when training is resumed, the learning rate is not being set properly.
Here is my training function:
def train_gray(epoch, data_loader, device, model, criterion, optimizer, i, path):
    """Train ``model`` for one epoch on noisy copies of the images in ``data_loader``.

    Args:
        epoch: Epoch number; used only in the progress message.
        data_loader: Iterable yielding ``(img, _)`` pairs; the second item is ignored.
        device: Device every image batch is moved to.
        model: Called as ``model(noisy_img, stand_dev)``; only channel 0 of its
            output is compared with the clean image.
        criterion: Loss, called as ``criterion(output, img)``.
        optimizer: Optimizer, stepped once per batch.
        i: Unused; kept so existing callers keep working.
        path: Unused; kept so existing callers keep working.

    Returns:
        The batch-size-weighted mean of the per-batch losses over the epoch,
        or 0.0 if the loader yielded no batches.
    """
    stand_dev = 0.0392  # noise level handed to both add_noise and the model
    running_loss = 0.0
    num_samples = 0
    for img, _ in data_loader:
        img = img.to(device)
        noisy_img = add_noise(img, stand_dev, device)
        # Keep only the first output channel, preserving the channel axis.
        output = model(noisy_img, stand_dev)[:, 0:1, :, :]
        loss = criterion(output, img)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_size = img.size(0)
        running_loss += loss.item() * batch_size
        num_samples += batch_size
    # The running sum is weighted by batch size, so it has to be normalised by
    # the number of samples; dividing by the number of batches would inflate
    # the reported loss by roughly the batch size.
    train_loss = running_loss / num_samples if num_samples else 0.0
    print('Epoch: {} Complete \tTraining Loss: {:.6f}'.format(
        epoch,
        train_loss
    ))
    return train_loss
And here is my main function that initialises my variables, loads a checkpoint, calls my training function, and saves a checkpoint after an epoch of training:
def main():
    """Build the denoiser, restore a checkpoint, and continue training from it.

    Creates a time-stamped output directory, builds the data loader, model,
    loss and optimizer, restores model and optimizer state from a hard-coded
    checkpoint, then trains through epoch ``num_epochs`` and writes a
    checkpoint after every epoch.

    Relies on a module-level ``device`` and on ``load_dataset``,
    ``UNetWithResnet50Encoder``, ``train_gray`` and ``checkpoint``.
    """
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    path = "/home/bledc/my_remote_folder/denoiser/models/{}_sigma_10_session2".format(current_time)
    os.mkdir(path)
    width = 256
    # height = 256
    num_epochs = 25
    batch_size = 4
    learning_rate = 0.0001
    data_loader = load_dataset(batch_size, width)
    model = UNetWithResnet50Encoder().to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=1e-5)
    ############################################################################################
    # UNCOMMENT CODE BELOW TO RESUME TRAINING FROM A MODEL
    model_path = "/home/bledc/my_remote_folder/denoiser/models/resnet_sigma_10/model_epoch_10.pt"
    # map_location keeps the load working when the checkpoint was written on a
    # different device than the one used now.
    save_point = torch.load(model_path, map_location=device)
    model.load_state_dict(save_point['model_state_dict'])
    optimizer.load_state_dict(save_point['optimizer_state_dict'])
    # A checkpoint is written only after its epoch has finished training, so
    # resuming must continue with the following epoch instead of repeating it.
    start_epoch = save_point['epoch'] + 1
    model.train()
    ############################################################################################
    for i in range(start_epoch, num_epochs+1):
        train_loss = train_gray(i, data_loader, device, model, criterion, optimizer, i, path)
        checkpoint(i, train_loss, model, optimizer, path)
    print("end")
Lastly, here is my function to save checkpoints:
def checkpoint(epoch, train_loss, model, optimizer, path):
    """Save a training snapshot as ``model_epoch_<epoch>.pt`` inside ``path``.

    The snapshot holds the epoch number, the model and optimizer state dicts,
    and the training loss passed in.
    """
    snapshot = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss': train_loss,
    }
    target = path+"/model_epoch_{}.pt".format(epoch)
    torch.save(snapshot, target)
    print("Epoch saved")
If my problem is that I am not saving my learning rate, how would I do this?
Any help would be greatly appreciated,
Clement
Update: I'm fairly certain that the problem lies in my pretrained model. I am saving the optimiser every epoch but the optimiser only holds information for the trainable layers. I hope to solve this soon and post a more thorough answer when I figure out how to save and load the entire model.

Accuracy changes drastically on changing from CPU to GPU

So I was working on the SIIM Melanoma classification data on Kaggle. While training a number of networks I found that when I trained them on CPU, accuracy seemed to be appropriate, around 0.75. On switching to GPU, the accuracy would oscillate in and around 0.5. What should I do about this? Here's a code snippet of the training loop. Model trained finally was resnext50.
# Training script for a binary classifier; model, criterion, adam, train_loader and
# accuracy_score come from elsewhere.
# NOTE(review): indentation was lost when this snippet was pasted, so the exact nesting of
# the statements after the batch loop cannot be confirmed.
import cv2
# The device is hard-coded to CPU here; the GPU variant is the commented-out model.cuda().
device = "cpu"
import torch.nn.functional as F
epochs=3
#model = torch.load("model.pt")
model.cpu()
#model.cuda()
print("======== Training for ", epochs, "epochs=============")
for epoch in range(epochs):
total_loss = 0
model.train()
print("Training.......")
print("======== EPOCH #",epoch,"=================")
tmp_acc = 0
for i,batch in enumerate(train_loader):
img,label = batch["images"],batch["labels"]
#img = img.permute(0,3,1,2)
#img = torch.Tensor(img)
# Cast the labels to float before moving them to the device
label = label.type(torch.FloatTensor)
img,label = img.to(device),label.to(device)
model.zero_grad()
op = model(img)
label_cpu = label.cpu().numpy()
# NOTE(review): the sigmoid output is what gets passed to criterion below; if criterion
# already applies a sigmoid itself (e.g. BCEWithLogitsLoss), it would be applied twice - confirm.
op = F.sigmoid(op)
output = op.detach().cpu().numpy()
# NOTE(review): output holds sigmoid values, not thresholded 0/1 predictions, and is
# passed as the first argument. If accuracy_score is scikit-learn's, it expects
# (y_true, y_pred) with discrete labels - confirm argument order and thresholding.
tmp_acc += accuracy_score(output,label_cpu)
loss = criterion(op,label)
# NOTE(review): this overwrites total_loss with the latest batch loss instead of
# accumulating it, so total_loss/i and avg_loss below are not averages over batches.
total_loss = loss.item()
loss.backward()
adam.step()
if(i%10==0 and i>0):
print("STEP: ",i, "of steps ",len(train_loader))
# i is the zero-based batch index, so i + 1 batches have been processed at this point
print("Current loss: ",total_loss/i)
print("Training Accuracy ",tmp_acc/i)
avg_loss = total_loss/len(train_loader)
print("The loss after ",epoch," epochs is ",avg_loss)
print("OP",op)
print("Label",label_cpu)
# Only the state dict is saved, always under the same filename
torch.save(model.state_dict(),"/kaggle/working/model.pt")

Facing this error while classifying Images, containing 10 classes in pytorch, in ResNet50. My code is:

This is the code I am implementing: I am using a subset of the CalTech256 dataset to classify images of 10 different kinds of animals. We will go over the dataset preparation, data augmentation and then steps to build the classifier.
def train_and_validate(model, loss_criterion, optimizer, epochs=25):
'''
Train the model and evaluate it on the validation data.
Reads these names from the enclosing module: train_data_loader, valid_data_loader,
device, train_data_size, valid_data_size and dataset (prefix of the saved model files).
Parameters
:param model: Model to train and validate
:param loss_criterion: Loss criterion to minimize
:param optimizer: Optimizer used to update the model parameters
:param epochs: Number of epochs (default=25)
Returns
model: The model object that was passed in, after training (no best-accuracy selection is done)
history: (list): [avg_train_loss, avg_valid_loss, avg_train_acc, avg_valid_acc] entries, appended once the averages are computed
'''
start = time.time()
history = []
# NOTE(review): best_acc is initialised here but never read or updated below.
best_acc = 0.0
for epoch in range(epochs):
epoch_start = time.time()
print("Epoch: {}/{}".format(epoch+1, epochs))
# Set to training mode
model.train()
# Loss and Accuracy within the epoch
train_loss = 0.0
train_acc = 0.0
valid_loss = 0.0
valid_acc = 0.0
for i, (inputs, labels) in enumerate(train_data_loader):
inputs = inputs.to(device)
labels = labels.to(device)
# Clean existing gradients
optimizer.zero_grad()
# Forward pass - compute outputs on input data using the model
outputs = model(inputs)
# Compute loss
loss = loss_criterion(outputs, labels)
# Backpropagate the gradients
loss.backward()
# Update the parameters
optimizer.step()
# Weight the batch loss by the batch size and add it to train_loss
train_loss += loss.item() * inputs.size(0)
# Compute the accuracy: the predicted class is the index of the largest output along dim 1
ret, predictions = torch.max(outputs.data, 1)
correct_counts = predictions.eq(labels.data.view_as(predictions))
# Convert correct_counts to float and then compute the mean
acc = torch.mean(correct_counts.type(torch.FloatTensor))
# Weight the batch accuracy by the batch size and add it to train_acc
train_acc += acc.item() * inputs.size(0)
#print("Batch number: {:03d}, Training: Loss: {:.4f}, Accuracy: {:.4f}".format(i, loss.item(), acc.item()))
# Validation - No gradient tracking needed
with torch.no_grad():
# Set to evaluation mode
model.eval()
# Validation loop
for j, (inputs, labels) in enumerate(valid_data_loader):
inputs = inputs.to(device)
labels = labels.to(device)
# Forward pass - compute outputs on input data using the model
outputs = model(inputs)
# Compute loss
loss = loss_criterion(outputs, labels)
# Weight the batch loss by the batch size and add it to valid_loss
valid_loss += loss.item() * inputs.size(0)
# Calculate validation accuracy
ret, predictions = torch.max(outputs.data, 1)
correct_counts = predictions.eq(labels.data.view_as(predictions))
# Convert correct_counts to float and then compute the mean
acc = torch.mean(correct_counts.type(torch.FloatTensor))
# Weight the batch accuracy by the batch size and add it to valid_acc
valid_acc += acc.item() * inputs.size(0)
#print("Validation Batch number: {:03d}, Validation: Loss: {:.4f}, Accuracy: {:.4f}".format(j, loss.item(), acc.item()))
# Find average training loss and training accuracy (normalised by train_data_size)
avg_train_loss = train_loss/train_data_size
avg_train_acc = train_acc/train_data_size
# Find average validation loss and validation accuracy (normalised by valid_data_size)
avg_valid_loss = valid_loss/valid_data_size
avg_valid_acc = valid_acc/valid_data_size
history.append([avg_train_loss, avg_valid_loss, avg_train_acc, avg_valid_acc])
epoch_end = time.time()
print("Epoch : {:03d}, Training: Loss: {:.4f}, Accuracy: {:.4f}%, \n\t\tValidation : Loss : {:.4f}, Accuracy: {:.4f}%, Time: {:.4f}s".format(epoch, avg_train_loss, avg_train_acc*100, avg_valid_loss, avg_valid_acc*100, epoch_end-epoch_start))
# Save the whole model object under an epoch-numbered filename; the save is unconditional
# (there is no comparison against best_acc)
torch.save(model, dataset+'_model_'+str(epoch)+'.pt')
return model, history
# Load pretrained ResNet50 Model
resnet50 = models.resnet50(pretrained=True)
# Freeze the pretrained parameters; the layers of the new head created below are
# built afterwards and therefore stay trainable
for param in resnet50.parameters():
    param.requires_grad = False
# Change the final layer of ResNet50 Model for Transfer Learning.
# This must be done exactly once: after the replacement resnet50.fc is an
# nn.Sequential, which has no in_features attribute to read.
fc_inputs = resnet50.fc.in_features
resnet50.fc = nn.Sequential(
    nn.Linear(fc_inputs, 256),
    nn.ReLU(),
    nn.Dropout(0.4),
    nn.Linear(256, num_classes),  # Since 10 possible outputs
    nn.LogSoftmax(dim=1),  # For using NLLLoss()
)
# Convert model to be used on GPU
# resnet50 = resnet50.to('cuda:0')
Error is this:
RuntimeError Traceback (most recent call
last) in ()
6 # Train the model for 25 epochs
7 num_epochs = 30
----> 8 trained_model, history = train_and_validate(resnet50, loss_func, optimizer, num_epochs)
9
10 torch.save(history, dataset+'_history.pt')
in train_and_validate(model,
loss_criterion, optimizer, epochs)
43
44 # Compute loss
---> 45 loss = loss_criterion(outputs, labels)
46
47 # Backpropagate the gradients
~\Anaconda3\lib\site-packages\torch\nn\modules\module.py in
call(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
~\Anaconda3\lib\site-packages\torch\nn\modules\loss.py in
forward(self, input, target)
202
203 def forward(self, input, target):
--> 204 return F.nll_loss(input, target, weight=self.weight, ignore_index=self.ignore_index, reduction=self.reduction)
205
206
~\Anaconda3\lib\site-packages\torch\nn\functional.py in
nll_loss(input, target, weight, size_average, ignore_index, reduce,
reduction) 1836 .format(input.size(0),
target.size(0))) 1837 if dim == 2:
-> 1838 ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index) 1839 elif dim == 4: 1840 ret = torch._C._nn.nll_loss2d(input, target,
weight, _Reduction.get_enum(reduction), ignore_index)
RuntimeError: Assertion `cur_target >= 0 && cur_target < n_classes'
failed. at
C:\Users\builder\AppData\Local\Temp\pip-req-build-0i480kur\aten\src\THNN/generic/ClassNLLCriterion.c:97
This happens when there are either incorrect labels in your dataset, or the labels are 1-indexed (instead of 0-indexed). As the error message indicates, cur_target must be non-negative and smaller than the total number of classes (10). To verify the issue, check the maximum and minimum label in your dataset. If the data is indeed 1-indexed, just subtract one from every label and you should be fine.
Note: another possible reason is that there are some -1 labels in the data. Some (especially older) datasets use -1 as an indication of a wrong/dubious label. If you find such labels, just discard them.

TensorFlow learning rate decay - how to properly supply the step number for decay?

I am training my deep network in TensorFlow and I am trying to use a learning rate decay with it. As far as I see I should use train.exponential_decay function for that - it will calculate the proper learning rate value for current training step using various parameters. I just need to provide it with a step which is performed right now. I suspected I should use tf.placeholder(tf.int32) as usual when I need to provide something into the network, but seems like I am wrong. When I do this I get the below error:
TypeError: Input 'ref' of 'AssignAdd' Op requires l-value input
What am I doing wrong? Unfortunately, I haven't managed to find a good example of network training with decay. My whole code is below. The network has 2 hidden ReLU layers, an L2 penalty on the weights, and dropout on both hidden layers.
#We try the following - 2 ReLU layers
#Dropout on both of them
#Also L2 regularization on them
#and learning rate decay also
#batch size for SGD
batch_size = 128
#beta parameter for L2 loss
beta = 0.001
#that's how many hidden neurons we want
num_hidden_neurons = 1024
#learning rate decay
#starting value, number of steps decay is performed,
#size of the decay
start_learning_rate = 0.05
decay_steps = 1000
decay_size = 0.95
#building tensorflow graph
graph = tf.Graph()
with graph.as_default():
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32,
shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
#now let's build our first hidden layer
#its weights
hidden_weights_1 = tf.Variable(
tf.truncated_normal([image_size * image_size, num_hidden_neurons]))
hidden_biases_1 = tf.Variable(tf.zeros([num_hidden_neurons]))
#now the layer 1 itself. It multiplies data by weights, adds biases
#and takes ReLU over result
hidden_layer_1 = tf.nn.relu(tf.matmul(tf_train_dataset, hidden_weights_1) + hidden_biases_1)
#add dropout on hidden layer 1
#keep_prob is fed at run time and passed to tf.nn.dropout
#(the training loop below feeds 0.5)
keep_prob = tf.placeholder("float")
hidden_layer_drop_1 = tf.nn.dropout(hidden_layer_1, keep_prob)
#now let's build our second hidden layer
#its weights
hidden_weights_2 = tf.Variable(
tf.truncated_normal([num_hidden_neurons, num_hidden_neurons]))
hidden_biases_2 = tf.Variable(tf.zeros([num_hidden_neurons]))
#now the layer 2 itself. It multiplies data by weights, adds biases
#and takes ReLU over result
hidden_layer_2 = tf.nn.relu(tf.matmul(hidden_layer_drop_1, hidden_weights_2) + hidden_biases_2)
#add dropout on hidden layer 2, using the same keep_prob placeholder
hidden_layer_drop_2 = tf.nn.dropout(hidden_layer_2, keep_prob)
#time to go for output linear layer
#out weights connect hidden neurons to output labels
#biases are added to output labels
out_weights = tf.Variable(
tf.truncated_normal([num_hidden_neurons, num_labels]))
out_biases = tf.Variable(tf.zeros([num_labels]))
#compute output
#notice that upon training we use the switched off activations
#i.e. the variation of hidden_layer with the dropout active
out_layer = tf.matmul(hidden_layer_drop_2,out_weights) + out_biases
#our real output is a softmax of prior result
#and we also compute its cross-entropy to get our loss
#Notice - we introduce our L2 here
#(as parenthesised, the L2 terms sit inside the reduce_mean call and cover
#both the weights and the biases of every layer)
loss = (tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
out_layer, tf_train_labels) +
beta*tf.nn.l2_loss(hidden_weights_1) +
beta*tf.nn.l2_loss(hidden_biases_1) +
beta*tf.nn.l2_loss(hidden_weights_2) +
beta*tf.nn.l2_loss(hidden_biases_2) +
beta*tf.nn.l2_loss(out_weights) +
beta*tf.nn.l2_loss(out_biases)))
#step counter used for the decay
#NOTE(review): global_step is a placeholder here, yet it is also handed to
#minimize(..., global_step=global_step) below, which has to increment it; this is
#what the reported "Input 'ref' of 'AssignAdd' Op requires l-value input" error
#refers to. The value is additionally fed from the Python loop counter via feed_dict.
global_step = tf.placeholder(tf.int32)
#compute current learning rate
learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, decay_steps, decay_size)
#use it in optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
#nice, now let's calculate the predictions on each dataset for evaluating the
#performance so far
# Predictions for the training, validation, and test data.
# The validation and test paths reuse the same weights but contain no dropout op.
train_prediction = tf.nn.softmax(out_layer)
valid_relu_1 = tf.nn.relu( tf.matmul(tf_valid_dataset, hidden_weights_1) + hidden_biases_1)
valid_relu_2 = tf.nn.relu( tf.matmul(valid_relu_1, hidden_weights_2) + hidden_biases_2)
valid_prediction = tf.nn.softmax( tf.matmul(valid_relu_2, out_weights) + out_biases)
test_relu_1 = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights_1) + hidden_biases_1)
test_relu_2 = tf.nn.relu( tf.matmul( test_relu_1, hidden_weights_2) + hidden_biases_2)
test_prediction = tf.nn.softmax(tf.matmul(test_relu_2, out_weights) + out_biases)
#now is the actual training on the ANN we built
#we will run it for some number of steps and evaluate the progress after
#every 500 steps
#number of steps we will train our ANN
num_steps = 3001
#actual training
with tf.Session(graph=graph) as session:
tf.initialize_all_variables().run()
print("Initialized")
for step in range(num_steps):
# Pick an offset within the training data, which has been randomized.
# Note: we could use better randomization across epochs.
# The modulo wraps the offset so a full batch always fits inside the training set.
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
# Generate a minibatch.
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
# Prepare a dictionary telling the session where to feed the minibatch.
# The key of the dictionary is the placeholder node of the graph to be fed,
# and the value is the numpy array to feed to it.
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : 0.5, global_step: step}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
# Report minibatch and validation metrics every 500 steps
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
Instead of using a placeholder for global_step, try using a Variable.
# Step counter that minimize(..., global_step=global_step) increments on each run.
# It is bookkeeping rather than a model weight, so it is marked non-trainable.
global_step = tf.Variable(0, trainable=False)
You will have to remove global_step from the feed_dict. Note that you don't have to increment global_step manually: because it is passed to minimize(loss, global_step=global_step), TensorFlow increments it automatically every time the optimizer op is run.

Resources