RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same - machine-learning

I set my model and data to the same device, but it always raises an error like this:
RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same
The following is my training code. I hope you can help. Thanks!
def train(train_img_path, train_label_path, pths_path, interval, log_file):
    file_num = len(os.listdir(train_img_path))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net = EAST(extractor=extractor, geometry_mode=geometry_mode, pretrained=True)
    net = net.to(device)
    trainset = custom_dataset(train_img_path, train_label_path)
    train_loader = data.DataLoader(trainset, batch_size=batch_size,
                                   shuffle=True, num_workers=num_workers, drop_last=True)
    optimizer = optim.SGD(net.parameters(), lr=initial_lr, momentum=momentum, weight_decay=weight_decay_sgd)
    criterion = Loss(weight_geo, weight_angle, geometry_mode="RBOX")

    net.train()
    epoch_loss = 0.
    for epoch in range(max_epoch):
        epoch_time = time.time()
        for i, (img, score_gt, geo_gt, ignored_map) in enumerate(train_loader):
            start_time = time.time()
            img, score_gt, geo_gt, ignored_map = img.to(device), score_gt.to(device), \
                                                 geo_gt.to(device), ignored_map.to(device)
            score_pred, geo_pred = net(img)
            total_loss, score_loss, loss_AABB, loss_angle = criterion(score_pred, geo_pred, score_gt, geo_gt, ignored_map)
            epoch_loss += total_loss.item()
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

I suspect your loss function has some internal parameters of its own, so you should also move it to the device:
criterion = Loss(weight_geo, weight_angle, geometry_mode="RBOX").to(device)
It would be easier to spot the error if you provided the full traceback, indicating exactly which line raised the error.
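As a quick sanity check, here is a minimal sketch of the suggested fix together with a device audit; it assumes net and criterion are nn.Module instances as in the training code above, and simply lists anything still on the wrong device:

import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = net.to(device)
criterion = criterion.to(device)  # also moves any parameters/buffers registered inside the loss

# Report every parameter or buffer that did not end up on the target device
for name, t in list(net.named_parameters()) + list(net.named_buffers()):
    if t.device != device:
        print(f"{name} is still on {t.device}")

Whatever prints here is the tensor whose device disagrees with the input, and is worth tracing back to its definition.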

Related

TypeError: auroc() missing 1 required positional argument: 'task'

I was trying to fine-tune BERT base uncased on a small dataset of about 1.5k examples, which is quite small. While running
trainer.fit(model, data_module)
it goes into the 'model' for training, which is:
class ElectionTagger(pl.LightningModule):

    def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask)
        output = self.classifier(output.pooler_output)
        output = torch.sigmoid(output)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def training_epoch_end(self, outputs):
        labels = []
        predictions = []
        for output in outputs:
            for out_labels in output["labels"].detach().cpu():
                labels.append(out_labels)
            for out_predictions in output["predictions"].detach().cpu():
                predictions.append(out_predictions)
        labels = torch.stack(labels).int()
        predictions = torch.stack(predictions)
        for i, name in enumerate(LABEL_COLUMNS):
            class_roc_auc = auroc(predictions[:, i], labels[:, i])  ##### ERROR ARISES HERE #####
            self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=2e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps
        )
        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
                scheduler=scheduler,
                interval='step'
            )
        )
Instead of training, I got an error which says
TypeError: auroc() missing 1 required positional argument: 'task'
It would be great if anyone could provide a solution to this.
When using the auroc() function, you need to set the task argument to either 'binary', 'multiclass' or 'multilabel'. Alternatively, you can directly use one of binary_auroc(), multiclass_auroc() or multilabel_auroc() instead of auroc(). See the docs for more information.
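For illustration, a minimal sketch of the per-class loop with the task argument set, assuming torchmetrics >= 0.11 and using dummy tensors in place of the stacked predictions and labels from training_epoch_end:

import torch
from torchmetrics.functional.classification import auroc, binary_auroc

# Dummy multilabel outputs: 8 samples, 3 classes (sigmoid scores and integer targets)
predictions = torch.rand(8, 3)
labels = torch.randint(0, 2, (8, 3))

for i in range(predictions.shape[1]):
    class_roc_auc = auroc(predictions[:, i], labels[:, i], task="binary")
    # equivalently:
    # class_roc_auc = binary_auroc(predictions[:, i], labels[:, i])
    print(i, class_roc_auc)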

How do I solve my problem with the max_steps parameter in PyTorch?

I'm trying to train a model from this source code:
class mymodel(pl.LightningModule):
    def __init__(self, config, learning_rate=1e-4, max_steps=100000//2):
        super(mymodel, self).__init__()
        self.config = config
        self.save_hyperparameters()
        self.training_losses = []
        self.validation_losses = []
        self.max_steps = max_steps

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.hparams['learning_rate'])

    def forward(self, batch_dict):
        return answer_vector

    def calculate_metrics(self, prediction, labels):
        batch_size = len(prediction)
        ac_score = 0
        for (pred, gt) in zip(prediction, labels):
            ac_score += calculate_acc_score(pred.detach().cpu(), gt.detach().cpu())
        ac_score = ac_score / batch_size
        return ac_score

    def training_step(self, batch, batch_idx):
        answer_vector = self.forward(batch)
        loss = nn.CrossEntropyLoss()(answer_vector.reshape(-1, self.config['classes']), batch['answer'].reshape(-1))
        _, preds = torch.max(answer_vector, dim=-1)
        train_acc = self.calculate_metrics(preds, batch['answer'])
        train_acc = torch.tensor(train_acc)
        return loss

    def validation_step(self, batch, batch_idx):
        logits = self.forward(batch)
        loss = nn.CrossEntropyLoss()(logits.reshape(-1, self.config['classes']), batch['answer'].reshape(-1))
        _, preds = torch.max(logits, dim=-1)
        ## Validation Accuracy
        val_acc = self.calculate_metrics(preds.cpu(), batch['answer'].cpu())
        val_acc = torch.tensor(val_acc)
        ## Logging
        self.log('val_ce_loss', loss, prog_bar=True)
        self.log('val_acc', val_acc, prog_bar=True)
        return {'val_loss': loss, 'val_acc': val_acc}

    def optimizer_step(self, epoch_nb, batch_nb, optimizer, optimizer_i, opt_closure=None, on_tpu=False,
                       using_native_amp=False, using_lbfgs=False):
        ## Warmup for 1000 steps
        if self.trainer.global_step < 1000:
            lr_scale = min(1., float(self.trainer.global_step + 1) / 1000.)
            for pg in optimizer.param_groups:
                pg['lr'] = lr_scale * self.hparams.learning_rate
        ## Linear Decay
        else:
            for pg in optimizer.param_groups:
                pg['lr'] = polynomial(self.hparams.learning_rate, self.trainer.global_step, max_iter=self.max_steps)
        optimizer.step(opt_closure)
        optimizer.zero_grad()
Around the 5th epoch (maybe a bit earlier or later) I encountered an error that stopped training, so I increased max_steps. But when I increase max_steps (max_steps == 100K), I get loss > 100 and acc == 0 (see the attached screenshot of the loss/accuracy curves).
What should I change in the source code so the model keeps training without this problem?
Updates:
I see. It looks like your optimizer_step is actually acting as a scheduler, since it adjusts the AdamW learning rate by hand. You should instead define the scheduler in the configure_optimizers function. See https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html?highlight=configure_optimizers#configure-optimizers
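A rough sketch of what that could look like, keeping the question's 1000-step warmup and replacing the undefined polynomial() helper with a plain linear decay (warmup_steps and the decay shape are illustrative, not taken from the original code):

import torch

def configure_optimizers(self):
    optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams['learning_rate'])
    warmup_steps = 1000

    def lr_lambda(step):
        if step < warmup_steps:
            return float(step + 1) / warmup_steps  # linear warmup
        remaining = max(1, self.max_steps - warmup_steps)
        return max(0.0, float(self.max_steps - step) / remaining)  # linear decay to 0 at max_steps

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    return {
        "optimizer": optimizer,
        "lr_scheduler": {"scheduler": scheduler, "interval": "step"},
    }

With this in place you can drop the custom optimizer_step override entirely and let Lightning drive the optimizer and scheduler.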
===
old answer:
By "error", do you mean the line about val_ce_loss? If yes, that's not an error. It means the val_ce_loss of the current epoch is not within the top 1 of all epochs so far, so the checkpoint won't be saved to disk. Please refer to the save_top_k argument of the checkpoint callback: https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.ModelCheckpoint.html
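For reference, a minimal sketch of such a checkpoint callback, assuming the val_ce_loss metric name logged in the question (the trainer settings are illustrative):

from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint_callback = ModelCheckpoint(monitor="val_ce_loss", mode="min", save_top_k=1)
trainer = pl.Trainer(callbacks=[checkpoint_callback], max_epochs=10)

Raising save_top_k (or setting it to -1) keeps more checkpoints on disk instead of only the single best one.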

Validation loss is not changing at all (PyTorch)

I'm using PyTorch for the first time to train a sentiment analysis model with BERT's pre-trained model.
This is my classifier:
class SentimentClassifier2(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier2, self).__init__()
        D_in, H, D_out = 768, 200, 3
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.4)
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Linear(H, D_out)
        )

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        output = self.drop(pooled_output)
        logits = self.classifier(output)
        return logits
This is my optimizer/loss function (I'm doing only 20 epochs because it takes a while to train):
EPOCHS = 20

model2 = SentimentClassifier2(len(class_names))
model2 = model2.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=True)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)
Training & evaluation code:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)


def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)
My problem: the loss for the validation samples is not changing at all!
epoch1:______________________
Train loss 1.0145157482929346 accuracy 0.4185746994848311
Val loss 1.002384223589083 accuracy 0.4151087371232354
epoch2:______________________
Train loss 1.015038197996413 accuracy 0.41871780194619346
Val loss 1.002384223589083 accuracy 0.4151087371232354
epoch3:______________________
Train loss 1.014710763787351 accuracy 0.4188609044075558
Val loss 1.002384223589083 accuracy 0.4151087371232354
epoch4:______________________
Train loss 1.0139196826735648 accuracy 0.41909940850982635
Val loss 1.002384223589083 accuracy 0.4151087371232354
I don't understand what the problem is.
I would be grateful if someone could help me.
Maybe you can try the following ordering:
optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
scheduler.step()
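For context, a minimal sketch of that ordering inside the question's train_epoch loop (same variable names as above):

for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)

    optimizer.zero_grad()  # clear stale gradients before the backward pass
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    loss = loss_fn(outputs, targets)
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()

One more thing worth double-checking: the question builds the optimizer from model.parameters() while the network being trained and evaluated is model2; if those are two different objects, the evaluated model's weights are never updated, which would also explain a perfectly constant validation loss.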

Accuracy value goes up and down during the training process

After training the network I noticed that the accuracy goes up and down. Initially I thought it was caused by the learning rate, but it is set to quite a small value. Please check the attached screenshot.
[Plot: accuracy screenshot]
My network (in PyTorch) looks as follows:
class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.layer3 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.fc1 = nn.Linear(17*17*64, 512)
        self.fc2 = nn.Linear(512, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = out.view(out.size(0), -1)
        out = self.relu(self.fc1(out))
        out = self.fc2(out)
        out = torch.sigmoid(out)
        return out
I am using RMSprop as optimizer and BCELoss as criterion. The learning rate is set to 0.001
Here is the training process:
epochs = 15
itr = 1
p_itr = 100
model.train()
total_loss = 0
loss_list = []
acc_list = []

for epoch in range(epochs):
    for samples, labels in train_loader:
        samples, labels = samples.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(samples)
        labels = labels.unsqueeze(-1)
        labels = labels.float()
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        scheduler.step()
        if itr % p_itr == 0:
            pred = torch.argmax(output, dim=1)
            correct = pred.eq(labels)
            acc = torch.mean(correct.float())
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, acc))
            loss_list.append(total_loss/p_itr)
            acc_list.append(acc)
            total_loss = 0
        itr += 1
My dataset is quite small: 2000 training and 1000 validation samples (binary classification 0/1). I wanted to do an 80/20 split but I was asked to keep it like this. I was also thinking the architecture might be too complex for such a small dataset.
Any hints on what may cause such jumps during training?
Your code here is wrong: pred = torch.argmax(output, dim=1)
That line is for multiclass classification with cross-entropy loss.
Your task is binary classification, so the pred values are wrong. Change it to:
if itr % p_itr == 0:
    pred = torch.round(output)
    ....
You can change your optimizer to Adam, SGD, or RMSprop to find the one that helps your model converge faster.
Also change the forward() function:
def forward(self, x):
    out = self.layer1(x)
    out = self.layer2(out)
    out = self.layer3(out)
    out = out.view(out.size(0), -1)
    out = self.relu(self.fc1(out))
    out = self.fc2(out)
    return self.sigmoid(out)  # your forward is OK, but this is cleaner
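As a small self-contained check of the rounding-based accuracy, with dummy tensors shaped like the training loop's output and labels:

import torch

output = torch.sigmoid(torch.randn(8, 1))     # model output after sigmoid, shape (batch, 1)
labels = torch.randint(0, 2, (8, 1)).float()  # binary targets, same shape

pred = torch.round(output)                    # threshold at 0.5 -> 0/1 predictions
acc = pred.eq(labels).float().mean()          # fraction of correct predictions
print(acc.item())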

I have a trained TensorFlow model, how do I make predictions with it?

I have trained my model by calling the train_neural_network function, which trains the model and stores it; the accuracy comes to around 83%. The problem I'm facing is how to make predictions using my saved model: which variables do I restore, and how do I pass the input (in batches or all at once)?
def make_model(data, train_x):
    n_nodes_hl1 = 2000
    n_nodes_hl2 = 2000
    n_nodes_hl3 = 2000
    n_classes = 2  # number of classes

    hidden_1_layer = {'weights': tf.Variable(tf.truncated_normal([len(train_x[0]), n_nodes_hl1], stddev=0.1), name='weights'),
                      'biases': tf.Variable(tf.constant(0.1, shape=[n_nodes_hl1]), name='biases')}
    hidden_2_layer = {'weights': tf.Variable(tf.truncated_normal([n_nodes_hl1, n_nodes_hl2], stddev=0.1), name='weights'),
                      'biases': tf.Variable(tf.constant(0.1, shape=[n_nodes_hl2]), name='biases')}
    hidden_3_layer = {'weights': tf.Variable(tf.truncated_normal([n_nodes_hl2, n_nodes_hl3], stddev=0.1), name='weights'),
                      'biases': tf.Variable(tf.constant(0.1, shape=[n_nodes_hl3]), name='biases')}
    output_layer = {'weights': tf.Variable(tf.truncated_normal([n_nodes_hl3, n_classes], stddev=0.1), name='weights'),
                    'biases': tf.Variable(tf.constant(0.1, shape=[n_classes]), name='biases')}

    layer_1 = tf.add(tf.matmul(data, hidden_1_layer['weights']), hidden_1_layer['biases'])
    # now goes through an activation function - ReLU
    layer_1 = tf.nn.relu(layer_1)
    print("Layer 1 done!!")
    # input for layer 2 = result of activation for layer 1
    layer_2 = tf.add(tf.matmul(layer_1, hidden_2_layer['weights']), hidden_2_layer['biases'])
    layer_2 = tf.nn.relu(layer_2)
    print("Layer 2 done!!")
    layer_3 = tf.add(tf.matmul(layer_2, hidden_3_layer['weights']), hidden_3_layer['biases'])
    layer_3 = tf.nn.relu(layer_3)
    print("Layer 3 done!!")
    output = tf.matmul(layer_3, output_layer['weights'], name="output") + output_layer['biases']
    return output


def train_neural_network(train_x, train_y, test_x, test_y):
    tf.reset_default_graph()
    with tf.name_scope('input'):
        x = tf.placeholder('float', [None, len(train_x[0])], name='x_input')
        y = tf.placeholder('float', name='y-input')
    # Merge all the summaries and write them out to /tmp/mnist_logs (by default)
    prediction = make_model(x, train_x)
    print('model ready!!')
    with tf.name_scope('pred'):
        pred = tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y)
    with tf.name_scope('cost'):
        cost = tf.reduce_mean(pred)
    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer().minimize(cost, name='optimizer')
    tf.summary.scalar("cost", cost)

    n_epochs = 10
    batch_size = 100

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())  # initializes our variables. Session has now begun.
        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter('train/2/', sess.graph)
        test_writer = tf.summary.FileWriter('test/')

        for epoch in range(n_epochs):
            epoch_loss = 0  # we'll calculate the loss as we go
            i = 0
            while i < len(train_x):
                # we want to take batches (chunks); take a slice, then another slice
                start = i
                end = i + batch_size
                batch_x = np.array(train_x[start:end])
                batch_y = np.array(train_y[start:end])
                _, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
                if i % 200 == 0:
                    train_writer.add_summary(_, i)
                epoch_loss += c
                i += batch_size
            print('Epoch', epoch, 'completed out of', n_epochs, 'loss:', epoch_loss)

        with tf.name_scope('accuracy'):
            correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
            accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        tf.summary.scalar("accuracy", accuracy)
        print('Accuracy:', accuracy.eval({x: test_x, y: test_y}))

        saver = tf.train.Saver()
        tf_log = 'tf.log'
        saver.save(sess, "model3.ckpt")
    return accuracy
This is how I am making predictions, but it fails every time:
def test_neural_network(test_x):
    batch_size = 100
    i = 0
    batch_x = np.array(test_x[i:i+batch_size])
    tf.reset_default_graph()
    x = tf.placeholder('float', [len(batch_x), len(test_x[0])])
    y = tf.placeholder('float', [2])
    prediction = make_model(x, batch_x)
    # pred1 = tf.nn.softmax(logits=prediction)
    # weight = tf.get_variable("weights_3", shape=[len(batch_x), 2], initializer=tf.zeros_initializer)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, "model3.ckpt")
        p = tf.argmax(prediction, 1)
        print(p.eval({x: batch_x}))
This gives an array of shape (batch_size, 2); I expected values like [0, 1] or [1, 0] but I'm getting decimal values.
You have a problem because you launch a session on your "weight" variable, but in your case you want to know the output of your network. Try launching a session on your last layer instead.
How do I make predictions using my saved model? Which variable to restore and how to pass the input (in batches or whole at once)?
Several comments regarding your design. You don't have to rebuild the graph at test time, because it's saved right next to the session checkpoint. Take a look at this question.
With this, your code will be simplified a lot, because you don't have to keep the placeholders and cross-entropy loss function separately. Add the name to the softmax layer like this:
with tf.name_scope('pred'):
    pred = tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y, name='softmax')
After you've restored the graph, you can find the target operation by:
graph = sess.graph
pred = graph.get_operation_by_name("pred/softmax")
If your test data is not big, you can freely feed all of it at once, but if it's significantly larger than your batch size, you can easily run out of memory. In this case, you should use mini-batches for testing as well.
As for your test accuracy, there can be plenty of reasons for it, for instance overfitting. Update the question with the full code so that it can be reproduced.
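To make the "don't rebuild the graph" suggestion concrete, here is a hedged TF1-style sketch that restores the meta-graph saved next to model3.ckpt. The placeholder name input/x_input comes from the training code above, test_x is the array from the question, and the logits tensor name is an assumption: in make_model only the matmul is named "output", so it's worth giving the final pre-softmax output an explicit name when building the graph.

import numpy as np
import tensorflow as tf

tf.reset_default_graph()
with tf.Session() as sess:
    # Restore the graph definition and the trained variables together
    saver = tf.train.import_meta_graph("model3.ckpt.meta")
    saver.restore(sess, "model3.ckpt")

    graph = tf.get_default_graph()
    x = graph.get_tensor_by_name("input/x_input:0")  # placeholder defined in train_neural_network
    logits = graph.get_tensor_by_name("output:0")    # assumed name of the final logits op

    batch_x = np.array(test_x[:100])                 # one batch, or all rows if they fit in memory
    class_ids = sess.run(tf.argmax(logits, 1), feed_dict={x: batch_x})
    print(class_ids)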
