How to solve my problem of max_step parameter in pytorch?

How to solve my problem of max_step parameter in pytorch? - machine-learning

I'm trying to train source code.
class mymodel(pl.LightningModule):
def __init__(self, config , learning_rate = 1e-4, max_steps = 100000//2):
super(mymodel, self).__init__()
self.config = config
self.save_hyperparameters()
self.training_losses = []
self.validation_losses = []
self.max_steps = max_steps
def configure_optimizers(self):
return torch.optim.AdamW(self.parameters(), lr = self.hparams['learning_rate'])
def forward(self, batch_dict):
return answer_vector
def calculate_metrics(self, prediction, labels):
batch_size = len(prediction)
ac_score = 0
for (pred, gt) in zip(prediction, labels):
ac_score+= calculate_acc_score(pred.detach().cpu(), gt.detach().cpu())
ac_score = ac_score/batch_size
return ac_score
def training_step(self, batch, batch_idx):
answer_vector = self.forward(batch)
loss = nn.CrossEntropyLoss()(answer_vector.reshape(-1,self.config['classes']), batch['answer'].reshape(-1))
_, preds = torch.max(answer_vector, dim = -1)
train_acc = self.calculate_metrics(preds, batch['answer'])
train_acc = torch.tensor(train_acc)
return loss
def validation_step(self, batch, batch_idx):
logits = self.forward(batch)
loss = nn.CrossEntropyLoss()(logits.reshape(-1,self.config['classes']), batch['answer'].reshape(-1))
_, preds = torch.max(logits, dim = -1)
## Validation Accuracy
val_acc = self.calculate_metrics(preds.cpu(), batch['answer'].cpu())
val_acc = torch.tensor(val_acc)
## Logging
self.log('val_ce_loss', loss, prog_bar = True)
self.log('val_acc', val_acc, prog_bar = True)
return {'val_loss': loss, 'val_acc': val_acc}
def optimizer_step(self, epoch_nb, batch_nb, optimizer, optimizer_i, opt_closure = None, on_tpu=False,
using_native_amp=False, using_lbfgs=False):
## Warmup for 1000 steps
if self.trainer.global_step < 1000:
lr_scale = min(1., float(self.trainer.global_step + 1) / 1000.)
for pg in optimizer.param_groups:
pg['lr'] = lr_scale * self.hparams.learning_rate
## Linear Decay
else:
for pg in optimizer.param_groups:
pg['lr'] = polynomial(self.hparams.learning_rate, self.trainer.global_step, max_iter = self.max_steps)
optimizer.step(opt_closure)
optimizer.zero_grad()
In 5'th epoch (maybe less or more) I encountered error that stop training. so I increase max_step . But when I increase max_step(max_step==100K) I have this problem with loss and acc that
loss>100 && acc==0 .I attach screen of this problem.
enter image description here
What changing I should do in source code to continue training model without this problem?

Updates:
I see. It looks like your optimizer_step is actually for "scheduler," where it messes with the AdamW learning rate. You should directly apply the scheduler to configure_optimizers function. See https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html?highlight=configure_optimizers#configure-optimizers
===
old answer:
By error, do you mean the line of val_ce_loss? If yes, that's not an error. It means the val_ce_loss of the current epoch is not within the top1 of historical epochs so the checkpoint won't be saved to the disk. Please refer to the argument of save_top_k in the checkpoint callback. https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.ModelCheckpoint.html

Related

TypeError: auroc() missing 1 required positional argument: 'task'

I was trying to fine tune BERT base uncased on a small dataset of 1.5k fields which is quiet less however while running
trainer.fit(model, data_module)
when it goes to the 'model' for training which is:
class ElectionTagger(pl.LightningModule):
def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
super().__init__()
self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
self.n_training_steps = n_training_steps
self.n_warmup_steps = n_warmup_steps
self.criterion = nn.BCELoss()
def forward(self, input_ids, attention_mask, labels=None):
output = self.bert(input_ids, attention_mask=attention_mask)
output = self.classifier(output.pooler_output)
output = torch.sigmoid(output)
loss = 0
if labels is not None:
loss = self.criterion(output, labels)
return loss, output
def training_step(self, batch, batch_idx):
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
labels = batch["labels"]
loss, outputs = self(input_ids, attention_mask, labels)
self.log("train_loss", loss, prog_bar=True, logger=True)
return {"loss": loss, "predictions": outputs, "labels": labels}
def validation_step(self, batch, batch_idx):
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
labels = batch["labels"]
loss, outputs = self(input_ids, attention_mask, labels)
self.log("val_loss", loss, prog_bar=True, logger=True)
return loss
def test_step(self, batch, batch_idx):
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
labels = batch["labels"]
loss, outputs = self(input_ids, attention_mask, labels)
self.log("test_loss", loss, prog_bar=True, logger=True)
return loss
def training_epoch_end(self, outputs):
labels = []
predictions = []
for output in outputs:
for out_labels in output["labels"].detach().cpu():
labels.append(out_labels)
for out_predictions in output["predictions"].detach().cpu():
predictions.append(out_predictions)
labels = torch.stack(labels).int()
predictions = torch.stack(predictions)
for i, name in enumerate(LABEL_COLUMNS):
class_roc_auc = auroc(predictions[:, i], labels[:, i]) ##### ERROR ARISES HERE###
self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)
def configure_optimizers(self):
optimizer = AdamW(self.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=self.n_warmup_steps,
num_training_steps=self.n_training_steps
)
return dict(
optimizer=optimizer,
lr_scheduler=dict(
scheduler=scheduler,
interval='step'
)
)
Instead of implementing, I got an error which says
TypeError: auroc() missing 1 required positional argument: 'task'
It would be great if anyone could provide a solution to this.

in case of using auroc() function, you need to set the task argument to either 'binary', 'multiclass' or 'multilabel'. Or, you can use directly one of the binary_auroc(), multiclass_auroc() or multilabel_auroc() instead of auroc(). See docs for more information.

How to change the directory of mlflow logs?

I am using MLflow to log the metrics but I want to change the default saving logs directory. So, instead of writing log files besides my main file, I want to store them to /path/outputs/lg . I don't know how to change it. I use it without in the Model.
import os
from time import time
import mlflow
import numpy as np
import torch
import tqdm
# from segmentation_models_pytorch.utils import metrics
from AICore.emergency_landing.metrics import IoU, F1
from AICore.emergency_landing.utils import AverageMeter
from AICore.emergency_landing.utils import TBLogger
class Model:
def __init__(self, model, num_classes=5, ignore_index=0, optimizer=None, scheduler=None, criterion=None,
device=None, epochs=30, train_loader=None, val_loader=None, tb_logger: TBLogger = None,
logger=None,
best_model_path=None,
model_check_point_path=None,
load_from_best_model=None,
load_from_model_checkpoint=None,
early_stopping=None,
debug=False):
self.debug = debug
self.early_stopping = {
'init': early_stopping,
'changed': 0
}
self.optimizer = optimizer
self.scheduler = scheduler
self.criterion = criterion
self.device = device
self.epochs = epochs
self.train_loader = train_loader
self.val_loader = val_loader
self.model = model.to(device)
self.tb_logger = tb_logger
self.logger = logger
self.best_loss = np.Inf
if not os.path.exists(best_model_path):
os.makedirs(best_model_path)
self.best_model_path = best_model_path
if not os.path.exists(model_check_point_path):
os.makedirs(model_check_point_path)
self.model_check_point_path = model_check_point_path
self.load_from_best_model = load_from_best_model
self.load_from_model_checkpoint = load_from_model_checkpoint
if self.load_from_best_model is not None:
self.load_model(path=self.load_from_best_model)
if self.load_from_model_checkpoint is not None:
self.load_model_checkpoint(path=self.load_from_model_checkpoint)
self.train_iou = IoU(num_classes=num_classes, ignore_index=ignore_index)
self.val_iou = IoU(num_classes=num_classes, ignore_index=ignore_index)
self.test_iou = IoU(num_classes=num_classes, ignore_index=ignore_index)
self.train_f1 = F1(num_classes=num_classes, ignore_index=ignore_index, mdmc_average='samplewise')
self.val_f1 = F1(num_classes=num_classes, ignore_index=ignore_index, mdmc_average='samplewise')
self.test_f1 = F1(num_classes=num_classes, ignore_index=ignore_index, mdmc_average='samplewise')
def metrics(self, is_train=True):
if is_train:
train_losses = AverageMeter('Training Loss', ':.4e')
train_iou = AverageMeter('Training iou', ':6.2f')
train_f_score = AverageMeter('Training F_score', ':6.2f')
return train_losses, train_iou, train_f_score
else:
val_losses = AverageMeter('Validation Loss', ':.4e')
val_iou = AverageMeter('Validation mean iou', ':6.2f')
val_f_score = AverageMeter('Validation F_score', ':6.2f')
return val_losses, val_iou, val_f_score
def fit(self):
self.logger.info("\nStart training\n\n")
start_training_time = time()
with mlflow.start_run():
for e in range(self.epochs):
start_training_epoch_time = time()
self.model.train()
train_losses_avg, train_iou_avg, train_f_score_avg = self.metrics(is_train=True)
with tqdm.tqdm(self.train_loader, unit="batch") as tepoch:
tepoch.set_description(f"Epoch {e}")
for image, target in tepoch:
# Transfer Data to GPU if available
image = image.to(self.device)
target = target.to(self.device)
# Clear the gradients
self.optimizer.zero_grad()
# Forward Pass
# out = self.model(image)['out']
# if unet == true => remove ['out']
out = self.model(image)
# Find the Loss
loss = self.criterion(out, target)
# Calculate Loss
train_losses_avg.update(loss.item(), image.size(0))
# Calculate gradients
loss.backward()
# Update Weights
self.optimizer.step()
iou = self.train_iou(out.cpu(), target.cpu()).item()
train_iou_avg.update(iou)
f1_score = self.train_f1(out.cpu(), target.cpu()).item()
train_f_score_avg.update(f1_score)
tepoch.set_postfix(loss=train_losses_avg.avg,
iou=train_iou_avg.avg,
f_score=train_f_score_avg.avg)
if self.debug:
break
self.tb_logger.log(log_type='criterion/training', value=train_losses_avg.avg, epoch=e)
self.tb_logger.log(log_type='iou/training', value=train_iou_avg.avg, epoch=e)
self.tb_logger.log(log_type='f_score/training', value=train_f_score_avg.avg, epoch=e)
mlflow.log_metric('criterion/training', train_losses_avg.avg, step=e)
mlflow.log_metric('iou/training', train_iou_avg.avg, step=e)
mlflow.log_metric('f_score/training', train_f_score_avg.avg, step=e)
end_training_epoch_time = time() - start_training_epoch_time
print('\n')
self.logger.info(
f'Training Results - [{end_training_epoch_time:.3f}s] Epoch: {e}:'
f' f_score: {train_f_score_avg.avg:.3f},'
f' IoU: {train_iou_avg.avg:.3f},'
f' Loss: {train_losses_avg.avg:.3f}')
# validation step
val_loss = self.evaluation(e)
# apply scheduler
if self.scheduler:
self.scheduler.step()
# early stopping
if self.early_stopping['init'] >= self.early_stopping['changed']:
self._early_stopping_model(val_loss=val_loss)
else:
print(f'The model can not learn more, Early Stopping at epoch[{e}]')
break
# save best model
if self.best_model_path is not None:
self._best_model(val_loss=val_loss, path=self.best_model_path)
# model check points
if self.model_check_point_path is not None:
self.save_model_check_points(path=self.model_check_point_path, epoch=e, net=self.model,
optimizer=self.optimizer, loss=self.criterion,
avg_loss=train_losses_avg.avg)
# log mlflow
if self.scheduler:
mlflow.log_param("get_last_lr", self.scheduler.get_last_lr())
mlflow.log_param("scheduler", self.scheduler.state_dict())
self.tb_logger.flush()
if self.debug:
break
end_training_time = time() - start_training_time
print(f'Finished Training after {end_training_time:.3f}s')
self.tb_logger.close()
def evaluation(self, epoch):
print('Validating...')
start_validation_epoch_time = time()
self.model.eval() # Optional when not using Model Specific layer
with torch.no_grad():
val_losses_avg, val_iou_avg, val_f_score_avg = self.metrics(is_train=False)
with tqdm.tqdm(self.val_loader, unit="batch") as tepoch:
for image, target in tepoch:
# Transfer Data to GPU if available
image = image.to(self.device)
target = target.to(self.device)
# out = self.model(image)['out']
# if unet == true => remove ['out']
out = self.model(image)
# Find the Loss
loss = self.criterion(out, target)
# Calculate Loss
val_losses_avg.update(loss.item(), image.size(0))
iou = self.val_iou(out.cpu(), target.cpu()).item()
val_iou_avg.update(iou)
f1_score = self.val_f1(out.cpu(), target.cpu()).item()
val_f_score_avg.update(f1_score)
tepoch.set_postfix(loss=val_losses_avg.avg,
iou=val_iou_avg.avg,
f_score=val_f_score_avg.avg)
if self.debug:
break
print('\n')
self.tb_logger.log(log_type='criterion/validation', value=val_losses_avg.avg, epoch=epoch)
self.tb_logger.log(log_type='iou/validation', value=val_iou_avg.avg, epoch=epoch)
self.tb_logger.log(log_type='f_score/validation', value=val_f_score_avg.avg, epoch=epoch)
mlflow.log_metric('criterion/validation', val_losses_avg.avg, step=epoch)
mlflow.log_metric('iou/validation', val_iou_avg.avg, step=epoch)
mlflow.log_metric('f_score/validation', val_f_score_avg.avg, step=epoch)
end_validation_epoch_time = time() - start_validation_epoch_time
self.logger.info(
f'validation Results - [{end_validation_epoch_time:.3f}s] Epoch: {epoch}:'
f' f_score: {val_f_score_avg.avg:.3f},'
f' IoU: {val_iou_avg.avg:.3f},'
f' Loss: {val_losses_avg.avg:.3f}')
print('\n')
return val_losses_avg.avg
def _save_model(self, name, path, params):
torch.save(params, path)
def _early_stopping_model(self, val_loss):
if self.best_loss < val_loss:
self.early_stopping['changed'] += 1
else:
self.early_stopping['changed'] = 0
def _best_model(self, val_loss, path):
if self.best_loss > val_loss:
self.best_loss = val_loss
name = f'/best_model_loss_{self.best_loss:.2f}'.replace('.', '_')
self._save_model(name, path=f'{path}/{name}.pt', params={
'model_state_dict': self.model.state_dict(),
})
print(f'The best model is saved with criterion: {self.best_loss:.2f}')
def save_model_check_points(self, path, epoch, net, optimizer, loss, avg_loss):
name = f'/model_epoch_{epoch}_loss_{avg_loss:.2f}'.replace('.', '_')
self._save_model(name, path=f'{path}/{name}.pt', params={
'epoch': epoch,
'model_state_dict': net.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'criterion': loss,
})
print(f'model checkpoint is saved at model_epoch_{epoch}_loss_{avg_loss:.2f}')
def load_model_checkpoint(self, path):
checkpoint = torch.load(path)
self.model.load_state_dict(checkpoint['model_state_dict'])
self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
self.criterion = checkpoint['criterion']
return epoch
def load_model(self, path):
best_model = torch.load(path)
self.model.load_state_dict(best_model['model_state_dict'])

The solution is:
mlflow.set_tracking_uri(uri=f'file://{hydra.utils.to_absolute_path("../output/mlruns")}')
exp = mlflow.get_experiment_by_name(name='Emegency_landing')
if not exp:
experiment_id = mlflow.create_experiment(name='Emegency_landing',
artifact_location=f'file://{hydra.utils.to_absolute_path("../output/mlruns")}')
else:
experiment_id = exp.experiment_id
And then you should pass the experiment Id to:
with mlflow.start_run(experiment_id=experiment_id):
pass
If you don't mention the /path/mlruns, when you run the command of mlflow ui, it will create another folder automatically named mlruns. so, pay attention to this point to have the same name as mlruns.

Why is the mean squared error increasing over epochs?

I am training a neuron network, and I encounter this phenomenon which is the loss is decreasing while the mse metric is increasing. I still cannot figure it out the problem.
Here is my custom mean squared error code
class custom_MSE(tf.keras.metrics.Metric):
def __init__(self, name='custom_mse', **kwargs):
super(custom_MSE, self).__init__(name=name, **kwargs)
self.true_positives = self.add_weight(name='tp', initializer='zeros')
def update_state(self, y_true, y_pred, sample_weight=None):
y_true = tf.convert_to_tensor(y_true)
y_pred = tf.convert_to_tensor(y_pred)
batch_size = tf.shape(y_true)[0]
y_h = int(y_true.shape[1]//4)
y_true_reshape = tf.reshape(y_true,shape=(batch_size,y_h,4))
y_pred_reshape = tf.reshape(y_pred,shape=(batch_size,y_h,4))
y_true_ = y_true_reshape[:,:,:2] # shape = (16,7,2) for example y_true_test_h_l[:,8:] = np.nan
y_pred_ = y_pred_reshape[:,:,:2]
y_true_ = tf.cast(y_true_, tf.float32)
y_pred_ = tf.cast(y_pred_, tf.float32)
# y_true_reg = y_true[:,:2]
# y_pred_reg = y_pred[:,:2]
loss = K.square(y_true_ - y_pred_)
loss = tf.experimental.numpy.nanmean(loss,axis=1)
# loss = tf.experimental.numpy.nanmean(loss,axis=0)
# tf.print(loss)
if sample_weight is not None:
sample_weight = tf.cast(sample_weight, self.dtype)
values = tf.multiply(values, sample_weight)
self.true_positives.assign_add(tf.reduce_mean(loss))
def result(self):
return self.true_positives
def reset_state(self):
self.true_positives.assign(0)

The problem is in the line of code self.true_positives.assign_add(tf.reduce_mean(loss))
It should be self.true_positives.assign(tf.reduce_mean(loss))

I have a trained tensorflow model, how do i make predictions with the same?

I have trained my models by calling the 'train_neural_network' function which trains the model and I store the model, the accuracy comes to around 83%, the problem I'm facing is how do I make predictions using my saved model? Which variable to restore and how to pass the input(in batches or whole at once)?
def make_model(data,train_x):
n_nodes_hl1 = 2000
n_nodes_hl2 = 2000
n_nodes_hl3 = 2000
n_classes = 2 # No of classification
hidden_1_layer = {'weights': tf.Variable(tf.truncated_normal([len(train_x[0]), n_nodes_hl1], stddev=0.1),name= 'weights'),
'biases': tf.Variable(tf.constant(0.1, shape=[n_nodes_hl1]),name = 'biases')}
hidden_2_layer = {'weights': tf.Variable(tf.truncated_normal([n_nodes_hl1, n_nodes_hl2], stddev=0.1),name= 'weights'),
'biases': tf.Variable(tf.constant(0.1, shape=[n_nodes_hl2]),name = 'biases')}
hidden_3_layer = {'weights': tf.Variable(tf.truncated_normal([n_nodes_hl2, n_nodes_hl3], stddev=0.1,),name= 'weights'),
'biases': tf.Variable(tf.constant(0.1, shape=[n_nodes_hl3]),name = 'biases')}
output_layer = {'weights': tf.Variable(tf.truncated_normal([n_nodes_hl3, n_classes], stddev=0.1),name= 'weights'),
'biases': tf.Variable(tf.constant(0.1, shape=[n_classes]),name = 'biases'), }
layer_1 = tf.add(tf.matmul(data, hidden_1_layer['weights']), hidden_1_layer['biases'])
# now goes through an activation function - sigmoid function
layer_1 = tf.nn.relu(layer_1)
print ("Layer 1 done!!")
# input for layer 2 = result of activ_func for layer 1
layer_2 = tf.add(tf.matmul(layer_1, hidden_2_layer['weights']), hidden_2_layer['biases'])
layer_2 = tf.nn.relu(layer_2)
print ("Layer 2 done!!")
layer_3 = tf.add(tf.matmul(layer_2, hidden_3_layer['weights']), hidden_3_layer['biases'])
layer_3 = tf.nn.relu(layer_3)
print ("Layer 3 done!!")
output = tf.matmul(layer_3, output_layer['weights'],name = "output") + output_layer['biases']
return output
def train_neural_network(train_x,train_y,test_x,test_y):
tf.reset_default_graph()
with tf.name_scope('input'):
x = tf.placeholder('float', [None, len(train_x[0])],name= 'x_input')
y = tf.placeholder('float',name = 'y-input')
# Merge all the summaries and write them out to /tmp/mnist_logs (by default)
prediction = make_model(x,train_x)
print ('model ready!!')
with tf.name_scope('pred'):
pred = tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y)
with tf.name_scope('cost'):
cost = tf.reduce_mean(pred)
with tf.name_scope('train'):
optimizer = tf.train.AdamOptimizer().minimize(cost,name = 'optimizer')
tf.summary.scalar("cost", cost)
n_epochs = 10
batch_size = 100
with tf.Session() as sess:
sess.run(tf.global_variables_initializer()) # initializes our variables. Session has now begun.
merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter('train/2/',
sess.graph)
test_writer = tf.summary.FileWriter('test/')
for epoch in range(n_epochs):
epoch_loss = 0 # we'll calculate the loss as we go
i = 0
while i < len(train_x):
#we want to take batches(chunks); take a slice, then another size)
start = i
end = i+batch_size
batch_x = np.array(train_x[start:end])
batch_y = np.array(train_y[start:end])
_, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
if i%200 == 0:
train_writer.add_summary(_, i)
epoch_loss += c
i+=batch_size
print('Epoch', epoch, 'completed out of', n_epochs, 'loss:', epoch_loss)
with tf.name_scope('accuracy'):
correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
tf.summary.scalar("accuracy", accuracy)
print('Accuracy:', accuracy.eval({x: test_x, y: test_y}))
saver = tf.train.Saver()
tf_log = 'tf.log'
saver.save(sess, "model3.ckpt")
return accuracy
This is how I am making predictions, but this fails everytime:
def test_neural_network(test_x):
batch_size = 100
i = 0
batch_x = np.array(test_x[i:i+batch_size])
tf.reset_default_graph()
x = tf.placeholder('float', [len(batch_x),len(test_x[0])])
y = tf.placeholder('float',[2])
prediction = make_model(x,batch_x)
# pred1 = tf.nn.softmax(logits=prediction)
# weight = tf.get_variable("weights_3", shape=[len(batch_x),2],initializer = tf.zeros_initializer)
saver = tf.train.Saver()
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
saver.restore(sess, "model3.ckpt")
p = tf.argmax(prediction,1)
print (p.eval({x:batch_x))
gives and array of shape(batch_size,2),expected values [0,1] or [1,0] but getting decimal values.

You have a problem because you launch a session on your variable « weight ». But in your case you want to know the output of your network. Try to launch a session on your last layer 😉

How do I make predictions using my saved model? Which variable to
restore and how to pass the input (in batches or whole at once)?
Several comments regarding your design. You don't have to rebuild the graph at test time, because it's saved right next to the session checkpoint. Take a look at this question.
With this, your code will be simplified a lot, because you don't have to keep the placeholders and cross-entropy loss function separately. Add the name to the softmax layer like this:
with tf.name_scope('pred'):
pred = tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y, name='softmax')
After you've restored the graph, you can find the target operation by:
graph = sess.graph
pred = graph.get_operation_by_name("pred/softmax")
If your test data is not big, you can freely feed all of it at once, but if it's significantly larger than your batch size, you can easily get out-of-memory. In this case, you should use mini-batches for testing as well.
As for your test accuracy, there can be plenty reasons for this, for instance, overfitting. Update the question with the full code, so that it could be reproduced.

Batch Training Accuracy is always multiple of 10%

So I am training a CNN and compute the training accuracy for each batch. Most of the it gives out 100% batch training accuracy. which I though was okay because I'm testing my model against the data I trained it with. But at some iterations, I get a 90% or 90% batch training accuracy. And worst, sometimes it goes down to 0% real quick and bounces back to 100% batch training accuracy. And I used the algorithm in https://github.com/Hvass-Labs/TensorFlow-Tutorials/blob/master/04_Save_Restore.ipynb and they also computed the batch training accuracy but they don't get the same results I get. They started out with around 80% batch training accuracy and observed a gradual increase until 98%. Why is this?
I was suspecting that my network is overfitting.
Here is my exact code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
import pyfftw
from scipy import signal
import xlrd
from tensorflow.python.tools import freeze_graph
from tensorflow.python.tools import optimize_for_inference_lib
import time
from datetime import timedelta
import math
import os
from sklearn.metrics import confusion_matrix
##matplotlib inline
plt.style.use('ggplot')
## define funtions
def read_data(file_path):
## column_names = ['user-id','activity','timestamp', 'x-axis', 'y-axis', 'z-axis']
column_names = ['activity','timestamp', 'Ax', 'Ay', 'Az', 'Gx', 'Gy', 'Gz', 'Mx', 'My', 'Mz'] ## 3 sensors
data = pd.read_csv(file_path,header = None, names = column_names)
return data
def feature_normalize(dataset):
mu = np.mean(dataset,axis = 0)
sigma = np.std(dataset,axis = 0)
return (dataset - mu)/sigma
def plot_axis(ax, x, y, title):
ax.plot(x, y)
ax.set_title(title)
ax.xaxis.set_visible(False)
ax.set_ylim([min(y) - np.std(y), max(y) + np.std(y)])
ax.set_xlim([min(x), max(x)])
ax.grid(True)
def plot_activity(activity,data):
fig, (ax0, ax1, ax2) = plt.subplots(nrows = 3, figsize = (15, 10), sharex = True)
plot_axis(ax0, data['timestamp'], data['Ax'], 'x-axis')
plot_axis(ax1, data['timestamp'], data['Ay'], 'y-axis')
plot_axis(ax2, data['timestamp'], data['Az'], 'z-axis')
plt.subplots_adjust(hspace=0.2)
fig.suptitle(activity)
plt.subplots_adjust(top=0.90)
plt.show()
def windows(data, size):
start = 0
while start < data.count():
yield start, start + size
start += (size / 2)
def segment_signal(data, window_size = None, num_channels=None): # edited
segments = np.empty((0,window_size,num_channels)) #change from 3 to 9 channels for AGM fusion #use variable num_channels=9
labels = np.empty((0))
for (n_start, n_end) in windows(data['timestamp'], window_size):
## x = data["x-axis"][start:end]
## y = data["y-axis"][start:end]
## z = data["z-axis"][start:end]
n_start = int(n_start)
n_end = int(n_end)
Ax = data["Ax"][n_start:n_end]
Ay = data["Ay"][n_start:n_end]
Az = data["Az"][n_start:n_end]
Gx = data["Gx"][n_start:n_end]
Gy = data["Gy"][n_start:n_end]
Gz = data["Gz"][n_start:n_end]
Mx = data["Mx"][n_start:n_end]
My = data["My"][n_start:n_end]
Mz = data["Mz"][n_start:n_end]
if(len(dataset['timestamp'][n_start:n_end]) == window_size): # include only windows with size of 90
segments = np.vstack([segments,np.dstack([Ax,Ay,Az,Gx,Gy,Gz,Mx,My,Mz])])
labels = np.append(labels,stats.mode(data["activity"][n_start:n_end])[0][0])
return segments, labels
def weight_variable(shape):
initial = tf.truncated_normal(shape, stddev = 0.1)
return tf.Variable(initial)
def bias_variable(shape):
initial = tf.constant(0.0, shape = shape)
return tf.Variable(initial)
def depthwise_conv2d(x, W):
return tf.nn.depthwise_conv2d(x,W, [1, 1, 1, 1], padding='VALID')
def apply_depthwise_conv(x,weights,biases):
return tf.nn.relu(tf.add(depthwise_conv2d(x, weights),biases))
def apply_max_pool(x,kernel_size,stride_size):
return tf.nn.max_pool(x, ksize=[1, 1, kernel_size, 1],
strides=[1, 1, stride_size, 1], padding='VALID')
#------------------------get dataset----------------------#
## run shoaib_dataset.py to generate dataset_shoaib_total.txt
## get data from dataset_shoaib_total.txt
dataset = read_data('dataset_shoaib_total.txt')
#--------------------preprocessing------------------------#
dataset['Ax'] = feature_normalize(dataset['Ax'])
dataset['Ay'] = feature_normalize(dataset['Ay'])
dataset['Az'] = feature_normalize(dataset['Az'])
dataset['Gx'] = feature_normalize(dataset['Gx'])
dataset['Gy'] = feature_normalize(dataset['Gy'])
dataset['Gz'] = feature_normalize(dataset['Gz'])
dataset['Mx'] = feature_normalize(dataset['Mx'])
dataset['My'] = feature_normalize(dataset['My'])
dataset['Mz'] = feature_normalize(dataset['Mz'])
###--------------------plot activity data----------------#
##for activity in np.unique(dataset["activity"]):
## subset = dataset[dataset["activity"] == activity][:180]
## plot_activity(activity,subset)
#------------------fixed hyperparameters--------------------#
window_size = 200 #from 90 #FIXED at 4 seconds
#----------------input hyperparameters------------------#
input_height = 1
input_width = window_size
num_labels = 6
num_channels = 9 #from 3 channels #9 channels for AGM
#-------------------sliding time window----------------#
segments, labels = segment_signal(dataset, window_size=window_size, num_channels=num_channels)
labels = np.asarray(pd.get_dummies(labels), dtype = np.int8)
reshaped_segments = segments.reshape(len(segments), (window_size*num_channels)) #use variable num_channels instead of constant 3 channels
#------------divide data into test and training set-----------#
train_test_split = np.random.rand(len(reshaped_segments)) < 0.80
train_x_init = reshaped_segments[train_test_split]
train_y_init = labels[train_test_split]
test_x = reshaped_segments[~train_test_split]
test_y = labels[~train_test_split]
train_validation_split = np.random.rand(len(train_x_init)) < 0.80
train_x = train_x_init[train_validation_split]
train_y = train_y_init[train_validation_split]
validation_x = train_x_init[~train_validation_split]
validation_y = train_y_init[~train_validation_split]
#---------------training hyperparameters----------------#
batch_size = 10
kernel_size = 60 #from 60 #optimal 2
depth = 15 #from 60 #optimal 15
num_hidden = 1000 #from 1000 #optimal 80
learning_rate = 0.0001
training_epochs = 8
total_batches = train_x.shape[0] ##// batch_size
#---------define placeholders for input----------#
X = tf.placeholder(tf.float32, shape=[None,input_width * num_channels], name="input")
X_reshaped = tf.reshape(X,[-1,input_height,input_width,num_channels])
Y = tf.placeholder(tf.float32, shape=[None,num_labels])
#---------------------perform convolution-----------------#
# first convolutional layer
c_weights = weight_variable([1, kernel_size, num_channels, depth])
c_biases = bias_variable([depth * num_channels])
c = apply_depthwise_conv(X_reshaped,c_weights,c_biases)
p = apply_max_pool(c,20,2)
# second convolutional layer
c2_weights = weight_variable([1, 6,depth*num_channels,depth//10])
c2_biases = bias_variable([(depth*num_channels)*(depth//10)])
c = apply_depthwise_conv(p,c2_weights,c2_biases)
#--------------flatten data for fully connected layers----------#
shape = c.get_shape().as_list()
c_flat = tf.reshape(c, [-1, shape[1] * shape[2] * shape[3]])
#------------fully connected layers----------------#
f_weights_l1 = weight_variable([shape[1] * shape[2] * depth * num_channels * (depth//10), num_hidden])
f_biases_l1 = bias_variable([num_hidden])
f = tf.nn.tanh(tf.add(tf.matmul(c_flat, f_weights_l1),f_biases_l1))
#----------------------dropout------------------#
keep_prob = tf.placeholder(tf.float32)
drop_layer = tf.nn.dropout(f, keep_prob)
#----------------------softmax layer----------------#
out_weights = weight_variable([num_hidden, num_labels])
out_biases = bias_variable([num_labels])
y_ = tf.nn.softmax(tf.add(tf.matmul(drop_layer, out_weights),out_biases), name="y_")
#-----------------loss optimization-------------#
loss = -tf.reduce_sum(Y * tf.log(y_))
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(loss)
#-----------------compute accuracy---------------#
correct_prediction = tf.equal(tf.argmax(y_,1), tf.argmax(Y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
cost_history = np.empty(shape=[1],dtype=float)
saver = tf.train.Saver()
session = tf.Session()
session.run(tf.global_variables_initializer())
#-------------early stopping-----------------#
# Best validation accuracy seen so far.
best_validation_accuracy = 0.0
# Iteration-number for last improvement to validation accuracy.
last_improvement = 0
# Stop optimization if no improvement found in this many iterations.
require_improvement = 1000
# Counter for total number of iterations performed so far.
total_iterations = 0
def validation_accuracy():
return session.run(accuracy, feed_dict={X: validation_x, Y: validation_y, keep_prob: 1.0})
def next_batch(b, batch_size, train_x, train_y):
##for b in range(total_batches):
offset = (b * batch_size) % (train_y.shape[0] - batch_size)
batch_x = train_x[offset:(offset + batch_size), :]
batch_y = train_y[offset:(offset + batch_size), :]
return batch_x, batch_y
def optimize(num_iterations):
# Ensure we update the global variables rather than local copies.
global total_iterations
global best_validation_accuracy
global last_improvement
# Start-time used for printing time-usage below.
start_time = time.time()
for i in range(num_iterations):
# Increase the total number of iterations performed.
# It is easier to update it in each iteration because
# we need this number several times in the following.
total_iterations += 1
# Get a batch of training examples.
# x_batch now holds a batch of images and
# y_true_batch are the true labels for those images.
##x_batch, y_true_batch = data.train.next_batch(train_batch_size)
x_batch, y_true_batch = next_batch(i, batch_size, train_x, train_y)
# Put the batch into a dict with the proper names
# for placeholder variables in the TensorFlow graph.
feed_dict_train = {X: x_batch,
Y: y_true_batch, keep_prob: 0.5}
# Run the optimizer using this batch of training data.
# TensorFlow assigns the variables in feed_dict_train
# to the placeholder variables and then runs the optimizer.
session.run(optimizer, feed_dict=feed_dict_train)
# Print status every 100 iterations and after last iteration.
if (total_iterations % 100 == 0) or (i == (num_iterations - 1)):
# Calculate the accuracy on the training-batch.
acc_train = session.run(accuracy, feed_dict={X: x_batch,
Y: y_true_batch, keep_prob: 1.0})
# Calculate the accuracy on the validation-set.
# The function returns 2 values but we only need the first.
##acc_validation, _ = validation_accuracy()
acc_validation = validation_accuracy()
# If validation accuracy is an improvement over best-known.
if acc_validation > best_validation_accuracy:
# Update the best-known validation accuracy.
best_validation_accuracy = acc_validation
# Set the iteration for the last improvement to current.
last_improvement = total_iterations
# Save all variables of the TensorFlow graph to file.
saver.save(sess=session, save_path="../shoaib-har_agm_es.ckpt")
# A string to be printed below, shows improvement found.
improved_str = '*'
else:
# An empty string to be printed below.
# Shows that no improvement was found.
improved_str = ''
# Status-message for printing.
msg = "Iter: {0:>6}, Train-Batch Accuracy: {1:>6.1%}, Validation Acc: {2:>6.1%} {3}"
# Print it.
print(msg.format(i + 1, acc_train, acc_validation, improved_str))
# If no improvement found in the required number of iterations.
if total_iterations - last_improvement > require_improvement:
print("No improvement found in a while, stopping optimization.")
# Break out from the for-loop.
break
# Ending time.
end_time = time.time()
# Difference between start and end-times.
time_dif = end_time - start_time
# Print the time-usage.
print("Time usage: " + str(timedelta(seconds=int(round(time_dif)))))
optimize(10000)
With the output:
What exactly is training accuracy? Is it even computed? Or do you compute the training accuracy on the entire training data and not just the batch you trained your network with?
Here I printed the results such that it prints out the batch training accuracy and the training accuracy on the entire dataset set for every multiples of 20 iterations.

The data is divided to 3 sets: train, validation and test.
Batch training accuracy is computed on the train set (the difference between the label and the prediction).
Validation accuracy is the accuracy on the validation set.
The batch accuracy can be computed just after a forward pass in the network. The number of samples in one forward pass is the batch size. It is just a way to train models faster (mini-batch gradient descent)
Overfitting is when the model works really good on known data (training set) but performs poorly on new data.
As to the 10% multiples, it is just the printing format you are using.

Develop Reference

ios ruby-on-rails asp.net-mvc docker delphi jenkins grails google-sheets machine-learning dart

How to solve my problem of max_step parameter in pytorch? - machine-learning

Related

TypeError: auroc() missing 1 required positional argument: 'task'

How to change the directory of mlflow logs?

Why is the mean squared error increasing over epochs?

I have a trained tensorflow model, how do i make predictions with the same?

Batch Training Accuracy is always multiple of 10%

Categories

Resources