I am using MLflow to log metrics, but I want to change the default directory the logs are saved to. Instead of writing the log files next to my main file, I want to store them in /path/outputs/lg . I don't know how to change it. I use MLflow, without any extra configuration, inside the Model class below.
import os
from time import time
import mlflow
import numpy as np
import torch
import tqdm
# from segmentation_models_pytorch.utils import metrics
from AICore.emergency_landing.metrics import IoU, F1
from AICore.emergency_landing.utils import AverageMeter
from AICore.emergency_landing.utils import TBLogger
class Model:
def __init__(self, model, num_classes=5, ignore_index=0, optimizer=None, scheduler=None, criterion=None,
device=None, epochs=30, train_loader=None, val_loader=None, tb_logger: TBLogger = None,
logger=None,
best_model_path=None,
model_check_point_path=None,
load_from_best_model=None,
load_from_model_checkpoint=None,
early_stopping=None,
debug=False):
self.debug = debug
self.early_stopping = {
'init': early_stopping,
'changed': 0
}
self.optimizer = optimizer
self.scheduler = scheduler
self.criterion = criterion
self.device = device
self.epochs = epochs
self.train_loader = train_loader
self.val_loader = val_loader
self.model = model.to(device)
self.tb_logger = tb_logger
self.logger = logger
self.best_loss = np.Inf
if not os.path.exists(best_model_path):
os.makedirs(best_model_path)
self.best_model_path = best_model_path
if not os.path.exists(model_check_point_path):
os.makedirs(model_check_point_path)
self.model_check_point_path = model_check_point_path
self.load_from_best_model = load_from_best_model
self.load_from_model_checkpoint = load_from_model_checkpoint
if self.load_from_best_model is not None:
self.load_model(path=self.load_from_best_model)
if self.load_from_model_checkpoint is not None:
self.load_model_checkpoint(path=self.load_from_model_checkpoint)
self.train_iou = IoU(num_classes=num_classes, ignore_index=ignore_index)
self.val_iou = IoU(num_classes=num_classes, ignore_index=ignore_index)
self.test_iou = IoU(num_classes=num_classes, ignore_index=ignore_index)
self.train_f1 = F1(num_classes=num_classes, ignore_index=ignore_index, mdmc_average='samplewise')
self.val_f1 = F1(num_classes=num_classes, ignore_index=ignore_index, mdmc_average='samplewise')
self.test_f1 = F1(num_classes=num_classes, ignore_index=ignore_index, mdmc_average='samplewise')
def metrics(self, is_train=True):
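# Fresh AverageMeters are created every epoch so running averages never leak across epochs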
if is_train:
train_losses = AverageMeter('Training Loss', ':.4e')
train_iou = AverageMeter('Training iou', ':6.2f')
train_f_score = AverageMeter('Training F_score', ':6.2f')
return train_losses, train_iou, train_f_score
else:
val_losses = AverageMeter('Validation Loss', ':.4e')
val_iou = AverageMeter('Validation mean iou', ':6.2f')
val_f_score = AverageMeter('Validation F_score', ':6.2f')
return val_losses, val_iou, val_f_score
def fit(self):
self.logger.info("\nStart training\n\n")
start_training_time = time()
with mlflow.start_run():
for e in range(self.epochs):
start_training_epoch_time = time()
self.model.train()
train_losses_avg, train_iou_avg, train_f_score_avg = self.metrics(is_train=True)
with tqdm.tqdm(self.train_loader, unit="batch") as tepoch:
tepoch.set_description(f"Epoch {e}")
for image, target in tepoch:
# Transfer Data to GPU if available
image = image.to(self.device)
target = target.to(self.device)
# Clear the gradients
self.optimizer.zero_grad()
# Forward Pass
# out = self.model(image)['out']
# if unet == true => remove ['out']
out = self.model(image)
# Find the Loss
loss = self.criterion(out, target)
# Calculate Loss
train_losses_avg.update(loss.item(), image.size(0))
# Calculate gradients
loss.backward()
# Update Weights
self.optimizer.step()
iou = self.train_iou(out.cpu(), target.cpu()).item()
train_iou_avg.update(iou)
f1_score = self.train_f1(out.cpu(), target.cpu()).item()
train_f_score_avg.update(f1_score)
tepoch.set_postfix(loss=train_losses_avg.avg,
iou=train_iou_avg.avg,
f_score=train_f_score_avg.avg)
if self.debug:
break
self.tb_logger.log(log_type='criterion/training', value=train_losses_avg.avg, epoch=e)
self.tb_logger.log(log_type='iou/training', value=train_iou_avg.avg, epoch=e)
self.tb_logger.log(log_type='f_score/training', value=train_f_score_avg.avg, epoch=e)
mlflow.log_metric('criterion/training', train_losses_avg.avg, step=e)
mlflow.log_metric('iou/training', train_iou_avg.avg, step=e)
mlflow.log_metric('f_score/training', train_f_score_avg.avg, step=e)
end_training_epoch_time = time() - start_training_epoch_time
print('\n')
self.logger.info(
f'Training Results - [{end_training_epoch_time:.3f}s] Epoch: {e}:'
f' f_score: {train_f_score_avg.avg:.3f},'
f' IoU: {train_iou_avg.avg:.3f},'
f' Loss: {train_losses_avg.avg:.3f}')
# validation step
val_loss = self.evaluation(e)
# apply scheduler
if self.scheduler:
self.scheduler.step()
# early stopping
if self.early_stopping['init'] >= self.early_stopping['changed']:
self._early_stopping_model(val_loss=val_loss)
else:
print(f'The model cannot improve any further; early stopping at epoch [{e}]')
break
# save best model
if self.best_model_path is not None:
self._best_model(val_loss=val_loss, path=self.best_model_path)
# model check points
if self.model_check_point_path is not None:
self.save_model_check_points(path=self.model_check_point_path, epoch=e, net=self.model,
optimizer=self.optimizer, loss=self.criterion,
avg_loss=train_losses_avg.avg)
# log mlflow
if self.scheduler:
mlflow.log_param("get_last_lr", self.scheduler.get_last_lr())
mlflow.log_param("scheduler", self.scheduler.state_dict())
self.tb_logger.flush()
if self.debug:
break
end_training_time = time() - start_training_time
print(f'Finished Training after {end_training_time:.3f}s')
self.tb_logger.close()
def evaluation(self, epoch):
print('Validating...')
start_validation_epoch_time = time()
self.model.eval() # put layers such as Dropout/BatchNorm into evaluation mode
with torch.no_grad():
val_losses_avg, val_iou_avg, val_f_score_avg = self.metrics(is_train=False)
with tqdm.tqdm(self.val_loader, unit="batch") as tepoch:
for image, target in tepoch:
# Transfer Data to GPU if available
image = image.to(self.device)
target = target.to(self.device)
# out = self.model(image)['out']
# if unet == true => remove ['out']
out = self.model(image)
# Find the Loss
loss = self.criterion(out, target)
# Calculate Loss
val_losses_avg.update(loss.item(), image.size(0))
iou = self.val_iou(out.cpu(), target.cpu()).item()
val_iou_avg.update(iou)
f1_score = self.val_f1(out.cpu(), target.cpu()).item()
val_f_score_avg.update(f1_score)
tepoch.set_postfix(loss=val_losses_avg.avg,
iou=val_iou_avg.avg,
f_score=val_f_score_avg.avg)
if self.debug:
break
print('\n')
self.tb_logger.log(log_type='criterion/validation', value=val_losses_avg.avg, epoch=epoch)
self.tb_logger.log(log_type='iou/validation', value=val_iou_avg.avg, epoch=epoch)
self.tb_logger.log(log_type='f_score/validation', value=val_f_score_avg.avg, epoch=epoch)
mlflow.log_metric('criterion/validation', val_losses_avg.avg, step=epoch)
mlflow.log_metric('iou/validation', val_iou_avg.avg, step=epoch)
mlflow.log_metric('f_score/validation', val_f_score_avg.avg, step=epoch)
end_validation_epoch_time = time() - start_validation_epoch_time
self.logger.info(
f'Validation Results - [{end_validation_epoch_time:.3f}s] Epoch: {epoch}:'
f' f_score: {val_f_score_avg.avg:.3f},'
f' IoU: {val_iou_avg.avg:.3f},'
f' Loss: {val_losses_avg.avg:.3f}')
print('\n')
return val_losses_avg.avg
def _save_model(self, name, path, params):
torch.save(params, path)
def _early_stopping_model(self, val_loss):
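# Counts consecutive epochs without improvement over best_loss; fit() stops once this exceeds early_stopping['init']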
if self.best_loss < val_loss:
self.early_stopping['changed'] += 1
else:
self.early_stopping['changed'] = 0
def _best_model(self, val_loss, path):
if self.best_loss > val_loss:
self.best_loss = val_loss
name = f'best_model_loss_{self.best_loss:.2f}'.replace('.', '_')
self._save_model(name, path=f'{path}/{name}.pt', params={
'model_state_dict': self.model.state_dict(),
})
print(f'The best model is saved with criterion: {self.best_loss:.2f}')
def save_model_check_points(self, path, epoch, net, optimizer, loss, avg_loss):
name = f'model_epoch_{epoch}_loss_{avg_loss:.2f}'.replace('.', '_')
self._save_model(name, path=f'{path}/{name}.pt', params={
'epoch': epoch,
'model_state_dict': net.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'criterion': loss,
})
print(f'model checkpoint is saved at model_epoch_{epoch}_loss_{avg_loss:.2f}')
def load_model_checkpoint(self, path):
checkpoint = torch.load(path)
self.model.load_state_dict(checkpoint['model_state_dict'])
self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
self.criterion = checkpoint['criterion']
return epoch
def load_model(self, path):
best_model = torch.load(path)
self.model.load_state_dict(best_model['model_state_dict'])
The solution is:
mlflow.set_tracking_uri(uri=f'file://{hydra.utils.to_absolute_path("../output/mlruns")}')
exp = mlflow.get_experiment_by_name(name='Emergency_landing')
if not exp:
experiment_id = mlflow.create_experiment(name='Emergency_landing',
artifact_location=f'file://{hydra.utils.to_absolute_path("../output/mlruns")}')
else:
experiment_id = exp.experiment_id
Then you should pass the experiment ID to:
with mlflow.start_run(experiment_id=experiment_id):
pass
If you don't point MLflow at your /path/mlruns explicitly, then running the mlflow ui command will automatically create another folder named mlruns. So pay attention to this point and keep the directory name mlruns so the two match.
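To sanity-check the location, you can also launch the UI against the same store explicitly (the path below is a stand-in for whatever hydra.utils.to_absolute_path resolved on your machine):
mlflow ui --backend-store-uri file:///absolute/path/to/output/mlruns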
I'm trying to train a model; the source code is below.
class mymodel(pl.LightningModule):
def __init__(self, config , learning_rate = 1e-4, max_steps = 100000//2):
super(mymodel, self).__init__()
self.config = config
self.save_hyperparameters()
self.training_losses = []
self.validation_losses = []
self.max_steps = max_steps
def configure_optimizers(self):
return torch.optim.AdamW(self.parameters(), lr = self.hparams['learning_rate'])
def forward(self, batch_dict):
return answer_vector
def calculate_metrics(self, prediction, labels):
batch_size = len(prediction)
ac_score = 0
for (pred, gt) in zip(prediction, labels):
ac_score+= calculate_acc_score(pred.detach().cpu(), gt.detach().cpu())
ac_score = ac_score/batch_size
return ac_score
def training_step(self, batch, batch_idx):
answer_vector = self.forward(batch)
loss = nn.CrossEntropyLoss()(answer_vector.reshape(-1,self.config['classes']), batch['answer'].reshape(-1))
_, preds = torch.max(answer_vector, dim = -1)
train_acc = self.calculate_metrics(preds, batch['answer'])
train_acc = torch.tensor(train_acc)
return loss
def validation_step(self, batch, batch_idx):
logits = self.forward(batch)
loss = nn.CrossEntropyLoss()(logits.reshape(-1,self.config['classes']), batch['answer'].reshape(-1))
_, preds = torch.max(logits, dim = -1)
## Validation Accuracy
val_acc = self.calculate_metrics(preds.cpu(), batch['answer'].cpu())
val_acc = torch.tensor(val_acc)
## Logging
self.log('val_ce_loss', loss, prog_bar = True)
self.log('val_acc', val_acc, prog_bar = True)
return {'val_loss': loss, 'val_acc': val_acc}
def optimizer_step(self, epoch_nb, batch_nb, optimizer, optimizer_i, opt_closure = None, on_tpu=False,
using_native_amp=False, using_lbfgs=False):
## Warmup for 1000 steps
if self.trainer.global_step < 1000:
lr_scale = min(1., float(self.trainer.global_step + 1) / 1000.)
for pg in optimizer.param_groups:
pg['lr'] = lr_scale * self.hparams.learning_rate
## Linear Decay
else:
for pg in optimizer.param_groups:
pg['lr'] = polynomial(self.hparams.learning_rate, self.trainer.global_step, max_iter = self.max_steps)
optimizer.step(opt_closure)
optimizer.zero_grad()
Around the 5th epoch (maybe fewer or more), I encountered an error that stopped training, so I increased max_steps. But when I increase max_steps (max_steps == 100K), I get a problem where the loss and accuracy break:
loss > 100 && acc == 0. I attach a screenshot of this problem.
What changes should I make to the source code to continue training the model without this problem?
Update:
I see. It looks like your optimizer_step is really acting as a scheduler: it manually rewrites the AdamW learning rate. You should instead attach the scheduler in the configure_optimizers function. See https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html?highlight=configure_optimizers#configure-optimizers
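For example, here is a minimal sketch of that change (assuming the same 1000-step warmup, with a plain linear decay standing in for the polynomial() helper that isn't shown in the post):
def configure_optimizers(self):
    optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams['learning_rate'])
    def lr_lambda(step):
        # warm up over the first 1000 steps, then decay linearly to zero at max_steps
        if step < 1000:
            return float(step + 1) / 1000.0
        return max(0.0, float(self.max_steps - step) / float(self.max_steps - 1000))
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    # interval='step' makes Lightning call scheduler.step() after every batch, so optimizer_step can be dropped
    return {'optimizer': optimizer,
            'lr_scheduler': {'scheduler': scheduler, 'interval': 'step'}}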
===
old answer:
By error, do you mean the line about val_ce_loss? If so, that's not an error. It means the val_ce_loss of the current epoch is not among the top 1 of all historical epochs, so the checkpoint won't be saved to disk. Please refer to the save_top_k argument of the checkpoint callback: https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.ModelCheckpoint.html
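For reference, a minimal sketch of the callback that produces this behaviour (the monitor name matches the val_ce_loss logged above; save_top_k=1 is an assumption, not necessarily the poster's setting):
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

# keep only the single best epoch by val_ce_loss; worse epochs are simply not written to disk
checkpoint_cb = ModelCheckpoint(monitor='val_ce_loss', mode='min', save_top_k=1)
trainer = pl.Trainer(callbacks=[checkpoint_cb], max_steps=100000)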
I was trying out the wandb library and I ran wandb.watch, but it doesn't seem to work on my code. It's not supposed to be anything too complicated, so I am puzzled why it's not working.
Code:
"""
https://docs.wandb.ai/guides/track/advanced/distributed-training
import wandb
# 1. Start a new run
wandb.init(project='playground', entity='brando')
# 2. Save model inputs and hyperparameters
config = wandb.config
config.learning_rate = 0.01
# 3. Log gradients and model parameters
wandb.watch(model)
for batch_idx, (data, target) in enumerate(train_loader):
...
if batch_idx % args.log_interval == 0:
# 4. Log metrics to visualize performance
wandb.log({"loss": loss})
Notes:
- call wandb.init and wandb.log only from the leader process
"""
from argparse import Namespace
from pathlib import Path
from typing import Union
import torch
from torch import nn
from torch.nn.functional import mse_loss
from torch.optim import Optimizer
import uutils
from uutils.torch_uu import r2_score_from_torch
from uutils.torch_uu.distributed import is_lead_worker
from uutils.torch_uu.models import get_simple_model
from uutils.torch_uu.tensorboard import log_2_tb_supervisedlearning
import wandb
def log_2_wandb_nice(it, loss, inputs, outputs, captions):
wandb.log({"loss": loss, "epoch": it,
"inputs": wandb.Image(inputs),
"logits": wandb.Histogram(outputs),
"captions": wandb.HTML(captions)})
def log_2_wandb(**metrics):
""" Log to wandb """
new_metrics: dict = {}
for key, value in metrics.items():
key = str(key).strip('_')
new_metrics[key] = value
wandb.log(new_metrics)
def log_train_val_stats(args: Namespace,
it: int,
train_loss: float,
train_acc: float,
valid,
log_freq: int = 10,
ckpt_freq: int = 50,
force_log: bool = False, # e.g. at the final it/epoch
save_val_ckpt: bool = False,
log_to_tb: bool = False,
log_to_wandb: bool = False
):
"""
log train and val stats.
Note: unlike save_ckpt, this one does need the iteration counter it passed explicitly (so it can be saved in the stats collector).
"""
from uutils.torch_uu.tensorboard import log_2_tb
from matplotlib import pyplot as plt
# - is it epoch or iteration
it_or_epoch: str = 'epoch_num' if args.training_mode == 'epochs' else 'it'
# total number of iterations/epochs, depending on the training mode
total_its: int = args.num_empochs if args.training_mode == 'epochs' else args.num_its
print(f'-- {it == total_its - 1}')
print(f'-- {it}')
print(f'-- {total_its}')
if (it % log_freq == 0 or it == total_its - 1 or force_log) and is_lead_worker(args.rank):
print('inside log')
# - get eval stats
val_loss, val_acc = valid(args, args.mdl, save_val_ckpt=save_val_ckpt)
# - print
args.logger.log('\n')
args.logger.log(f"{it_or_epoch}={it}: {train_loss=}, {train_acc=}")
args.logger.log(f"{it_or_epoch}={it}: {val_loss=}, {val_acc=}")
# - record into stats collector
args.logger.record_train_stats_stats_collector(it, train_loss, train_acc)
args.logger.record_val_stats_stats_collector(it, val_loss, val_acc)
args.logger.save_experiment_stats_to_json_file()
fig = args.logger.save_current_plots_and_stats()
# - log to wandb
if log_to_wandb:
# if it == 0:
# # -- todo why isn't this working?
# wandb.watch(args.mdl)
# print('watching model')
# log_2_wandb(train_loss=train_loss, train_acc=train_acc)
print('inside wandb log')
wandb.log(data={'train loss': train_loss, 'train acc': train_acc, 'val loss': val_loss, 'val acc': val_acc}, step=it)
wandb.log(data={'it': it}, step=it)
if it == total_its - 1:
print(f'logging fig at {it=}')
wandb.log(data={'fig': fig}, step=it)
plt.close('all')
# - log to tensorboard
if log_to_tb:
log_2_tb_supervisedlearning(args.tb, args, it, train_loss, train_acc, 'train')
log_2_tb_supervisedlearning(args.tb, args, it, train_loss, train_acc, 'val')
# log_2_tb(args, it, val_loss, val_acc, 'train')
# log_2_tb(args, it, val_loss, val_acc, 'val')
# - log ckpt
if (it % ckpt_freq == 0 or it == total_its - 1 or force_log) and is_lead_worker(args.rank):
save_ckpt(args, args.mdl, args.optimizer)
def save_ckpt(args: Namespace, mdl: nn.Module, optimizer: torch.optim.Optimizer,
dirname: Union[None, Path] = None, ckpt_name: str = 'ckpt.pt'):
"""
Saves checkpoint for any worker.
Intended use is to save by worker that got a val loss that improved.
"""
import dill
dirname = args.log_root if (dirname is None) else dirname
# - pickle ckpt
assert uutils.xor(args.training_mode == 'epochs', args.training_mode == 'iterations')
pickable_args = uutils.make_args_pickable(args)
torch.save({'state_dict': mdl.state_dict(),
'epoch_num': args.epoch_num,
'it': args.it,
'optimizer': optimizer.state_dict(),
'args': pickable_args,
'mdl': mdl},
pickle_module=dill,
f=dirname / ckpt_name) # f'mdl_{epoch_num:03}.pt'
def get_args() -> Namespace:
args = uutils.parse_args_synth_agent()
# we can place model here...
args = uutils.setup_args_for_experiment(args)
return args
def valid_for_test(args: Namespace, mdl: nn.Module, save_val_ckpt: bool = False):
import torch
for t in range(1):
x = torch.randn(args.batch_size, 5)
y = (x ** 2 + x + 1).sum(dim=1)
y_pred = mdl(x).squeeze(dim=1)
val_loss, val_acc = mse_loss(y_pred, y), r2_score_from_torch(y_true=y, y_pred=y_pred)
if val_loss.item() < args.best_val_loss and save_val_ckpt:
args.best_val_loss = val_loss.item()
save_ckpt(args, args.mdl, args.optimizer, ckpt_name='ckpt_best_val.pt')
return val_loss, val_acc
def train_for_test(args: Namespace, mdl: nn.Module, optimizer: Optimizer, scheduler=None):
# wandb.watch(args.mdl)
for it in range(args.num_its):
x = torch.randn(args.batch_size, 5)
y = (x ** 2 + x + 1).sum(dim=1)
y_pred = mdl(x).squeeze(dim=1)
train_loss, train_acc = mse_loss(y_pred, y), r2_score_from_torch(y_true=y, y_pred=y_pred)
optimizer.zero_grad()
train_loss.backward() # each process synchronizes it's gradients in the backward pass
optimizer.step() # the right update is done since all procs have the right synced grads
scheduler.step()
log_train_val_stats(args, it, train_loss, train_acc, valid_for_test,
log_freq=2, ckpt_freq=10,
save_val_ckpt=True, log_to_tb=True, log_to_wandb=True)
return train_loss, train_acc
def debug_test():
args: Namespace = get_args()
args.num_its = 12
# - get mdl, opt, scheduler, etc
args.mdl = get_simple_model(in_features=5, hidden_features=20, out_features=1, num_layer=2)
wandb.watch(args.mdl)
args.optimizer = torch.optim.Adam(args.mdl.parameters(), lr=1e-1)
args.scheduler = torch.optim.lr_scheduler.ExponentialLR(args.optimizer, gamma=0.999, verbose=False)
# - train
train_loss, train_acc = train_for_test(args, args.mdl, args.optimizer, args.scheduler)
print(f'{train_loss=}, {train_acc=}')
# - eval
val_loss, val_acc = valid_for_test(args, args.mdl)
print(f'{val_loss=}, {val_acc=}')
# - make sure wandb closes properly
if args.log_to_wandb:
wandb.finish()
if __name__ == '__main__':
import os
# print(os.environ['WANDB_API_KEY'])
import time
start = time.time()
debug_test()
duration_secs = time.time() - start
print(f"\nSuccess, time passed: hours:{duration_secs / (60 ** 2)}, minutes={duration_secs / 60}, seconds={duration_secs}")
print('Done!\a')
code in github: https://github.com/brando90/ultimate-utils/blob/master/tutorials_for_myself/my_wandb/my_wandb_basic1.py
sample run: https://wandb.ai/brando/playground/runs/wpupxvg1
cross posted: https://community.wandb.ai/t/when-is-one-supposed-to-run-wandb-watch-so-that-weights-and-biases-tracks-params-and-gradients-prope/518
Cross posting an answer by charlesfrye in the wandb community forum:
There are two things you might be running into here -- can't confirm because your code relies on the ultimate-utils package.
wandb.watch will only start working once you call wandb.log after a backwards pass that touches the watched Module (docs).
The frequency with which gradients/params are logged is controlled by the log_freq argument. If the number of logging calls is less than the value of log_freq, then no information will be logged. Here's a short colab reproducing this behavior.
Also, if you want params and gradients, you need to set the log kwarg to "all". By default, we log only gradients.
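Putting those two points together, a minimal sketch of an ordering that makes watch fire (model, criterion, optimizer, and train_loader are placeholders):
wandb.init(project='playground', entity='brando')
wandb.watch(model, criterion, log="all", log_freq=10)  # params *and* grads, every 10 logging calls
for batch_idx, (data, target) in enumerate(train_loader):
    loss = criterion(model(data), target)
    optimizer.zero_grad()
    loss.backward()  # the watch hooks record gradients during the backward pass...
    optimizer.step()
    wandb.log({"loss": loss.item()})  # ...but nothing is reported until wandb.log is called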
I don't know why, but this line of code seems to work:
wandb.watch(args.mdl, mse_loss, log="all", log_freq=10)
Perhaps it really does need the loss and log="all", despite neither being in the intro/quick start guide:
import wandb
# 1. Start a new run
wandb.init(project='playground', entity='brando')
# 2. Save model inputs and hyperparameters
config = wandb.config
config.learning_rate = 0.01
# 3. Log gradients and model parameters
wandb.watch(model)
for batch_idx, (data, target) in enumerate(train_loader):
...
if batch_idx % args.log_interval == 0:
# 4. Log metrics to visualize performance
wandb.log({"loss": loss})
I want to implement a (meta) trainable step size. I tried it with this post:
https://discuss.pytorch.org/t/how-does-one-have-the-parameters-of-a-model-not-be-leafs/70076/17
and with the higher library (https://github.com/facebookresearch/higher) with no luck...
I tried:
eta = torch.tensor([0.5], requires_grad=True).view(1)
inner_opt = torch.optim.Adam(child_model.parameters(), lr=eta)
#meta_params = itertools.chain(child_model.parameters(),eta.parameters())
meta_params = itertools.chain(child_model.parameters())
meta_opt = torch.optim.Adam(meta_params, lr=1e-3)
# do meta-training/outer training minimize outerloop: min_{theta} sum_t L^val( theta^{T} - eta* Grad L^train(theta^{T}) )
nb_outer_steps = 10 # note, in this case it's the same as the number of meta-train steps (but it might differ depending on how you loop through the val set)
for outer_i, (outer_inputs, outer_targets) in enumerate(testloader, 0):
meta_opt.zero_grad()
if outer_i >= nb_outer_steps:
break
# do inner-training/MAML; minimize innerloop: theta^{T} - eta* Grad L^train(theta^{T}) ~ argmin L^train(theta)
nb_inner_steps = 3
with higher.innerloop_ctx(child_model, inner_opt) as (fmodel, diffopt):
which fails with this error:
Exception has occurred: RuntimeError
Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment
which wouldn't work anyway, because eta might suddenly become negative, so I really want to cap it with a sigmoid function; but I had to try something...
higher thinks my step-size NN is not in the graph, but it is, thanks to this line of code:
p_new = p + lr*g
group['params'][p_idx] = p_new
but somehow that is not enough for gradients to flow...
Full self-contained script:
import torch
import torch.nn as nn
from torch.optim.optimizer import Optimizer
import higher
from higher.optim import DifferentiableOptimizer
from higher.optim import DifferentiableSGD
import torchvision
import torchvision.transforms as transforms
from torchviz import make_dot
import copy
import itertools
from collections import OrderedDict
#mini class to add a flatten layer to the ordered dictionary
class Flatten(nn.Module):
def forward(self, input):
'''
Note that input.size(0) is usually the batch size.
So what it does is that given any input with input.size(0) # of batches,
will flatten to be 1 * nb_elements.
'''
batch_size = input.size(0)
out = input.view(batch_size,-1)
return out # (batch_size, *size)
def get_cifar10():
transform = transforms.Compose(
[transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
shuffle=True, num_workers=2)
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4,
shuffle=False, num_workers=2)
return trainloader, testloader
class MySGD(Optimizer):
def __init__(self, params, eta, prev_lr):
defaults = {'eta':eta, 'prev_lr':prev_lr}
super().__init__(params, defaults)
class TrainableSGD(DifferentiableOptimizer):
def _update(self, grouped_grads, **kwargs):
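# eta is a small network (Linear -> Sigmoid), so the lr computed below is a differentiable function of prev_lr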
prev_lr = self.param_groups[0]['prev_lr']
eta = self.param_groups[0]['eta']
# start differentiable & trainable update
zipped = zip(self.param_groups, grouped_grads)
lr = 0.1*eta(prev_lr).view(1)
for group_idx, (group, grads) in enumerate(zipped):
for p_idx, (p, g) in enumerate(zip(group['params'], grads)):
if g is None:
continue
#group['params'][p_idx] = _add(p, -group['lr'], g)
p_new = p + lr*g
group['params'][p_idx] = p_new
# fake returns
self.param_groups[0]['prev_lr'] = lr
higher.register_optim(MySGD, TrainableSGD)
def main():
# get dataloaders
trainloader, testloader = get_cifar10()
criterion = nn.CrossEntropyLoss()
child_model = nn.Sequential(OrderedDict([
('conv1', nn.Conv2d(in_channels=3,out_channels=2,kernel_size=5)),
('relu1', nn.ReLU()),
('Flatten', Flatten()),
('fc', nn.Linear(in_features=28*28*2,out_features=10) )
]))
hidden = torch.randn(size=(1,1),requires_grad=True)
print(f'-> hidden = {hidden}')
eta = nn.Sequential(OrderedDict([
('fc', nn.Linear(1,1)),
('sigmoid', nn.Sigmoid())
]))
inner_opt = MySGD(child_model.parameters(), eta=eta, prev_lr=hidden)
meta_params = itertools.chain(child_model.parameters(),eta.parameters())
#meta_params = itertools.chain(eta.parameters(),[hidden])
meta_opt = torch.optim.Adam(meta_params, lr=1e-3)
# do meta-training/outer training minimize outerloop: min_{theta} sum_t L^val( theta^{T} - eta* Grad L^train(theta^{T}) )
print()
nb_outer_steps = 1 # note, in this case it's the same as the number of meta-train steps (but it might differ depending on how you loop through the val set)
for outer_i, (outer_inputs, outer_targets) in enumerate(testloader, 0):
meta_opt.zero_grad()
if outer_i >= nb_outer_steps:
break
# do inner-training/MAML; minimize innerloop: theta^{T} - eta * Grad L^train(theta^{T}) ~ argmin L^train(theta)
nb_inner_steps = 3
#with higher.innerloop_ctx(child_model, inner_opt, copy_initial_weights=False) as (fmodel, diffopt):
with higher.innerloop_ctx(child_model, inner_opt) as (fmodel, diffopt):
for inner_i, (inner_inputs, inner_targets) in enumerate(trainloader, 0):
if inner_i >= nb_inner_steps:
break
logits = fmodel(inner_inputs)
inner_loss = criterion(logits, inner_targets)
print(f'--> inner_i = {inner_i}')
print(f'inner_loss^<{inner_i}>: {inner_loss}')
print(f'lr^<{inner_i-1}> = {diffopt.param_groups[0]["prev_lr"]}')
diffopt.step(inner_loss) # changes params P[t+1] using P[t] and loss[t] in a differentiable manner
print(f'lr^<{inner_i}> = {diffopt.param_groups[0]["prev_lr"]}')
print()
# compute the meta-loss L^val( theta^{T} - eta* Grad L^train(theta^{T}) )
outer_outputs = fmodel(outer_inputs)
meta_loss = criterion(outer_outputs, outer_targets) # L^val
make_dot(meta_loss).render('meta_loss',format='png')
meta_loss.backward()
#grad_of_grads = torch.autograd.grad(outputs=meta_loss, inputs=eta.parameters()) # dmeta_loss/dw0
print(f'----> outer_i = {outer_i}')
print(f'-> outer_loss/meta_loss^<{outer_i}>: {meta_loss}')
print(f'child_model.fc.weight.grad = {child_model.fc.weight.grad}')
print(f'hidden.grad = {hidden.grad}')
print(f'eta.fc.weight.grad = {eta.fc.weight.grad}')
meta_opt.step() # meta-optimizer step: more or less theta^<t> := theta^<t> - meta_eta * Grad L^val( theta^{T} - eta* Grad L^train(theta^{T}) )
if __name__ == "__main__":
main()
print('---> Done\a')
notice the Nones:
Files already downloaded and verified
Files already downloaded and verified
-> hidden = tensor([[0.8459]], requires_grad=True)
--> inner_i = 0
inner_loss^<0>: 2.2696359157562256
lr^<-1> = tensor([[0.8459]], requires_grad=True)
lr^<0> = tensor([0.0567], grad_fn=<...>)
--> inner_i = 1
inner_loss^<1>: 2.0114920139312744
lr^<0> = tensor([0.0567], grad_fn=<...>)
lr^<1> = tensor([0.0720], grad_fn=<...>)
--> inner_i = 2
inner_loss^<2>: 2.3866422176361084
lr^<1> = tensor([0.0720], grad_fn=<...>)
lr^<2> = tensor([0.0717], grad_fn=<...>)
----> outer_i = 0
-> outer_loss/meta_loss^<0>: 4.021303176879883
child_model.fc.weight.grad = None
hidden.grad = None
eta.fc.weight.grad = None
---> Done
related:
pytorch forum: https://discuss.pytorch.org/t/implement-a-meta-trainable-step-size/70396
gitissue: https://github.com/facebookresearch/higher/issues/32
related SO Q: How does one have parameters in a pytorch model not be leafs and be in the computation graph?
I am currently learning deep learning, especially GANs.
I found simple GAN code on the website below.
https://medium.com/@devnag/generative-adversarial-networks-gans-in-50-lines-of-code-pytorch-e81b79659e3f
However, in the code, I don't understand why we always need to give true labels to the Generator, as below.
for g_index in range(g_steps):
# 2. Train G on D's response (but DO NOT train D on these labels)
G.zero_grad()
gen_input = Variable(gi_sampler(minibatch_size, g_input_size))
g_fake_data = G(gen_input)
dg_fake_decision = D(preprocess(g_fake_data.t()))
g_error = criterion(dg_fake_decision, Variable(torch.ones(1))) # we want to fool, so pretend it's all genuine
g_error.backward()
g_optimizer.step() # Only optimizes G's parameters
Specifically, on this line.
g_error = criterion(dg_fake_decision, Variable(torch.ones(1))) # we want to fool, so pretend it's all genuine
The input data for the Generator is fake data (it includes noise), so if we assign true labels to that input, I think the Generator ends up creating data similar to the fake data (which doesn't look genuine). Is my understanding wrong? Sorry for the silly question, but if you have knowledge, please help me out.
I'll put the whole code below.
#!/usr/bin/env python
# Generative Adversarial Networks (GAN) example in PyTorch.
# See related blog post at https://medium.com/@devnag/generative-adversarial-networks-gans-in-50-lines-of-code-pytorch-e81b79659e3f#.sch4xgsa9
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
# Data params
data_mean = 4
data_stddev = 1.25
# Model params
g_input_size = 1 # Random noise dimension coming into generator, per output vector
g_hidden_size = 50 # Generator complexity
g_output_size = 1 # size of generated output vector
d_input_size = 100 # Minibatch size - cardinality of distributions
d_hidden_size = 50 # Discriminator complexity
d_output_size = 1 # Single dimension for 'real' vs. 'fake'
minibatch_size = d_input_size
d_learning_rate = 2e-4 # 2e-4
g_learning_rate = 2e-4
optim_betas = (0.9, 0.999)
num_epochs = 30000
print_interval = 200
d_steps = 1 # 'k' steps in the original GAN paper. Can put the discriminator on higher training freq than generator
g_steps = 1
# ### Uncomment only one of these
#(name, preprocess, d_input_func) = ("Raw data", lambda data: data, lambda x: x)
(name, preprocess, d_input_func) = ("Data and variances", lambda data: decorate_with_diffs(data, 2.0), lambda x: x * 2)
print("Using data [%s]" % (name))
# ##### DATA: Target data and generator input data
def get_distribution_sampler(mu, sigma):
return lambda n: torch.Tensor(np.random.normal(mu, sigma, (1, n))) # Gaussian
def get_generator_input_sampler():
return lambda m, n: torch.rand(m, n) # Uniform-dist data into generator, _NOT_ Gaussian
# ##### MODELS: Generator model and discriminator model
class Generator(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(Generator, self).__init__()
self.map1 = nn.Linear(input_size, hidden_size)
self.map2 = nn.Linear(hidden_size, hidden_size)
self.map3 = nn.Linear(hidden_size, output_size)
def forward(self, x):
x = F.elu(self.map1(x))
x = F.sigmoid(self.map2(x))
return self.map3(x)
class Discriminator(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(Discriminator, self).__init__()
self.map1 = nn.Linear(input_size, hidden_size)
self.map2 = nn.Linear(hidden_size, hidden_size)
self.map3 = nn.Linear(hidden_size, output_size)
def forward(self, x):
x = F.elu(self.map1(x))
x = F.elu(self.map2(x))
return F.sigmoid(self.map3(x))
def extract(v):
return v.data.storage().tolist()
def stats(d):
return [np.mean(d), np.std(d)]
def decorate_with_diffs(data, exponent):
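# Concatenate each sample with its (x - mean)^exponent deviations so D sees the spread of the batch, not just raw values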
mean = torch.mean(data.data, 1, keepdim=True)
mean_broadcast = torch.mul(torch.ones(data.size()), mean.tolist()[0][0])
diffs = torch.pow(data - Variable(mean_broadcast), exponent)
return torch.cat([data, diffs], 1)
d_sampler = get_distribution_sampler(data_mean, data_stddev)
gi_sampler = get_generator_input_sampler()
G = Generator(input_size=g_input_size, hidden_size=g_hidden_size, output_size=g_output_size)
D = Discriminator(input_size=d_input_func(d_input_size), hidden_size=d_hidden_size, output_size=d_output_size)
criterion = nn.BCELoss() # Binary cross entropy: http://pytorch.org/docs/nn.html#bceloss
d_optimizer = optim.Adam(D.parameters(), lr=d_learning_rate, betas=optim_betas)
g_optimizer = optim.Adam(G.parameters(), lr=g_learning_rate, betas=optim_betas)
for epoch in range(num_epochs):
for d_index in range(d_steps):
# 1. Train D on real+fake
D.zero_grad()
# 1A: Train D on real
d_real_data = Variable(d_sampler(d_input_size))
d_real_decision = D(preprocess(d_real_data))
d_real_error = criterion(d_real_decision, Variable(torch.ones(1))) # ones = true
d_real_error.backward() # compute/store gradients, but don't change params
# 1B: Train D on fake
d_gen_input = Variable(gi_sampler(minibatch_size, g_input_size))
d_fake_data = G(d_gen_input).detach() # detach to avoid training G on these labels
d_fake_decision = D(preprocess(d_fake_data.t()))
d_fake_error = criterion(d_fake_decision, Variable(torch.zeros(1))) # zeros = fake
d_fake_error.backward()
d_optimizer.step() # Only optimizes D's parameters; changes based on stored gradients from backward()
for g_index in range(g_steps):
# 2. Train G on D's response (but DO NOT train D on these labels)
G.zero_grad()
gen_input = Variable(gi_sampler(minibatch_size, g_input_size))
g_fake_data = G(gen_input)
dg_fake_decision = D(preprocess(g_fake_data.t()))
g_error = criterion(dg_fake_decision, Variable(torch.ones(1))) # we want to fool, so pretend it's all genuine
g_error.backward()
g_optimizer.step() # Only optimizes G's parameters
if epoch % print_interval == 0:
print("%s: D: %s/%s G: %s (Real: %s, Fake: %s) " % (epoch,
extract(d_real_error)[0],
extract(d_fake_error)[0],
extract(g_error)[0],
stats(extract(d_real_data)),
stats(extract(d_fake_data))))
In this part of the code you are training G to fool D: G generates fake data and asks D whether it thinks the data is real (true labels). D's gradients are then propagated all the way back to G (this is possible because D's input was G's output), so G learns to fool D better in the next iteration.
The inputs of G are not trainable; G only tries to transform them into real data (data similar to what d_sampler generates).
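Concretely, BCELoss with a target of 1 reduces to g_error = -log(D(G(z))), and minimizing it pushes D(G(z)) toward 1: G is rewarded exactly when D mistakes its samples for genuine ones. The true labels say nothing about G's input; they only encode the outcome G wants from D. (This is the standard non-saturating generator loss from the original GAN paper.)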
Is there an example of sparse autoencoders in TensorFlow? I was able to run and understand the normal one from here: https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/autoencoder.py
For a sparse autoencoder, do I just need to modify the cost function?
from __future__ import division, print_function, absolute_import
import scipy.fftpack
import pdb, random
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from read_audio import read_audio
start,end= 3050,5723
#start=end=None
audio_data=read_audio("LDC93S1",start,end)
def overlapping_chunks(l, sub_array_size, overlap_size):
return [l[i:i+sub_array_size] for i in range(0, len(l)-overlap_size, overlap_size)]
def conv_frq_domain(signal):
fft_abs=abs(scipy.fftpack.fft(signal))
fft_sorted=np.sort(fft_abs)[::-1]
top_100=fft_sorted[:100]
return top_100
sample_len=100
samples=overlapping_chunks(audio_data,sample_len,50)
freq_samples=[]
for sample in samples:
freq_samples.append(conv_frq_domain(sample))
examples=samples
print("Number of samples", str(len(examples)))
#pdb.set_trace()
# Parameters
learning_rate = 0.001
training_epochs = 2000
batch_size = 2
display_step = 100
# Network Parameters
n_hidden_1 = 1000 # 1st layer num features
n_hidden_2 = 650 # 2nd layer num features
n_input = sample_len
# tf Graph input (only pictures)
X = tf.placeholder("float", [None, n_input])
weights = {
'encoder_h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
'encoder_h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
'decoder_h1': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_1])),
'decoder_h2': tf.Variable(tf.random_normal([n_hidden_1, n_input])),
}
biases = {
'encoder_b1': tf.Variable(tf.random_normal([n_hidden_1])),
'encoder_b2': tf.Variable(tf.random_normal([n_hidden_2])),
'decoder_b1': tf.Variable(tf.random_normal([n_hidden_1])),
'decoder_b2': tf.Variable(tf.random_normal([n_input])),
}
# Building the encoder
def encoder(x):
# Encoder Hidden layer with sigmoid activation #1
layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']),
biases['encoder_b1']))
# Encoder Hidden layer with sigmoid activation #2
layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']),
biases['encoder_b2']))
return layer_1,layer_2
# Building the decoder
def decoder(x):
# Decoder Hidden layer with sigmoid activation #1
layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']),
biases['decoder_b1']))
# Decoder output layer #2 (linear, no activation)
layer_2 = tf.add(tf.matmul(layer_1, weights['decoder_h2']),
biases['decoder_b2'])
return layer_2
def kl_divergence(p_1, p_hat):
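# Elementwise KL between Bernoulli(p_1) and Bernoulli(p_hat): p*log(p/p_hat) + (1-p)*log((1-p)/(1-p_hat))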
num_len=p_1.get_shape()[0]
term1 = p_1 * tf.log(p_1)
term2 = p_1 * tf.log(p_hat)
term3 = tf.sub(tf.ones(num_len),p_1) * tf.log(tf.sub(tf.ones(num_len),p_1))
term4 = tf.sub(tf.ones(num_len),p_1) * tf.log(tf.sub(tf.ones(num_len) ,p_hat))
return tf.sub(tf.add(term1,term3),tf.add(term2,term4))
def sparsity_penalty(hidden_layer_acts, sparsity_level=0.05, sparse_reg=1e-3, batch_size=-1):
# = T.extra_ops.repeat(sparsity_level, self.nhid)
sparsity_level_vec=tf.ones(hidden_layer_acts.get_shape()[1])*sparsity_level
sparsity_penalty = 0
avg_act = tf.reduce_mean(hidden_layer_acts, 0) # mean activation per hidden unit across the batch (axis 0, so shapes match sparsity_level_vec)
kl_div = kl_divergence(sparsity_level_vec, avg_act)
sparsity_penalty = sparse_reg * tf.reduce_sum(kl_div,0)
return sparsity_penalty
# Construct model
encoder_op1, encoder_op2 = encoder(X)
decoder_op = decoder(encoder_op2)
# Prediction
y_pred = decoder_op
# Targets (Labels) are the input data.
y_true = X
# Define loss and optimizer, minimize the squared error
cost = tf.reduce_mean(tf.pow(y_true - y_pred, 2)+sparsity_penalty(encoder_op2))
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(cost)
# Initializing the variables
init = tf.initialize_all_variables()
# Launch the graph
train_data=examples
with tf.Session() as sess:
sess.run(init)
total_batch = int(len(examples)/batch_size)
# Training cycle
for epoch in range(training_epochs):
#random.shuffle(train_data)
# Loop over all batches
for i in range(total_batch):
batch_xs = train_data[i*batch_size:(i+1)*batch_size]
# Run optimization op (backprop) and cost op (to get loss value)
_, c = sess.run([optimizer, cost], feed_dict={X: batch_xs})
if epoch ==2500:
pdb.set_trace()
encode_decode = sess.run(y_pred, feed_dict={X: batch_xs})
# Display logs per epoch step
if epoch % display_step == 0:
print("Epoch:", '%04d' % (epoch+1),"cost=", "{:.9f}".format(c))
print("Optimization Finished!")