I was trying to fine-tune BERT base uncased on a small dataset of 1.5k fields, which is quite small. However, while running
trainer.fit(model, data_module)
it fails when it gets to the model used for training, which is:
class ElectionTagger(pl.LightningModule):
    def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask)
        output = self.classifier(output.pooler_output)
        output = torch.sigmoid(output)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def training_epoch_end(self, outputs):
        labels = []
        predictions = []
        for output in outputs:
            for out_labels in output["labels"].detach().cpu():
                labels.append(out_labels)
            for out_predictions in output["predictions"].detach().cpu():
                predictions.append(out_predictions)
        labels = torch.stack(labels).int()
        predictions = torch.stack(predictions)
        for i, name in enumerate(LABEL_COLUMNS):
            class_roc_auc = auroc(predictions[:, i], labels[:, i])  ##### ERROR ARISES HERE #####
            self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=2e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps
        )
        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
                scheduler=scheduler,
                interval='step'
            )
        )
Instead of training, I got an error that says
TypeError: auroc() missing 1 required positional argument: 'task'
It would be great if anyone could provide a solution to this.
When using the auroc() function, you need to set the task argument to either 'binary', 'multiclass' or 'multilabel'. Alternatively, you can directly use one of binary_auroc(), multiclass_auroc() or multilabel_auroc() instead of auroc(). See the docs for more information.
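For the loop in training_epoch_end above, where each column of LABEL_COLUMNS holds 0/1 targets, a minimal sketch of the fixed call could look like this (assuming auroc comes from torchmetrics.functional, version 0.11 or later):

from torchmetrics.functional import auroc

for i, name in enumerate(LABEL_COLUMNS):
    # Each column is a binary target, so task="binary" fits this per-column loop.
    class_roc_auc = auroc(predictions[:, i], labels[:, i], task="binary")
    self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)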
I'm trying to train the model below.
class mymodel(pl.LightningModule):
    def __init__(self, config, learning_rate=1e-4, max_steps=100000//2):
        super(mymodel, self).__init__()
        self.config = config
        self.save_hyperparameters()
        self.training_losses = []
        self.validation_losses = []
        self.max_steps = max_steps

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.hparams['learning_rate'])

    def forward(self, batch_dict):
        return answer_vector

    def calculate_metrics(self, prediction, labels):
        batch_size = len(prediction)
        ac_score = 0
        for (pred, gt) in zip(prediction, labels):
            ac_score += calculate_acc_score(pred.detach().cpu(), gt.detach().cpu())
        ac_score = ac_score / batch_size
        return ac_score

    def training_step(self, batch, batch_idx):
        answer_vector = self.forward(batch)
        loss = nn.CrossEntropyLoss()(answer_vector.reshape(-1, self.config['classes']), batch['answer'].reshape(-1))
        _, preds = torch.max(answer_vector, dim=-1)
        train_acc = self.calculate_metrics(preds, batch['answer'])
        train_acc = torch.tensor(train_acc)
        return loss

    def validation_step(self, batch, batch_idx):
        logits = self.forward(batch)
        loss = nn.CrossEntropyLoss()(logits.reshape(-1, self.config['classes']), batch['answer'].reshape(-1))
        _, preds = torch.max(logits, dim=-1)
        ## Validation Accuracy
        val_acc = self.calculate_metrics(preds.cpu(), batch['answer'].cpu())
        val_acc = torch.tensor(val_acc)
        ## Logging
        self.log('val_ce_loss', loss, prog_bar=True)
        self.log('val_acc', val_acc, prog_bar=True)
        return {'val_loss': loss, 'val_acc': val_acc}

    def optimizer_step(self, epoch_nb, batch_nb, optimizer, optimizer_i, opt_closure=None, on_tpu=False,
                       using_native_amp=False, using_lbfgs=False):
        ## Warmup for 1000 steps
        if self.trainer.global_step < 1000:
            lr_scale = min(1., float(self.trainer.global_step + 1) / 1000.)
            for pg in optimizer.param_groups:
                pg['lr'] = lr_scale * self.hparams.learning_rate
        ## Linear Decay
        else:
            for pg in optimizer.param_groups:
                pg['lr'] = polynomial(self.hparams.learning_rate, self.trainer.global_step, max_iter=self.max_steps)
        optimizer.step(opt_closure)
        optimizer.zero_grad()
Around the 5th epoch (sometimes a bit earlier or later) I encountered an error that stopped training, so I increased max_steps. But when I increase max_steps (max_steps == 100K), I get this problem with the loss and accuracy:
loss > 100 and acc == 0. I attached a screenshot of this problem.
What changes should I make to the source code so that the model can continue training without this problem?
Updates:
I see. It looks like your optimizer_step is effectively acting as a scheduler, since it manipulates the AdamW learning rate directly. You should instead set up the scheduler in the configure_optimizers function. See https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html?highlight=configure_optimizers#configure-optimizers
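A minimal sketch of that change (the 1000-step warmup and self.max_steps come from the question's optimizer_step; the linear-to-zero decay is an assumption standing in for the undefined polynomial helper):

import torch

def configure_optimizers(self):
    optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams['learning_rate'])

    def lr_lambda(step):
        # Warm up over the first 1000 steps, then decay linearly to 0 at max_steps.
        if step < 1000:
            return float(step + 1) / 1000.0
        return max(0.0, float(self.max_steps - step) / float(self.max_steps - 1000))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    return {
        'optimizer': optimizer,
        'lr_scheduler': {'scheduler': scheduler, 'interval': 'step'},
    }

With this in place, the custom optimizer_step override can be removed entirely; Lightning will step the scheduler once per optimizer step.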
===
old answer:
By error, do you mean the line about val_ce_loss? If so, that's not an error. It means the val_ce_loss of the current epoch is not within the top 1 of historical epochs, so the checkpoint won't be saved to disk. Please refer to the save_top_k argument of the checkpoint callback: https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.ModelCheckpoint.html
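For example, a minimal sketch of such a callback (the metric name val_ce_loss is taken from the question; the rest is illustrative):

from pytorch_lightning.callbacks import ModelCheckpoint

# With save_top_k=1, an epoch whose val_ce_loss is not the best seen so far
# is simply not written to disk; that is the message being printed.
checkpoint_callback = ModelCheckpoint(monitor='val_ce_loss', mode='min', save_top_k=1)
trainer = pl.Trainer(callbacks=[checkpoint_callback])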
I am using MLflow to log metrics, but I want to change the default directory the logs are saved to. Instead of writing the log files beside my main file, I want to store them in /path/outputs/lg. I don't know how to change it. I use MLflow directly inside the Model class below.
import os
from time import time
import mlflow
import numpy as np
import torch
import tqdm
# from segmentation_models_pytorch.utils import metrics
from AICore.emergency_landing.metrics import IoU, F1
from AICore.emergency_landing.utils import AverageMeter
from AICore.emergency_landing.utils import TBLogger
class Model:
    def __init__(self, model, num_classes=5, ignore_index=0, optimizer=None, scheduler=None, criterion=None,
                 device=None, epochs=30, train_loader=None, val_loader=None, tb_logger: TBLogger = None,
                 logger=None,
                 best_model_path=None,
                 model_check_point_path=None,
                 load_from_best_model=None,
                 load_from_model_checkpoint=None,
                 early_stopping=None,
                 debug=False):
        self.debug = debug
        self.early_stopping = {
            'init': early_stopping,
            'changed': 0
        }
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.criterion = criterion
        self.device = device
        self.epochs = epochs
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.model = model.to(device)
        self.tb_logger = tb_logger
        self.logger = logger
        self.best_loss = np.Inf
        if not os.path.exists(best_model_path):
            os.makedirs(best_model_path)
        self.best_model_path = best_model_path
        if not os.path.exists(model_check_point_path):
            os.makedirs(model_check_point_path)
        self.model_check_point_path = model_check_point_path
        self.load_from_best_model = load_from_best_model
        self.load_from_model_checkpoint = load_from_model_checkpoint
        if self.load_from_best_model is not None:
            self.load_model(path=self.load_from_best_model)
        if self.load_from_model_checkpoint is not None:
            self.load_model_checkpoint(path=self.load_from_model_checkpoint)
        self.train_iou = IoU(num_classes=num_classes, ignore_index=ignore_index)
        self.val_iou = IoU(num_classes=num_classes, ignore_index=ignore_index)
        self.test_iou = IoU(num_classes=num_classes, ignore_index=ignore_index)
        self.train_f1 = F1(num_classes=num_classes, ignore_index=ignore_index, mdmc_average='samplewise')
        self.val_f1 = F1(num_classes=num_classes, ignore_index=ignore_index, mdmc_average='samplewise')
        self.test_f1 = F1(num_classes=num_classes, ignore_index=ignore_index, mdmc_average='samplewise')

    def metrics(self, is_train=True):
        if is_train:
            train_losses = AverageMeter('Training Loss', ':.4e')
            train_iou = AverageMeter('Training iou', ':6.2f')
            train_f_score = AverageMeter('Training F_score', ':6.2f')
            return train_losses, train_iou, train_f_score
        else:
            val_losses = AverageMeter('Validation Loss', ':.4e')
            val_iou = AverageMeter('Validation mean iou', ':6.2f')
            val_f_score = AverageMeter('Validation F_score', ':6.2f')
            return val_losses, val_iou, val_f_score
    def fit(self):
        self.logger.info("\nStart training\n\n")
        start_training_time = time()
        with mlflow.start_run():
            for e in range(self.epochs):
                start_training_epoch_time = time()
                self.model.train()
                train_losses_avg, train_iou_avg, train_f_score_avg = self.metrics(is_train=True)
                with tqdm.tqdm(self.train_loader, unit="batch") as tepoch:
                    tepoch.set_description(f"Epoch {e}")
                    for image, target in tepoch:
                        # Transfer Data to GPU if available
                        image = image.to(self.device)
                        target = target.to(self.device)
                        # Clear the gradients
                        self.optimizer.zero_grad()
                        # Forward Pass
                        # out = self.model(image)['out']
                        # if unet == true => remove ['out']
                        out = self.model(image)
                        # Find the Loss
                        loss = self.criterion(out, target)
                        # Calculate Loss
                        train_losses_avg.update(loss.item(), image.size(0))
                        # Calculate gradients
                        loss.backward()
                        # Update Weights
                        self.optimizer.step()
                        iou = self.train_iou(out.cpu(), target.cpu()).item()
                        train_iou_avg.update(iou)
                        f1_score = self.train_f1(out.cpu(), target.cpu()).item()
                        train_f_score_avg.update(f1_score)
                        tepoch.set_postfix(loss=train_losses_avg.avg,
                                           iou=train_iou_avg.avg,
                                           f_score=train_f_score_avg.avg)
                        if self.debug:
                            break
                self.tb_logger.log(log_type='criterion/training', value=train_losses_avg.avg, epoch=e)
                self.tb_logger.log(log_type='iou/training', value=train_iou_avg.avg, epoch=e)
                self.tb_logger.log(log_type='f_score/training', value=train_f_score_avg.avg, epoch=e)
                mlflow.log_metric('criterion/training', train_losses_avg.avg, step=e)
                mlflow.log_metric('iou/training', train_iou_avg.avg, step=e)
                mlflow.log_metric('f_score/training', train_f_score_avg.avg, step=e)
                end_training_epoch_time = time() - start_training_epoch_time
                print('\n')
                self.logger.info(
                    f'Training Results - [{end_training_epoch_time:.3f}s] Epoch: {e}:'
                    f' f_score: {train_f_score_avg.avg:.3f},'
                    f' IoU: {train_iou_avg.avg:.3f},'
                    f' Loss: {train_losses_avg.avg:.3f}')
                # validation step
                val_loss = self.evaluation(e)
                # apply scheduler
                if self.scheduler:
                    self.scheduler.step()
                # early stopping
                if self.early_stopping['init'] >= self.early_stopping['changed']:
                    self._early_stopping_model(val_loss=val_loss)
                else:
                    print(f'The model can not learn more, Early Stopping at epoch[{e}]')
                    break
                # save best model
                if self.best_model_path is not None:
                    self._best_model(val_loss=val_loss, path=self.best_model_path)
                # model check points
                if self.model_check_point_path is not None:
                    self.save_model_check_points(path=self.model_check_point_path, epoch=e, net=self.model,
                                                 optimizer=self.optimizer, loss=self.criterion,
                                                 avg_loss=train_losses_avg.avg)
                # log mlflow
                if self.scheduler:
                    mlflow.log_param("get_last_lr", self.scheduler.get_last_lr())
                    mlflow.log_param("scheduler", self.scheduler.state_dict())
                self.tb_logger.flush()
                if self.debug:
                    break
        end_training_time = time() - start_training_time
        print(f'Finished Training after {end_training_time:.3f}s')
        self.tb_logger.close()
    def evaluation(self, epoch):
        print('Validating...')
        start_validation_epoch_time = time()
        self.model.eval()  # Optional when not using Model Specific layer
        with torch.no_grad():
            val_losses_avg, val_iou_avg, val_f_score_avg = self.metrics(is_train=False)
            with tqdm.tqdm(self.val_loader, unit="batch") as tepoch:
                for image, target in tepoch:
                    # Transfer Data to GPU if available
                    image = image.to(self.device)
                    target = target.to(self.device)
                    # out = self.model(image)['out']
                    # if unet == true => remove ['out']
                    out = self.model(image)
                    # Find the Loss
                    loss = self.criterion(out, target)
                    # Calculate Loss
                    val_losses_avg.update(loss.item(), image.size(0))
                    iou = self.val_iou(out.cpu(), target.cpu()).item()
                    val_iou_avg.update(iou)
                    f1_score = self.val_f1(out.cpu(), target.cpu()).item()
                    val_f_score_avg.update(f1_score)
                    tepoch.set_postfix(loss=val_losses_avg.avg,
                                       iou=val_iou_avg.avg,
                                       f_score=val_f_score_avg.avg)
                    if self.debug:
                        break
            print('\n')
            self.tb_logger.log(log_type='criterion/validation', value=val_losses_avg.avg, epoch=epoch)
            self.tb_logger.log(log_type='iou/validation', value=val_iou_avg.avg, epoch=epoch)
            self.tb_logger.log(log_type='f_score/validation', value=val_f_score_avg.avg, epoch=epoch)
            mlflow.log_metric('criterion/validation', val_losses_avg.avg, step=epoch)
            mlflow.log_metric('iou/validation', val_iou_avg.avg, step=epoch)
            mlflow.log_metric('f_score/validation', val_f_score_avg.avg, step=epoch)
            end_validation_epoch_time = time() - start_validation_epoch_time
            self.logger.info(
                f'validation Results - [{end_validation_epoch_time:.3f}s] Epoch: {epoch}:'
                f' f_score: {val_f_score_avg.avg:.3f},'
                f' IoU: {val_iou_avg.avg:.3f},'
                f' Loss: {val_losses_avg.avg:.3f}')
            print('\n')
        return val_losses_avg.avg
    def _save_model(self, name, path, params):
        torch.save(params, path)

    def _early_stopping_model(self, val_loss):
        if self.best_loss < val_loss:
            self.early_stopping['changed'] += 1
        else:
            self.early_stopping['changed'] = 0

    def _best_model(self, val_loss, path):
        if self.best_loss > val_loss:
            self.best_loss = val_loss
            name = f'/best_model_loss_{self.best_loss:.2f}'.replace('.', '_')
            self._save_model(name, path=f'{path}/{name}.pt', params={
                'model_state_dict': self.model.state_dict(),
            })
            print(f'The best model is saved with criterion: {self.best_loss:.2f}')

    def save_model_check_points(self, path, epoch, net, optimizer, loss, avg_loss):
        name = f'/model_epoch_{epoch}_loss_{avg_loss:.2f}'.replace('.', '_')
        self._save_model(name, path=f'{path}/{name}.pt', params={
            'epoch': epoch,
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'criterion': loss,
        })
        print(f'model checkpoint is saved at model_epoch_{epoch}_loss_{avg_loss:.2f}')

    def load_model_checkpoint(self, path):
        checkpoint = torch.load(path)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']
        self.criterion = checkpoint['criterion']
        return epoch

    def load_model(self, path):
        best_model = torch.load(path)
        self.model.load_state_dict(best_model['model_state_dict'])
The solution is:
mlflow.set_tracking_uri(uri=f'file://{hydra.utils.to_absolute_path("../output/mlruns")}')
exp = mlflow.get_experiment_by_name(name='Emegency_landing')
if not exp:
    experiment_id = mlflow.create_experiment(
        name='Emegency_landing',
        artifact_location=f'file://{hydra.utils.to_absolute_path("../output/mlruns")}')
else:
    experiment_id = exp.experiment_id
And then you should pass the experiment ID to:
with mlflow.start_run(experiment_id=experiment_id):
    pass
If you don't specify the /path/mlruns directory, then when you run the mlflow ui command it will automatically create another folder named mlruns. So pay attention to this point and keep the folder name mlruns.
I am currently trying to train a CNN using PyTorch to predict a subject's age. The ages range from 0 to 116. I used the same model to train a gender classifier with two options: male or female.
When I ported the same code to age classification, I was getting errors. The error was due to our last fully connected layer not returning a large enough output (in terms of matrix size: it was initially returning a 50 x 2 matrix for the gender classifier, and I switched it to 50 x 117 for age classification, based on the total number of age options).
My issue now is that the training loop prints epochs with a huge loss (~3.5, whereas for the gender classification it was sub-zero).
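As a point of reference (an observation, not from the training run itself): a randomly initialized softmax classifier starts near a cross-entropy of ln(N) for N classes, so the starting loss for 117 classes is naturally much larger than for 2. A quick check:

import math

print(math.log(2))    # ~0.693 baseline loss for the 2-class gender model
print(math.log(117))  # ~4.762 baseline loss for the 117-class age model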
Below is my code:
DataLoader class:
class MyDataset(Dataset):
    def __init__(self, root_directory, csv_file, image_path, transform=None):
        annotated_path = os.path.relpath(csv_file)  # Path to UTKFace Dataset and Annotations
        self.read_in_csv = pd.read_csv(annotated_path, index_col=False)
        self.image_path = os.path.join(root_directory, image_path)
        self.transform = transform
        self.labels = np.asarray(self.read_in_csv.loc[:, 'age'])

    def __getitem__(self, index):
        attr = self.labels[index]
        image_name = str(self.read_in_csv.loc[index, 'file'])
        image = Image.open(image_name)
        if self.transform:
            image = self.transform(image)
        dict = {'image': image, 'label': attr}
        return dict

    def __len__(self):
        return len(self.read_in_csv.index)
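A hypothetical way to wire this dataset up (the paths are placeholders, not from the original post; 227x227 is assumed because that input size makes the flattened feature map match fc1's 384*6*6 input in the CNN below):

from torch.utils.data import DataLoader
from torchvision import transforms

transform = transforms.Compose([
    transforms.Resize((227, 227)),  # assumed input size; yields 384 x 6 x 6 after layer3
    transforms.ToTensor(),
])
dataset = MyDataset(root_directory='.', csv_file='utkface_annotations.csv',
                    image_path='UTKFace', transform=transform)
train_loader = DataLoader(dataset, batch_size=50, shuffle=True)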
CNN Architecture:
class ConvolutionalNN(nn.Module):
    def __init__(self):
        super(ConvolutionalNN, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=7, stride=4),
            nn.BatchNorm2d(96),  # Number of Features
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(96, 256, kernel_size=5, padding=2),
            nn.BatchNorm2d(256),
            nn.ReLU(),  # Default = False
            nn.MaxPool2d(kernel_size=3, stride=2))
        self.layer3 = nn.Sequential(
            nn.Conv2d(256, 384, kernel_size=3, padding=1),
            nn.BatchNorm2d(384),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2))
        self.fc1 = nn.Linear(384*6*6, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 117)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = out.view(out.size(0), -1)
        # print(out.size())
        out = F.dropout(F.relu(self.fc1(out)))
        out = F.dropout(F.relu(self.fc2(out)))
        out = self.fc3(out)
        return out
Training Loop:
def training_loop(checkpoint=None, best=False):
    current_epoch = 1
    num_epochs = 50
    train_acc_history = []
    val_acc_history = []
    epoch_history = []
    learning_rate = 0.001
    best_val_acc = 0.0
    is_best = False
    criterion = nn.CrossEntropyLoss()
    ## Predict the Age and Gender of the Human in the Image
    optimizer = torch.optim.SGD(cnn.parameters(), lr=0.001, momentum=0.9)
    if checkpoint is not None:
        is_best = best
        current_epoch = checkpoint['epoch']
        train_acc_history = checkpoint['train_acc_history']
        val_acc_history = checkpoint['val_acc_history']
        best_val_acc = checkpoint['best_val_acc']
        optimizer.load_state_dict(checkpoint['optimizer'])
        epoch_history = checkpoint['epoch_history']
    print('Uploading our images now...')
    for epoch in range(current_epoch, num_epochs + current_epoch):
        print('Starting epoch %d / %d' % (epoch + 1, num_epochs + current_epoch))
        print('Learning Rate for this epoch: {}'.format(learning_rate))
        for i, batch in enumerate(train_loader):
            images, labels = batch['image'], batch['label']
            images = images.clone().detach()
            labels = labels.clone().detach()
            if use_gpu:
                images = images.cuda()
                labels = labels.cuda()
            optimizer.zero_grad()
            pred_labels = cnn(images)
            loss = criterion(pred_labels, labels)
            loss.backward()
            optimizer.step()
So this is my code; it does not seem to be training well.
Please let me know what could be done to fix this.
So I tried implementing a convolutional neural network on the MNIST dataset, in a similar fashion to this: https://github.com/tensorflow/tensorflow/blob/r1.1/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
However, on doing that, I noticed that for some reason my second max_pool is not happening. Also, I don't understand how the code in the above link works; more specifically, how can the nn_layer method be reused when the weights exist only in that scope, and wouldn't calling it twice change them?
My code:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import os
from tensorflow.contrib.tensorboard.plugins import projector

current_path = os.path.dirname(os.path.realpath(__file__))
current_path = current_path + "/logs"

def train():
    mnist = input_data.read_data_sets("MNIST_data", one_hot=True)

    def initializer(shape):
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial)

    def conv2d(x, W):
        return tf.nn.conv2d(x, W, [1, 1, 1, 1], padding="SAME")

    def max_pool(x):
        return tf.nn.max_pool(x, [1, 2, 2, 1], [1, 2, 2, 1], padding="SAME")

    def conv_layer(x, length, width, input_channels, output_channels, layer_name, act=tf.nn.relu):
        with tf.name_scope(layer_name):
            with tf.name_scope('weights'):
                weights = initializer([length, width, input_channels, output_channels])
                tf.summary.histogram(layer_name + "_weights", weights)
            with tf.name_scope('biases'):
                biases = initializer([output_channels])
                tf.summary.histogram(layer_name + "_biases", biases)
            with tf.name_scope('activations'):
                activations = act(conv2d(x, weights) + biases)
                activations = max_pool(activations)
                tf.summary.histogram(layer_name + "_activations", activations)
            return activations

    def dense_layer(x, input_size, output_size, layer_name, act=tf.nn.relu):
        with tf.name_scope(layer_name):
            with tf.name_scope('weights'):
                weights = initializer([input_size, output_size])
                tf.summary.histogram(layer_name + "_weights", weights)
            with tf.name_scope('biases'):
                biases = initializer([output_size])
                tf.summary.histogram(layer_name + "_biases", biases)
            with tf.name_scope('activations'):
                activations = act(tf.matmul(x, weights) + biases)
                tf.summary.histogram(layer_name + "_activations", activations)
            return activations

    def dropout(x, keep_prob):
        with tf.name_scope('Dropout'):
            dropped = tf.nn.dropout(x, keep_prob)
        return dropped

    with tf.name_scope('input'):
        x = tf.placeholder(tf.float32, [None, 784], name='image_inputs')
        y = tf.placeholder(tf.float32, [None, 10], name='image_labels')
        keep_prob = tf.placeholder(tf.float32, name='keep_probability')

    with tf.name_scope('input_reshape'):
        x_image = tf.reshape(x, [-1, 28, 28, 1])
        tf.summary.image('input', x_image, 50)

    h1 = conv_layer(x_image, 3, 3, 1, 32, "first_convolution_layer")
    h2 = conv_layer(h1, 3, 3, 32, 64, "second_convolution_layer")
    h2 = tf.reshape(h1, [-1, 7*7*64])
    h2 = dropout(h2, keep_prob)
    h3 = dense_layer(h2, 7*7*64, 1024, "first_dense_layer")
    h3 = dropout(h3, keep_prob)
    h4 = dense_layer(h3, 1024, 1024, "second_dense_layer")
    h4 = dropout(h4, keep_prob)
    h_out = dense_layer(h4, 1024, 10, "output_dense_layer", act=tf.nn.sigmoid)

    with tf.name_scope("Loss"):
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=h_out))
        tf.summary.scalar('Loss', cost)
    train = tf.train.AdamOptimizer().minimize(cost)

    with tf.name_scope("Accuracy"):
        correct_pred = tf.equal(tf.argmax(h_out, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    summary = tf.summary.merge_all()
    init = tf.global_variables_initializer()
    sess = tf.InteractiveSession()
    sess.run(init)
    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(current_path, sess.graph)

    for i in range(500):
        batch = mnist.train.next_batch(500)
        if i % 100 == 0:
            summary_str = sess.run(summary, feed_dict={x: batch[0], y: batch[1], keep_prob: 1.0})
            summary_writer.add_summary(summary_str, i)
            summary_writer.flush()
            train_accuracy = accuracy.eval(feed_dict={x: batch[0], y: batch[1], keep_prob: 1.0})
            saver.save(sess, os.path.join(current_path, 'model.ckpt'), i)
            print("Step %d Training Accuracy: %f" % ((i/100 + 1), train_accuracy))
        train.run(feed_dict={x: batch[0], y: batch[1], keep_prob: 0.5})

    sum = 0.0
    for i in range(10):
        batch_x = mnist.test.images[(i*1000):((i+1)*1000)-1]
        batch_y = mnist.test.labels[(i*1000):((i+1)*1000)-1]
        sum = sum + accuracy.eval(feed_dict={x: batch_x, y: batch_y, keep_prob: 1.0})
    print("Test Accuracy: %f" % (sum/10.0))

if tf.gfile.Exists(current_path):
    tf.gfile.DeleteRecursively(current_path)
tf.gfile.MakeDirs(current_path)
train()
This is a simple typo.
Change this
h2 = tf.reshape(h1, [-1, 7*7*64])
to this
h2 = tf.reshape(h2, [-1, 7*7*64])
The error
InvalidArgumentError (see above for traceback): logits and labels must be same size: logits_size=[1000,10] labels_size=[500,10]
[[Node: Loss/SoftmaxCrossEntropyWithLogits = SoftmaxCrossEntropyWithLogits[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](Loss/Reshape, Loss/Reshape_1)]]
went away.
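As for the other part of the question, how nn_layer (or conv_layer here) can be reused: each call executes tf.Variable again, which always creates brand-new variables under a fresh name scope, so the two layers get independent weights rather than overwriting each other. A small illustration (TF1-style, with hypothetical names):

import tensorflow as tf

def tiny_layer(x, layer_name):
    with tf.name_scope(layer_name):
        # tf.Variable creates a NEW variable on every call; nothing is shared.
        w = tf.Variable(tf.truncated_normal([1], stddev=0.1), name='w')
        return x * w

inp = tf.placeholder(tf.float32, [None])
h1 = tiny_layer(inp, 'first')   # creates first/w
h2 = tiny_layer(h1, 'second')   # creates second/w, independent of first/w
print([v.name for v in tf.global_variables()])  # ['first/w:0', 'second/w:0']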