System hangs after first epoch training in pytorch - machine-learning

So, I was trying to train on ResNet model in PyTorch using the ImageNet example in the GitHub repository.
Here's what my train method looks like (it is almost similar to that in example)
def train(train_loader, model, criterion, optimizer, epoch):
batch_time = AverageMeter()
data_time = AverageMeter()
losses = AverageMeter()
top1 = AverageMeter()
top5 = AverageMeter()
args = get_args()
# switch to train mode
model.train()
end = time.time()
for i, (input, target) in enumerate(train_loader):
print(i)
# data loading time
data_time.update(time.time() - end)
if cuda:
target = target.cuda(async = True)
input_var = torch.autograd.Variable(input).cuda()
else:
input_var = torch.autograd.Variable(input)
target_var = torch.autograd.Variable(target)
# compute output
output = model(input_var)
loss = criterion(output, target_var)
# measure accuracy and record loss
prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
losses.update(loss.item(), input.size(0))
top1.update(prec1.item(), input.size(0))
# top5.update(prec5.item(), input.size(0))
# compute gradient and do optimizer step
optimizer.zero_grad()
loss.backward()
optimizer.step()
#measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
# print to console and write logs to tensorboard
if i % args.print_freq == 0:
print('Epoch: [{0}][{1}/{2}]\t'
'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
'Prec#1 {top1.val:.3f} ({top1.avg:.3f})\t'.format(
epoch, i, len(train_loader), batch_time=batch_time,
data_time=data_time, loss=losses, top1=top1, top5=top5))
niter = epoch * len(train_loader) + i
# writer.add_scalar('Train/Loss', losses.val, niter)
# writer.add_scalar('Train/Prec#1', top1.val, niter)
# writer.add_scalar('Train/Prec#5', top5.val, niter)
System Information:
GPU: Nvidia Titan XP
Memory: 32 Gb
PyTorch: 0.4.0
When I run this code, training starts with epoch 0
Epoch: [0][0/108] Time 5.644 (5.644) Data 1.929 (1.929) Loss 6.9052 (6.9052) Prec#1 0.000 (0.000)
And then the remote server automatically disconnects. It happened five times.
And this is the data loader:
#Load the Data --> TRAIN
traindir = 'train'
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
train_dataset = datasets.ImageFolder(traindir, transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
normalize,
]))
train_loader = torch.utils.data.DataLoader(
train_dataset,
batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers,
pin_memory=cuda
)
# Load the data --> Validation
valdir = 'valid'
valid_loader = torch.utils.data.DataLoader(
datasets.ImageFolder(valdir, transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,
])),
batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers,
pin_memory=cuda
)
if args.evaluate:
validate(valid_loader, model, criterion, epoch=0)
return
# Start
for epoch in range(args.start_epoch, args.epochs):
adjust_learning_rate(optimizer, epoch)
# train for epoch
train(train_loader, model, criterion, optimizer, epoch)
# evaluate on valid
prec1 = validate(valid_loader, model, criterion, epoch)
# remember best prec1 and save checkpoint
is_best = prec1 > best_prec1
best_prec1 = max(prec1, best_prec1)
save_checkpoint({
'epoch': epoch + 1,
'arch': args.arch,
'state_dict': model.state_dict(),
'best_prec1': best_prec1,
'optimizer': optimizer.state_dict()
}, is_best)
With this params for the loader:
args.num_workers = 4
args.batch_size = 32
pin_memory = torch.cuda.is_available()
Is there something wrong in my approach?

seems a bug in pytorch's dataloader.
try args.num_workers = 0

Related

PyTorch Siamese Network Oscillating / Fluctuating Loss Function

I have implemented a siamese NN for regression using the resnet18 for transfer learning. The goal is to calculate the correlation coefficient between two images, since we do not have raw data but only images for a specific signal. We want to measure similarity between images. However the loss function of my nn is always oscillating up and down.
Code below:
Model itself
class firstNN(nn.Module):
def __init__(self):
# wofür ist das da?
super(firstNN, self).__init__()
self.resnet = models.resnet18(pretrained=True)
for param in self.resnet.parameters():
param.requires_grad = False
# over-write the first conv layer to be able to read images
# as resnet18 reads (3,x,x) where 3 is RGB channels
# whereas MNIST has (1,x,x) where 1 is a gray-scale channel
self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
self.fc_in_features = self.resnet.fc.in_features
# remove the last layer of resnet18 (linear layer which is before avgpool layer)
self.resnet = torch.nn.Sequential(*(list(self.resnet.children())[:-1]))
# add linear layers to compare between the features of the two images
self.fc = nn.Sequential(
nn.Linear(self.fc_in_features, hidden_dim),
torch.nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim2),
torch.nn.ReLU(),
nn.Linear(hidden_dim2,hidden_dim3),
torch.nn.ReLU(),
nn.Linear(hidden_dim3,1),
# nn.ReLU(),
# nn.Linear(input_dim, third_dim),
)
# Distance function
self.binary = False
# Get params and register optimizer
info, params = self.get_model_params()
#self.optimizer = optim.Adam(params, lr=learning_rate,
# weight_decay=weight_decay)
# self.optimizer = optim.SGD(params, lr=learning_rate,
# momentum=0.5)
#self.criterion = nn.BCELoss()
#self.criterion = nn.MSELoss()
LOGGER.info(info)
# Initialisiert die weights mit "random" Werten
def init_layers(self):
nn.init.xavier_normal(self.fc[0].weight.data).to(device)
nn.init.xavier_normal(self.fc[2].weight.data).to(device)
# Erstellt NN mit dem input, inputs ist unser batch
def siamese_basic(self, inputs):
output = self.resnet(inputs)
output = output.view(output.size()[0], -1)
output = self.fc(output)
return output
def distance_layer(self, vec1, vec2, distance='cos'):
if distance == 'cos':
similarity = F.cosine_similarity(
vec1 + 1e-16, vec2 + 1e-16, dim=-1)
elif distance == 'l1':
similarity = self.dist_fc(torch.abs(vec1 - vec2))
similarity = similarity.squeeze(1)
elif distance == 'l2':
similarity = self.dist_fc(torch.abs(vec1 - vec2) ** 2)
similarity = similarity.squeeze(1)
ic()
#if self.binary:
# similarity = F.sigmoid(similarity)
return similarity
def forward(self, template, img):
embed1 = self.siamese_basic(template)
embed2 = self.siamese_basic(img)
# print(f"Before reshape embed2 {embed2.shape}")
# print(f"Befor reshape embed1 {embed1.shape}")
embed1 = embed1.reshape(template.shape[0],-1).float()
embed2 = embed2.reshape(img.shape[0],-1).float()
similarity = self.distance_layer(embed1, embed2)
# print(f"embed2 {embed2.shape}")
# print(f"embed1 {embed1.shape}")
# print(f"similarity {similarity.shape}")
ic()
return similarity#, embed1, embed2
def get_loss(self, outputs, targets):
loss = self.criterion(outputs, targets)
ic()
return loss
def get_model_params(self):
params = []
total_size = 0
def multiply_iter(p_list):
out = 1
for p in p_list:
out *= p
return out
for p in self.parameters():
if p.requires_grad:
params.append(p)
total_size += multiply_iter(p.size())
return '{}\nparam size: {:,}\n'.format(self, total_size), params
def save_checkpoint(self, state, checkpoint_dir, filename):
filename = checkpoint_dir + filename
LOGGER.info('Save checkpoint %s' % filename)
torch.save(state, filename)
def load_checkpoint(self, checkpoint_dir, filename):
filename = checkpoint_dir + filename
LOGGER.info('Load checkpoint %s' % filename)
checkpoint = torch.load(filename)
self.load_state_dict(checkpoint['state_dict'])
self.optimizer.load_state_dict(checkpoint['optimizer'])
Choice of criterion etc
model = firstNN()
criterion = nn.MSELoss()
#optimizer = optim.Adam(model.parameters(), lr=learning_rate,
# weight_decay=weight_decay)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
training_data = CustomImageDataset("")
# Create data loaders.
train_loader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
hidden_dim = 128
hidden_dim2 = 64
hidden_dim3 = 32
do_learn = True
save_frequency = 2
batch_size = 40 if torch.cuda.is_available() else 64
learning_rate = 0.0001
num_epochs = 15
weight_decay = 0.1
momentum = 0.9
loss_history = []
r2_history = []
loss_history2 = []
r2_history2 = []
LOGGER = logging.getLogger(__name__)
torch.cuda.empty_cache()
model = firstNN().to(device)
model.train()
for epoch in range (num_epochs):
running_r2 = 0.0
running_loss = 0.0
for batch_idx, (templates, images, targets) in enumerate(train_loader):
templates = templates.unsqueeze(1).float().to(device)
images = images.unsqueeze(1).float().to(device)
targets = targets.float().to(device)
optimizer.zero_grad()
outputs = model(templates, images)
loss = criterion(outputs, targets)
loss.backward()
optimizer.step()
r2score = torchmetrics.R2Score().to(device)
rscore = r2score(outputs, torch.tensor(targets).squeeze())
running_loss += loss.item()
running_r2 += rscore.item()
loss_history2.append(loss.item())
r2_history2.append(rscore.item())
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tR2Score: {}'.format(
epoch, batch_idx * len(templates), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item(), rscore ))
running_loss = running_loss / len(train_loader)
running_r2 = running_r2 / len(train_loader)
loss_history.append(running_loss)
r2_history.append(running_r2)
Example of images with spearman correlation of 0.45
Example of Oscillating loss and r2
I have tried using several different learning rates and experimented with weight decay and change of optimizer / nn setup but I dont understant exactly how to combat the issue.

Resuming Training PyTorch

I'm attempting to save and load best model through torch, where I've defined my training function as follows:
def train_model(model, train_loader, test_loader, device, learning_rate=1e-1, num_epochs=200):
# The training configurations were not carefully selected.
criterion = nn.CrossEntropyLoss()
model.to(device)
# It seems that SGD optimizer is better than Adam optimizer for ResNet18 training on CIFAR10.
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-4)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=500)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[65, 75], gamma=0.75, last_epoch=-1)
# optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
# Evaluation
model.eval()
eval_loss, eval_accuracy = evaluate_model(model=model, test_loader=test_loader, device=device, criterion=criterion)
print("Epoch: {:02d} Eval Loss: {:.3f} Eval Acc: {:.3f}".format(-1, eval_loss, eval_accuracy))
load_model = input('Load a model?')
for epoch in range(num_epochs):
if epoch//2 == 0:
write_checkpoint(model=model, epoch=epoch, scheduler=scheduler, optimizer=optimizer)
model, optimizer, epoch, scheduler = load_checkpoint(model=model, scheduler=scheduler, optimizer=optimizer)
for state in optimizer.state.values():
for k, v in state.items():
if isinstance(v, torch.Tensor):
state[k] = v.to(device)
# Training
model.train()
running_loss = 0
running_corrects = 0
for inputs, labels in train_loader:
inputs = torch.FloatTensor(inputs)
inputs = inputs.to(device)
labels = labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
train_loss = running_loss / len(train_loader.dataset)
train_accuracy = running_corrects / len(train_loader.dataset)
# Evaluation
model.eval()
eval_loss, eval_accuracy = evaluate_model(model=model, test_loader=test_loader, device=device, criterion=criterion)
# Set learning rate scheduler
scheduler.step()
print("Epoch: {:03d} Train Loss: {:.3f} Train Acc: {:.3f} Eval Loss: {:.3f} Eval Acc: {:.3f}".format(epoch, train_loss, train_accuracy, eval_loss, eval_accuracy))
return model
Where I'd like to be able to load a model, and start training from the epoch where model was saved.
So far I have methods to save model, optimizer,scheduler states and the epoch via
def write_checkpoint(model, optimizer, epoch, scheduler):
state = {'epoch': epoch + 1, 'state_dict': model.state_dict(),
'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), }
filename = '/content/model_'
torch.save(state, filename + f'CP_epoch{epoch + 1}.pth')
def load_checkpoint(model, optimizer, scheduler, filename='/content/checkpoint.pth'):
# Note: Input model & optimizer should be pre-defined. This routine only updates their states.
start_epoch = 0
if os.path.isfile(filename):
print("=> loading checkpoint '{}'".format(filename))
checkpoint = torch.load(filename)
start_epoch = checkpoint['epoch']
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
scheduler = checkpoint['scheduler']
print("=> loaded checkpoint '{}' (epoch {})"
.format(filename, checkpoint['epoch']))
else:
print("=> no checkpoint found at '{}'".format(filename))
return model, optimizer, start_epoch, scheduler
But I can't seem to come up with the logic of how I'd update the epoch to start at the correct one. Looking for hints or ideas on how to implement just that.
If I understand correctly you trying to resume training from last progress with correct epoch number.
Before calling train_model load the checkpoint values including start_epoch. Then use start_epoch as loop starting point,
for epoch in range(start_epoch, num_epochs):

Pytorch model loss and accuracy remain constant

I need to create a model that takes as input a 351x351x11 Tensor and gives as output a 351x351x11 Tensor (it is an Autoencoder). The two tensors are made of 0s and 1s.
This is the model:
class AutoEncoder(nn.Module):
def __init__(self):
super(AutoEncoder, self).__init__()
self.down_layers=nn.ModuleList()
self.up_layers=nn.ModuleList()
self.n_layers = 1
self.down_layers.append(nn.Conv3d(5,1,(3,3,1)))
self.up_layers.append(nn.ConvTranspose3d(1,5,(3,3,1)))
for d_l in self.down_layers:
torch.nn.init.normal_(d_l.weight, mean=0.5, std=0.7)
for u_l in self.up_layers:
torch.nn.init.normal_(u_l.weight, mean=0.5, std=0.7)
def encode(self, x):
# Encoder
for i in range(len(self.down_layers)):
x = self.down_layers[i](x)
x = torch.sigmoid(x)
return x
def forward(self, x):
# Decoder
x = self.encode(x)
for i in range(len(self.up_layers)):
x = self.up_layers[i](x)
x = torch.sigmoid(x)
if(i==(len(self.up_layers)-1)):
x = torch.round(x)
return x
This is the training function:
max_e,max_p = 351,11 #tensor dimensions
DEVICE = get_device() #device is cpu
EPOCHS = 100
BATCHSIZE=5
try:
print("Start model",flush=True)
# Generate the model.
model = AutoEncoder().to(DEVICE)
lr = 0.09
optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)
# Training of the model.
for epoch in range(EPOCHS):
model.train()
"""
I have to create 25 dataloaders for 50000 training samples (each of 2000 samples) to avoid memory congestion.
"""
for i in range(25):
train_loader,X_train_shape=get_dataset(i)
N_TRAIN_EXAMPLES = X_train_shape
for batch_idx, (data, target) in enumerate(train_loader):
if batch_idx * BATCHSIZE >= N_TRAIN_EXAMPLES:
break
data, target = data[None, ...].to(DEVICE, dtype=torch.float), target[None, ...].to(DEVICE, dtype=torch.float)
optimizer.zero_grad()
output = model(data)
loss = torch.nn.BCELoss()
loss = loss(output, target)
loss.backward()
optimizer.step()
#remove train data loader from memory
del train_loader
print("VALIDATION",flush=True)
# Validation of the model.
model.eval()
correct = 0
tot = 0
with torch.no_grad():
"""
Same with the training, 10 data loaders for 20000 samples
"""
for i in range(25,35):
valid_loader,X_valid_shape=get_dataset(i)
N_VALID_EXAMPLES = X_valid_shape
for batch_idx, (data, target) in enumerate(valid_loader):
# Limiting validation data.
if batch_idx * BATCHSIZE >= N_VALID_EXAMPLES:
break
data, target = data[None, ...].to(DEVICE, dtype=torch.float), target[None, ...].to(DEVICE, dtype=torch.float)
output = model(data)
# count the number of 1s and 0s predicted correctly
newCorrect= output(target.view_as(output)).sum().item()
correct += newCorrect
tot +=max_e*max_e*max_p*BATCHSIZE
del valid_loader
accuracy = correct*100 / tot
print('Epoch: {} Loss: {} Accuracy: {} %'.format(epoch, loss.data, accuracy),flush=True)
the function that returns the data loader is:
def get_dataset(i):
X_train=[]
Y_train=[]
for j in range(i*2000,(i+1)*2000):
t = torch.load("/home/ubuntu/data/home/ubuntu/deeplogic/el_dataset/x/scene{}.pt".format(j))
X_train.append(t)
t = torch.load("/home/ubuntu/data/home/ubuntu/deeplogic/el_dataset/y/scene{}.pt".format(j))
Y_train.append(t)
train_x = torch.from_numpy(np.array(X_train)).float()
train_y = torch.from_numpy(np.array(Y_train)).float()
batch_size = 1
train = torch.utils.data.TensorDataset(train_x,train_y)
# data loader
train_loader = torch.utils.data.DataLoader(train, batch_size = batch_size, shuffle = True)
return train_loader,len(X_train)
The prints I got are :
Epoch: 1 Loss: 99.80729675292969 Accuracy: 0.19852701903983955 %
Epoch: 2 Loss: 99.80729675292969 Accuracy: 0.19852701903983955 %
Epoch: 3 Loss: 99.80729675292969 Accuracy: 0.19852701903983955 %
Epoch: 4 Loss: 99.80729675292969 Accuracy: 0.19852701903983955 %
x = torch.round(x) prevents you from updating your model because it's non-differentiable. More importantly, x = torch.round(x) is redundant for BCELoss. You should move it validation step only. Also, the newCorrect in your validation loop does not compare with target values. (I add the missing eq() in your code.)
# in validation loop
preds = torch.round(output)
newCorrect= preds.eq(target.view_as(preds)).sum().item()

Training Loss When Resuming From a Checkpoint Explodes

I am trying to implement a function in my algorithm which allows me to resume training from a checkpoint. The problem is that when I resume training, my loss explodes by many orders of magnitude, from the order to 0.001 to 1000. I suspect that the problem may be that when training is resumed, the learning rate is not being set properly.
Here is my training function:
def train_gray(epoch, data_loader, device, model, criterion, optimizer, i, path):
train_loss = 0.0
for data in data_loader:
img, _ = data
img = img.to(device)
stand_dev = 0.0392
noisy_img = add_noise(img, stand_dev, device)
output = model(noisy_img, stand_dev)
output = output[:,0:1,:,:]
loss = criterion(output, img)
optimizer.zero_grad()
loss.backward()
optimizer.step()
train_loss += loss.item()*img.size(0)
train_loss = train_loss/len(data_loader)
print('Epoch: {} Complete \tTraining Loss: {:.6f}'.format(
epoch,
train_loss
))
return train_loss
And here is my main function that initialises my variables, loads a checkpoint, calls my training function, and saves a checkpoint after an epoch of training:
def main():
now = datetime.now()
current_time = now.strftime("%H_%M_%S")
path = "/home/bledc/my_remote_folder/denoiser/models/{}_sigma_10_session2".format(current_time)
os.mkdir(path)
width = 256
# height = 256
num_epochs = 25
batch_size = 4
learning_rate = 0.0001
data_loader = load_dataset(batch_size, width)
model = UNetWithResnet50Encoder().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(
model.parameters(), lr=learning_rate, weight_decay=1e-5)
############################################################################################
# UNCOMMENT CODE BELOW TO RESUME TRAINING FROM A MODEL
model_path = "/home/bledc/my_remote_folder/denoiser/models/resnet_sigma_10/model_epoch_10.pt"
save_point = torch.load(model_path)
model.load_state_dict(save_point['model_state_dict'])
optimizer.load_state_dict(save_point['optimizer_state_dict'])
epoch = save_point['epoch']
train_loss = save_point['train_loss']
model.train()
############################################################################################
for i in range(epoch, num_epochs+1):
train_loss = train_gray(i, data_loader, device, model, criterion, optimizer, i, path)
checkpoint(i, train_loss, model, optimizer, path)
print("end")
Lastly, here is my function to save checkpoints:
def checkpoint(epoch, train_loss, model, optimizer, path):
torch.save({
'epoch': epoch,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'train_loss': train_loss
}, path+"/model_epoch_{}.pt".format(epoch))
print("Epoch saved")
If my problem is that I am not saving my learning rate, how would I do this?
Any help would be greatly appreciated,
Clement
Update: I'm fairly certain that the problem lies in my pretrained model. I am saving the optimiser every epoch but the optimiser only holds information for the trainable layers. I hope to solve this soon and post a more thorough answer when I figure out who to save and load the entire model.

How does one do Inference with Batch Normalization with Tensor Flow?

I was reading the original paper on BN and the stack overflow question on How could I use Batch Normalization in TensorFlow? which provides a very useful piece of code to insert a batch normalization block to a Neural Network but does not provides enough guidance on how to actually use it during training, inference and when evaluating models.
For example, I would like to track the train error during training and test error to make sure I don't overfit. Its clear that the batch normalization block should be off during test, but when evaluating the error on the training set, should the batch normalization block be turned off too? My main questions are:
During inference and error evaluation, should the batch normalization block be turned off regardless of the data set?
Does that mean that the batch normalization block should only be on during the training step then?
To make it very clear, I will provide an extract (of simplified) code I have been using to run batch normalization with Tensor flow according to what is my understanding of what is the right thing to do:
## TRAIN
if phase_train is not None:
#DO BN
feed_dict_train = {x:X_train, y_:Y_train, phase_train: False}
feed_dict_cv = {x:X_cv, y_:Y_cv, phase_train: False}
feed_dict_test = {x:X_test, y_:Y_test, phase_train: False}
else:
#Don't do BN
feed_dict_train = {x:X_train, y_:Y_train}
feed_dict_cv = {x:X_cv, y_:Y_cv}
feed_dict_test = {x:X_test, y_:Y_test}
def get_batch_feed(X, Y, M, phase_train):
mini_batch_indices = np.random.randint(M,size=M)
Xminibatch = X[mini_batch_indices,:] # ( M x D^(0) )
Yminibatch = Y[mini_batch_indices,:] # ( M x D^(L) )
if phase_train is not None:
#DO BN
feed_dict = {x: Xminibatch, y_: Yminibatch, phase_train: True}
else:
#Don't do BN
feed_dict = {x: Xminibatch, y_: Yminibatch}
return feed_dict
with tf.Session() as sess:
sess.run( tf.initialize_all_variables() )
for iter_step in xrange(steps):
feed_dict_batch = get_batch_feed(X_train, Y_train, M, phase_train)
# Collect model statistics
if iter_step%report_error_freq == 0:
train_error = sess.run(fetches=l2_loss, feed_dict=feed_dict_train)
cv_error = sess.run(fetches=l2_loss, feed_dict=feed_dict_cv)
test_error = sess.run(fetches=l2_loss, feed_dict=feed_dict_test)
do_stuff_with_errors(train_error, cv_error, test_error)
# Run Train Step
sess.run(fetches=train_step, feed_dict=feed_dict_batch)
and the code I am using to produce batch normalization blocks is:
def standard_batch_norm(l, x, n_out, phase_train, scope='BN'):
"""
Batch normalization on feedforward maps.
Args:
x: Vector
n_out: integer, depth of input maps
phase_train: boolean tf.Varialbe, true indicates training phase
scope: string, variable scope
Return:
normed: batch-normalized maps
"""
with tf.variable_scope(scope+l):
#beta = tf.Variable(tf.constant(0.0, shape=[n_out], dtype=tf.float64 ), name='beta', trainable=True, dtype=tf.float64 )
#gamma = tf.Variable(tf.constant(1.0, shape=[n_out],dtype=tf.float64 ), name='gamma', trainable=True, dtype=tf.float64 )
init_beta = tf.constant(0.0, shape=[n_out], dtype=tf.float64)
init_gamma = tf.constant(1.0, shape=[n_out],dtype=tf.float64)
beta = tf.get_variable(name='beta'+l, dtype=tf.float64, initializer=init_beta, regularizer=None, trainable=True)
gamma = tf.get_variable(name='gamma'+l, dtype=tf.float64, initializer=init_gamma, regularizer=None, trainable=True)
batch_mean, batch_var = tf.nn.moments(x, [0], name='moments')
ema = tf.train.ExponentialMovingAverage(decay=0.5)
def mean_var_with_update():
ema_apply_op = ema.apply([batch_mean, batch_var])
with tf.control_dependencies([ema_apply_op]):
return tf.identity(batch_mean), tf.identity(batch_var)
mean, var = tf.cond(phase_train, mean_var_with_update, lambda: (ema.average(batch_mean), ema.average(batch_var)))
normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-3)
return normed
I found that there is 'official' batch_norm layer in tensorflow. Try it out:
https://github.com/tensorflow/tensorflow/blob/b826b79718e3e93148c3545e7aa3f90891744cc0/tensorflow/contrib/layers/python/layers/layers.py#L100
Most likely it is not mentioned in docs since it included in some RC or 'beta' version only.
I haven't inspected deep into this matter yet, but as far as I see from documentation you just use binary parameter is_training in this batch_norm layer, and set it to true only for training phase. Try it out.
UPDATE: Below is the code to load data, build a network with one hidden ReLU layer and L2 normalization and introduce batch normalization for both hidden and out layer. This runs fine and trains fine.
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
pickle_file = '/home/maxkhk/Documents/Udacity/DeepLearningCourse/SourceCode/tensorflow/examples/udacity/notMNIST.pickle'
with open(pickle_file, 'rb') as f:
save = pickle.load(f)
train_dataset = save['train_dataset']
train_labels = save['train_labels']
valid_dataset = save['valid_dataset']
valid_labels = save['valid_labels']
test_dataset = save['test_dataset']
test_labels = save['test_labels']
del save # hint to help gc free up memory
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
image_size = 28
num_labels = 10
def reformat(dataset, labels):
dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
# Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
/ predictions.shape[0])
#for NeuralNetwork model code is below
#We will use SGD for training to save our time. Code is from Assignment 2
#beta is the new parameter - controls level of regularization.
#Feel free to play with it - the best one I found is 0.001
#notice, we introduce L2 for both biases and weights of all layers
batch_size = 128
beta = 0.001
#building tensorflow graph
graph = tf.Graph()
with graph.as_default():
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32,
shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
#introduce batchnorm
tf_train_dataset_bn = tf.contrib.layers.batch_norm(tf_train_dataset)
#now let's build our new hidden layer
#that's how many hidden neurons we want
num_hidden_neurons = 1024
#its weights
hidden_weights = tf.Variable(
tf.truncated_normal([image_size * image_size, num_hidden_neurons]))
hidden_biases = tf.Variable(tf.zeros([num_hidden_neurons]))
#now the layer itself. It multiplies data by weights, adds biases
#and takes ReLU over result
hidden_layer = tf.nn.relu(tf.matmul(tf_train_dataset_bn, hidden_weights) + hidden_biases)
#adding the batch normalization layerhi()
hidden_layer_bn = tf.contrib.layers.batch_norm(hidden_layer)
#time to go for output linear layer
#out weights connect hidden neurons to output labels
#biases are added to output labels
out_weights = tf.Variable(
tf.truncated_normal([num_hidden_neurons, num_labels]))
out_biases = tf.Variable(tf.zeros([num_labels]))
#compute output
out_layer = tf.matmul(hidden_layer_bn,out_weights) + out_biases
#our real output is a softmax of prior result
#and we also compute its cross-entropy to get our loss
#Notice - we introduce our L2 here
loss = (tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
out_layer, tf_train_labels) +
beta*tf.nn.l2_loss(hidden_weights) +
beta*tf.nn.l2_loss(hidden_biases) +
beta*tf.nn.l2_loss(out_weights) +
beta*tf.nn.l2_loss(out_biases)))
#now we just minimize this loss to actually train the network
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
#nice, now let's calculate the predictions on each dataset for evaluating the
#performance so far
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(out_layer)
valid_relu = tf.nn.relu( tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)
valid_prediction = tf.nn.softmax( tf.matmul(valid_relu, out_weights) + out_biases)
test_relu = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights) + hidden_biases)
test_prediction = tf.nn.softmax(tf.matmul(test_relu, out_weights) + out_biases)
#now is the actual training on the ANN we built
#we will run it for some number of steps and evaluate the progress after
#every 500 steps
#number of steps we will train our ANN
num_steps = 3001
#actual training
with tf.Session(graph=graph) as session:
tf.initialize_all_variables().run()
print("Initialized")
for step in range(num_steps):
# Pick an offset within the training data, which has been randomized.
# Note: we could use better randomization across epochs.
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
# Generate a minibatch.
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
# Prepare a dictionary telling the session where to feed the minibatch.
# The key of the dictionary is the placeholder node of the graph to be fed,
# and the value is the numpy array to feed to it.
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Resources