How to discard a branch after training a pytorch model - machine-learning

I am trying to implement a FCN in pytorch with the overall structure as below:
The code so far looks like below:
class SNet(nn.Module):
    """Two-branch network with one encoder/decoder pair per input.

    Branch ``b`` is encoded and decoded on its own; the decoder of branch
    ``a`` receives the branch-``a`` encoding concatenated with the
    branch-``b`` output.
    """

    def __init__(self):
        super(SNet, self).__init__()
        # Constructor arguments are elided ("...") in the original post.
        self.enc_a = encoder(...)
        self.dec_a = decoder(...)
        self.enc_b = encoder(...)
        self.dec_b = decoder(...)

    def forward(self, x1, x2):
        """Run both branches and return ``(dec_a output, dec_b output)``."""
        x1 = self.enc_a(x1)
        x2 = self.enc_b(x2)
        x2 = self.dec_b(x2)
        # Join along the last dimension before decoding branch a.
        x1 = self.dec_a(torch.cat((x1, x2), dim=-1))
        return x1, x2
In keras it is relatively easy to do this using the functional API. However, I could not find any concrete example / tutorial to do this in pytorch.
How can I discard the dec_a (decoder part of autoencoder branch) after training?
During joint training, will the loss be the (optionally weighted) sum of the losses from both branches?

You can also define separate modes for your model for training and inference:
class SNet(nn.Module):
    """Two-branch network whose ``dec_a`` branch is only used while training.

    ``nn.Module`` already maintains the boolean ``self.training`` (set to
    ``True`` on construction and toggled by ``model.train()`` /
    ``model.eval()``), so no extra flag has to be assigned in ``__init__``.
    """

    def __init__(self):
        super(SNet, self).__init__()
        # Constructor arguments are elided ("...") in the original post.
        self.enc_a = encoder(...)
        self.dec_a = decoder(...)
        self.enc_b = encoder(...)
        self.dec_b = decoder(...)

    def forward(self, x1, x2):
        """Return ``(x1, x2)`` in training mode and only ``x2`` otherwise."""
        # Branch b is needed in both modes.
        x2 = self.dec_b(self.enc_b(x2))
        if not self.training:
            # Inference: the a-branch does not contribute to the returned
            # value, so it is skipped entirely.
            return x2
        x1 = self.enc_a(x1)
        x1 = self.dec_a(torch.cat((x1, x2), dim=-1))
        return x1, x2
These blocks are examples and may not do exactly what you want, because there is some ambiguity between how the training and inference operations are defined in your block chart and in your code. In any case, you get the idea of how to use some modules only in training mode. You can then set this flag accordingly: nn.Module already maintains self.training, and model.train() / model.eval() toggle it for the module and all of its submodules.

Related

How to set different learning rates in two-stage training - PyTorch

I am doing two-stage training for my model (shown as the following code).
There are two inputs (x1, and x2) here. My strategy is that:
In the 1st stage, net_core is tuned with x1 only. net_stage1 only supplies the output head for this stage; it is not used in the 2nd stage or in the final model.
In the 2nd stage, tuning net_stage2 by combining x2 and the output from net_core based on x1.
Since net_core was already well tuned in the 1st stage, I want to freeze net_core or apply a very small learning rate to it while I am tuning net_stage2.
How can I implement this idea?
Here is the code for the network.
class net_core(nn.Module):
    """Core sub-network (skeleton: the layers are elided in the post)."""

    def __init__(self):
        super().__init__()
        ### code

    def forward(self, x):
        ### code
        return out
class net_stage1(nn.Module):
    """Stage-1 head (skeleton: the layers are elided in the post)."""

    def __init__(self):
        super().__init__()
        ### code

    def forward(self, x):
        ### code
        return out
class net_stage2(nn.Module):
    """Stage-2 head (skeleton: the layers are elided in the post)."""

    def __init__(self):
        super().__init__()
        ### code

    def forward(self, x):
        ### code
        return out
class net_main(nn.Module):
    """Wrapper that routes the input through the core and one stage head.

    Args:
        stage: ``"stage1"`` or ``"stage2"``; selects the path in ``forward``.
    """

    def __init__(self, stage):
        super(net_main, self).__init__()
        self.stage = stage
        self.core = net_core()
        self.stage1 = net_stage1()
        self.stage2 = net_stage2()

    def forward(self, x1, x2=None):
        """Run the path selected by ``self.stage``.

        Raises:
            ValueError: if ``self.stage`` is not a known stage, or if
                stage 2 is selected without ``x2``.
        """
        # x1: N, C, L1
        # x2: N, L2
        if self.stage == "stage1":
            return self.stage1(self.core(x1))
        # The original tested "stage1" twice, so this branch never ran and
        # forward() returned None in stage 2.
        if self.stage == "stage2":
            if x2 is None:
                raise ValueError("x2 is required when stage is 'stage2'")
            bs = x1.size(0)
            # Flatten the core output per sample so it can be joined with x2.
            t = self.core(x1).view(bs, -1)
            x = torch.cat((t, x2), 1)
            return self.stage2(x)
        raise ValueError("unknown stage: {!r}".format(self.stage))
Here is the code for stage1 and stage2 training as an example.
In stage 2, all sub-networks use the same learning rate. How can I freeze net_core or use a different learning rate for it?
#### stage1
# Train the core together with the stage-1 head on (x1, y1).
net = net_main("stage1")
optimizer1 = torch.optim.Adam(params=net.parameters(), lr=0.01, weight_decay=0.001)
scheduler1 = torch.optim.lr_scheduler.StepLR(optimizer1, step_size=1, gamma=0.5)
for epoch in range(max_epoch):
    optimizer1.zero_grad()
    # The network is bound to `net`; the original called an undefined `model`.
    digit = net(x1)
    loss = criterion(digit, y1)
    loss.backward()
    optimizer1.step()
    # One scheduler step per epoch (step_size=1 halves the rate each epoch).
    scheduler1.step()
#### stage2
# Reuse the network trained in stage 1 and only switch its routing flag.
# Constructing a new net_main("stage2") here would create a freshly
# initialised core and throw away the stage-1 training.
net.stage = "stage2"
optimizer2 = torch.optim.Adam(params=net.parameters(), lr=0.0001, weight_decay=0.001)
scheduler2 = torch.optim.lr_scheduler.StepLR(optimizer2, step_size=1, gamma=0.5)
for epoch in range(max_epoch):
    optimizer2.zero_grad()
    # The network is bound to `net`; the original called an undefined `model`.
    digit = net(x1, x2)
    loss = criterion(digit, y2)
    loss.backward()
    optimizer2.step()
    # One scheduler step per epoch (step_size=1 halves the rate each epoch).
    scheduler2.step()

change model in tensorflow-federated but not work

I tried to change the model (just adding a hidden layer) in the Federated Learning for Image Classification tutorial. But the result shows that w1 and b1 don't change and retain their initial value of 0 after multiple iterations. Only w2 and b2 are actually trained. Here is my code:
# Record type bundling the model parameters (w1, w2, b1, b2) with the
# running metric accumulators.
MnistVariables = collections.namedtuple(
    'MnistVariables',
    ['w1', 'w2', 'b1', 'b2', 'num_examples', 'loss_sum', 'accuracy_sum'],
)
def create_mnist_variables():
    """Create the zero-initialised parameters and metric accumulators.

    Returns:
        A ``MnistVariables`` tuple: trainable float32 ``w1`` (784x128),
        ``w2`` (128x10), ``b1`` (128), ``b2`` (10), plus the non-trainable
        scalars ``num_examples``, ``loss_sum`` and ``accuracy_sum``.
    """

    def zeros_parameter(name, shape):
        # The initial value is handed to tf.Variable as a zero-argument
        # callable, exactly as in the original snippet.
        return tf.Variable(
            lambda: tf.zeros(dtype=tf.float32, shape=shape),
            name=name,
            trainable=True)

    def accumulator(name):
        return tf.Variable(0.0, name=name, trainable=False)

    return MnistVariables(
        w1=zeros_parameter('w1', (784, 128)),
        w2=zeros_parameter('w2', (128, 10)),
        # The original wrote (128) and (10); those are plain ints, not tuples.
        b1=zeros_parameter('b1', 128),
        b2=zeros_parameter('b2', 10),
        num_examples=accumulator('num_examples'),
        loss_sum=accumulator('loss_sum'),
        accuracy_sum=accumulator('accuracy_sum'))
def mnist_forward_pass(variables, batch):
    """Run the two-layer model on one batch and update the running metrics.

    Args:
        variables: a ``MnistVariables`` tuple.
        batch: mapping with the inputs under ``'x'`` and labels under ``'y'``.

    Returns:
        ``(loss, predictions)``: the mean cross-entropy of the batch and the
        arg-max class index per example (int32).
    """
    # Hidden layer: affine map followed by ReLU.
    hidden = tf.nn.relu(
        tf.add(tf.matmul(batch['x'], variables.w1), variables.b1))
    # Output layer: affine map followed by softmax.
    y = tf.nn.softmax(
        tf.add(tf.matmul(hidden, variables.w2), variables.b2))
    predictions = tf.cast(tf.argmax(y, 1), tf.int32)

    flat_labels = tf.reshape(batch['y'], [-1])
    # Cross-entropy against one-hot labels, averaged over the batch.
    per_example = tf.reduce_sum(
        tf.one_hot(flat_labels, 10) * tf.log(y), reduction_indices=[1])
    loss = -tf.reduce_mean(per_example)
    hits = tf.cast(tf.equal(predictions, flat_labels), tf.float32)
    accuracy = tf.reduce_mean(hits)

    # Accumulate example-weighted sums so averages can be derived later.
    num_examples = tf.to_float(tf.size(batch['y']))
    tf.assign_add(variables.num_examples, num_examples)
    tf.assign_add(variables.loss_sum, loss * num_examples)
    tf.assign_add(variables.accuracy_sum, accuracy * num_examples)

    return loss, predictions
def get_local_mnist_metrics(variables):
    """Collect the parameters and averaged metrics into an ordered mapping.

    Returns:
        ``collections.OrderedDict`` with keys ``w1``, ``w2``, ``b1``, ``b2``,
        ``num_examples``, ``loss`` and ``accuracy`` (in that order); the last
        two are the accumulated sums divided by ``num_examples``.
    """
    seen = variables.num_examples
    metrics = collections.OrderedDict()
    for field in ('w1', 'w2', 'b1', 'b2'):
        metrics[field] = getattr(variables, field)
    metrics['num_examples'] = seen
    metrics['loss'] = variables.loss_sum / seen
    metrics['accuracy'] = variables.accuracy_sum / seen
    return metrics
class MnistModel(tff.learning.Model):
    """Model wrapper around the MNIST variables.

    Only the members shown in the post are reproduced here.
    """

    def __init__(self):
        self._variables = create_mnist_variables()

    # The decorator had been reduced to the comment "#property"; without it
    # `trainable_variables` is a plain method instead of an attribute.
    @property
    def trainable_variables(self):
        """Both layers' weights and biases."""
        return [self._variables.w1, self._variables.w2,
                self._variables.b1, self._variables.b2]
I also added w1 and b1 to the trainable variables, but it seems that they are not trained during the training process, and I don't know why. Does anyone have successful experience changing the model in this tutorial?
I suspect the ReLU activations with zero initialisations of w1 and b1 are problematic, and this may be a case of "dying ReLU" (see What is the “dying ReLU” problem in neural networks?).
Since w1 and b1 are initialized to zero, I would expect the output to also be 0 after the matrix multiply and addition.
Possible options to try: using a non-zero initializer, use an alternative activation function (or don't have an activation after the first layer).

How to calculate the joint log-likelihood for Bernoulli Naive Bayes

For a classification problem using BernoulliNB, how do I calculate the joint log-likelihood? The joint likelihood is to be calculated by the formula below, where y(d) is the array of actual outputs (not predicted values) and x(d) is the data set of features.
I read this answer and the documentation, but they didn't exactly serve my purpose. Can somebody please
help?
By looking at the code, it looks like there is a hidden undocumented ._joint_log_likelihood(self, X) function in the BernoulliNB which computes the joint log-likelihood.
Its implementation is somewhat consistent with what you ask.
The solution is to count the y(d) of the output.
If the output is True, the y(d) is the [1] in data[idx][1],
else [0] in data[idx][0].
The first block of code calls the _joint_log_likelihood function.
The second block of code is the detail of that function.
The third block of code uses the function on a Bernoulli Naive Bayes dataset.
# Hold out one third of the first data set for testing.
train, test, train_labels, test_labels = train_test_split(
    Xs[0], ys[0], test_size=1./3, random_state=r)

# Fit Bernoulli Naive Bayes with a tiny smoothing constant.
naive = BernoulliNB(alpha=10**-7)
model = naive.fit(train, train_labels)

# Per-sample, per-class joint log-likelihood of the training rows.
joint_log_train = model._joint_log_likelihood(train)

# Each training row with its label appended as the last element.
l = [np.append(features, target)
     for features, target in zip(train, train_labels)]
def count(data, label):
    """Sum, over all rows, the entry of ``data`` selected by the row's label.

    For row ``i`` the value ``data[i][1]`` is taken when ``label[i] == True``
    and ``data[i][0]`` otherwise.  Returns ``0`` for empty ``label``.
    """
    return sum(
        data[row][1] if flag == True else data[row][0]
        for row, flag in enumerate(label)
    )
# Write your code below this line.
# For every data set and every smoothing value, fit a model and store the
# summed joint log-likelihood of the true labels on both splits.
for i, (x, y) in enumerate(zip(Xs, ys)):
    train, test, train_labels, test_labels = train_test_split(
        x, y, test_size=1./3, random_state=r)
    for j, a in enumerate(alphas):
        naive = BernoulliNB(alpha=a)
        model = naive.fit(train, train_labels)

        joint_log_train = model._joint_log_likelihood(train)
        train_jil[i][j] = count(joint_log_train, train_labels)

        joint_log_test = model._joint_log_likelihood(test)
        test_jil[i][j] = count(joint_log_test, test_labels)

For deep learning, With activation relu the output becomes NAN during training while is normal with tanh

The neural network I trained is the critic network for deep reinforcement learning. The problem is when one of the layer's activation is set to be relu or elu, the output would be nan after some training step, while the output is normal if the activation is tanh. And the code is as follows(based on tensorflow):
# Critic: three tanh weight-normalised dense layers, one linear 32-unit
# layer, then a linear layer with a single output unit.
with tf.variable_scope('critic'):
    self.batch_size = tf.shape(self.tfs)[0]
    l_out_x = denseWN(
        x=self.tfs, name='l3', num_units=self.cell_size,
        nonlinearity=tf.nn.tanh, trainable=True,
        shape=[det*step*2, self.cell_size])
    l_out_x1 = denseWN(
        x=l_out_x, name='l3_1', num_units=32,
        trainable=True, nonlinearity=tf.nn.tanh,
        shape=[self.cell_size, 32])
    l_out_x2 = denseWN(
        x=l_out_x1, name='l3_2', num_units=32,
        trainable=True, nonlinearity=tf.nn.tanh,
        shape=[32, 32])
    # No nonlinearity is passed for the last two layers.
    l_out_x3 = denseWN(
        x=l_out_x2, name='l3_3', num_units=32,
        trainable=True, shape=[32, 32])
    self.v = denseWN(
        x=l_out_x3, name='l4', num_units=1,
        trainable=True, shape=[32, 1])
Here is the code for basic layer construction:
def get_var_maybe_avg(var_name, ema, trainable, shape):
    """Fetch one weight-norm variable, optionally via its moving average.

    Args:
        var_name: ``'V'`` (weight matrix, Xavier-initialised, full ``shape``),
            ``'g'`` (per-output scale, initialised to 1.0) or ``'b'``
            (per-output bias, initialised to 0.1).  ``g`` and ``b`` use the
            shape ``[shape[-1]]``.
        ema: object providing ``average(variable)``, or ``None`` to return
            the raw variable.
        trainable: forwarded to ``tf.get_variable``.
        shape: shape of the weight matrix ``V``.

    Raises:
        ValueError: if ``var_name`` is not one of ``'V'``, ``'g'``, ``'b'``
            (the original fell through and failed with an unbound local).
    """
    if var_name == 'V':
        initializer = tf.contrib.layers.xavier_initializer()
        var_shape = shape
    elif var_name == 'g':
        initializer = tf.constant_initializer(1.0)
        var_shape = [shape[-1]]
    elif var_name == 'b':
        initializer = tf.constant_initializer(0.1)
        var_shape = [shape[-1]]
    else:
        raise ValueError(
            "var_name must be 'V', 'g' or 'b', got {!r}".format(var_name))

    v = tf.get_variable(name=var_name, initializer=initializer,
                        trainable=trainable, shape=var_shape)
    if ema is not None:
        v = ema.average(v)
    return v
def get_vars_maybe_avg(var_names, ema, trainable, shape):
    """Return ``get_var_maybe_avg`` applied to each name, in order.

    Args:
        var_names: iterable of variable names.
        ema, trainable, shape: forwarded unchanged for every name.

    Returns:
        A list with one entry per name (empty when ``var_names`` is empty).
    """
    # A comprehension avoids shadowing the builtin `vars`.
    return [
        get_var_maybe_avg(name, ema, trainable=trainable, shape=shape)
        for name in var_names
    ]
def denseWN(x, name, num_units, trainable, shape, nonlinearity=None, ema=None, **kwargs):
    """Weight-normalised dense layer built inside variable scope ``name``.

    Computes ``(g / ||V||) * (x @ V) + b`` with the norm of ``V`` taken over
    axis 0, then applies ``nonlinearity`` when one is given.  Extra keyword
    arguments are accepted and ignored.
    """
    with tf.variable_scope(name):
        V, g, b = get_vars_maybe_avg(
            ['V', 'g', 'b'], ema, trainable=trainable, shape=shape)

        projected = tf.matmul(x, V)
        # One scale factor per output unit: g divided by the column norm of V.
        column_norm = tf.sqrt(tf.reduce_sum(tf.square(V), [0]))
        scale_row = tf.reshape(g / column_norm, [1, num_units])
        bias_row = tf.reshape(b, [1, num_units])
        out = scale_row * projected + bias_row

        if nonlinearity is None:
            return out
        return nonlinearity(out)
Here is the code to train the network:
# Regression target fed at run time; the advantage is target minus estimate.
self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
self.advantage = self.tfdc_r - self.v

l1_regularizer = tf.contrib.layers.l1_regularizer(scale=0.005, scope=None)
self.weights = tf.trainable_variables()
# NOTE(review): this penalty is computed but not added to self.closs in the
# snippet shown -- confirm whether that is intended.
regularization_penalty_critic = tf.contrib.layers.apply_regularization(
    l1_regularizer, self.weights)

# Critic loss: mean squared advantage.
self.closs = tf.reduce_mean(tf.square(self.advantage))
self.optimizer = tf.train.RMSPropOptimizer(
    learning_rate=0.0001, decay=0.99, momentum=0.0, epsilon=1e-6)

# Clip every available gradient to norm 5; variables with no gradient are
# dropped.
clipped = []
for grad, var in self.optimizer.compute_gradients(self.closs):
    if grad is not None:
        clipped.append([tf.clip_by_norm(grad, 5), var])
self.grads_and_vars = clipped

self.ctrain_op = self.optimizer.apply_gradients(
    self.grads_and_vars,
    global_step=tf.contrib.framework.get_global_step())
Looks like you're facing the problem of exploding gradients with the ReLU activation function (that's what NaN means -- very big activations). There are several techniques to deal with this issue, e.g. batch normalization (which changes the network architecture) or a careful variable initialization (that's what I'd try first).
You are using Xavier initialization for the V variables in different layers, which indeed works fine for sigmoid-shaped activations such as the logistic sigmoid (see the paper by Xavier Glorot and Yoshua Bengio) or tanh.
The preferred initialization strategy for the ReLU activation function (and its variants, including ELU) is He initialization. In tensorflow it's implemented via tf.variance_scaling_initializer; note that its default scale is 1.0, so pass scale=2.0 to get He initialization:
# He initialization is variance scaling with scale=2.0 (fan-in mode); the
# default scale of tf.variance_scaling_initializer is 1.0.
initializer = tf.variance_scaling_initializer(scale=2.0)
v = tf.get_variable(name=var_name, initializer=initializer,
                    trainable=trainable, shape=shape)
You might also want to try smaller values for b and g variables, but it's hard to say the exact value just by looking at your model. If nothing helps, consider adding batch-norm layers to your model to control activation distribution.

PyTorch, simple char level RNN, can't overfit one example

I'm new to the PyTorch framework (coming from Theano and Tensorflow mainly):
I've followed the introduction tutorial and read the Classifying Names with a Character-Level RNN one.
I now try to adapt it to a char level LSTM model in order to gain some practical experience with the framework.
Basically I feed in the model sequences of char indices and give as target to the model the same sequence but shifted by one in the future.
However I can't overfit a simple training example and I don't see what I did wrong.
If someone can spot my mistake it would be very helpful.
Here is my code:
class LSTMTxtGen(nn.Module):
    """Character-level LSTM that emits vocabulary scores for every time step.

    Args:
        hidden_dim: size of the LSTM hidden state.
        n_layer: number of stacked LSTM layers.
        vocab_size: input feature size and number of output scores.
    """

    def __init__(self, hidden_dim, n_layer, vocab_size):
        super(LSTMTxtGen, self).__init__()
        self.n_layer = n_layer
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.lstm = nn.LSTM(vocab_size, hidden_dim, n_layer, batch_first=True)
        # The linear layer that maps from hidden state space to vocabulary
        # space.  It has to be created here: a layer built inside forward()
        # gets fresh random weights on every call and is never part of
        # model.parameters(), so the optimizer cannot train it.
        self.hidden2out = nn.Linear(hidden_dim, vocab_size)

    def init_hidden(self, batch_size):
        """Return zero ``(h_0, c_0)`` states for a batch.

        The axes semantics are (num_layers, minibatch_size, hidden_dim).
        """
        state_shape = (self.n_layer, batch_size, self.hidden_dim)
        return (autograd.Variable(torch.zeros(*state_shape)),
                autograd.Variable(torch.zeros(*state_shape)))

    def forward(self, seqs):
        """Map batch-first sequences to scores of shape (batch * seq_len, vocab_size).

        The hidden state is reset to zeros on every call.
        """
        self.hidden = self.init_hidden(seqs.size(0))
        lstm_out, self.hidden = self.lstm(seqs, self.hidden)
        # Merge the batch and time axes so each row is one time step.
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        return self.hidden2out(lstm_out)
# Build the network and print its layer summary.
model = LSTMTxtGen(
    hidden_dim=50,
    n_layer=3,
    vocab_size=44,
)
# The instance is bound to `model`; `Model` (capitalised) was undefined.
print(model)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adamax(model.parameters())

G = Data.batch_generator(5, 100)
# The generator yields its metadata first, then (X, Y) batches.
batch_per_epoch, to_idx, to_char = next(G)

# Overfitting check: draw ONE batch and reuse it for every step.  Convert it
# once, outside the loops -- the original re-wrapped X and Y on each
# iteration, so from the second step on torch.from_numpy() was handed an
# already-converted tensor instead of a numpy array.
X, Y = next(G)
X = autograd.Variable(torch.from_numpy(X))
Y = autograd.Variable(torch.from_numpy(Y))

for epoch in range(10):
    losses = []
    for batch_count in range(batch_per_epoch):
        model.zero_grad()
        preds = model(X)
        loss = criterion(preds.view(-1, model.vocab_size), Y.view(-1))
        loss.backward()
        optimizer.step()
        losses.append(loss)
        if batch_count % 20 == 0:
            print('Loss: ', losses[-1])

Resources