Neural Network Trains Fine and Test Predictions are Horrible Bordering on Ridiculous - machine-learning

I am having a lot of trouble with a neural network model built with the R neuralnet() function. When I train the network on all of the data, the predictions are very accurate, as expected. However, when I split the data into training and test sets, the test predictions are terrible. I cannot figure out what I am doing wrong. I would appreciate any advice or help troubleshooting, as I don't think I'll be able to figure this out on my own. Thanks in advance.
I have included the R code, some plots, and an example of the data below; the full data set is 3600 observations.
Best Regards-Pat
UPDATE 05/12/18: BASED ON FEEDBACK THAT THIS LOOKS LIKE OVERFITTING, I TRIED STOPPING THE TRAINING EARLIER AND FOUND THAT THE MSE OF THE TEST PREDICTION NEVER GETS VERY LOW AND IS LOWEST APPROACHING 0 TRAINING EPOCHS AND RISES FROM THERE (PLOT INCLUDED AND CODE APPENDED)
###########
#ANN Models
###########
#Load libraries
library(plyr)
library(ggplot2)
library(gridExtra)
library(neuralnet)
#Retain only numerically coded data from data1 in (data2) for ANN fitting
data2 = data1[,c(3:7)]
#Calculate Min and Max for Scaling
max_data = apply(data2,2,max)
min_data = apply(data2,2,min)
#Scale data 0-1
data2_scaled = scale(data2,center=min_data,scale=max_data-min_data)
#Check data structure
data2_scaled
#Fit neural net model
model_nn1 = neuralnet(formula=time~instructions+nodes+machine_num+app_num,data=data2_scaled,hidden=c(8,8),stepmax=1000000,threshold=0.01)
#Calculate Min and Max Response for rescaling
max_time = max(data2$time)
min_time = min(data2$time)
#Rescale neural net response predictions
pred_nn1 = model_nn1$net.result[[1]][,1]*(max_time-min_time)+min_time
#Compare model predictions to actual values
a03 = cbind.data.frame(data1$time,pred_nn1,data1$machine,data1$app)
colnames(a03) = c("actual","prediction","machine","app")
attach(a03)
p01 = ggplot(a03,aes(x=actual,y=prediction))+
geom_point(aes(color=machine),size=1)+
scale_y_continuous("Predicted Execution Time [s]",breaks=seq(0,1000,100),limits=c(0,1000))+
scale_x_continuous("Actual Execution Time [s]",breaks=seq(0,1000,100),limits=c(0,1000))+
ggtitle("Neural Net Fit (ALL DATA):\nActual vs. Predicted Execution Time")+
geom_abline(intercept=0,slope=1)+
theme_light()
p02 = ggplot(a03,aes(x=actual,y=prediction))+
geom_point(aes(color=app),size=1)+
scale_y_continuous("Predicted Execution Time [s]",breaks=seq(0,1000,100),limits=c(0,1000))+
scale_x_continuous("Actual Execution Time [s]",breaks=seq(0,1000,100),limits=c(0,1000))+
ggtitle("Neural Net Fit (ALL DATA):\nActual vs. Predicted Execution Time")+
geom_abline(intercept=0,slope=1)+
theme_light()
grid.arrange(p01,p02,nrow=1)
#Visualize ANN
plot(model_nn1)
#Epochs taken to train "steps"
model_nn1$result.matrix[3,]
#########################
#Testing and Training ANN
#########################
#Split the data into a test and training set
index = sample(1:nrow(data2_scaled),round(0.80*nrow(data2_scaled)))
train_data = as.data.frame(data2_scaled[index,])
test_data = as.data.frame(data2_scaled[-index,])
model_nn2 = neuralnet(formula=time~instructions+nodes+machine_num+app_num,data=train_data,hidden=c(3,2),stepmax=1000000,threshold=0.01)
pred_nn2_scaled = compute(model_nn2,test_data[,c(1,2,4,5)])
pred_nn2 = pred_nn2_scaled$net.result*(max_time-min_time)+min_time
test_data_time = test_data$time*(max_time-min_time)+min_time
a04 = cbind.data.frame(test_data_time,pred_nn2,data1[-index,2],data1[-index,1])
colnames(a04) = c("actual","prediction","machine","app")
attach(a04)
p01 = ggplot(a04,aes(x=actual,y=prediction))+
geom_point(aes(color=machine),size=1)+
scale_y_continuous("Predicted Execution Time [s]",breaks=seq(0,1000,100),limits=c(0,1000))+
scale_x_continuous("Actual Execution Time [s]",breaks=seq(0,1000,100),limits=c(0,1000))+
ggtitle("Neural Net Fit (TEST DATA):\nActual vs. Predicted Execution Time")+
geom_abline(intercept=0,slope=1)+
theme_light()
p02 = ggplot(a04,aes(x=actual,y=prediction))+
geom_point(aes(color=app),size=1)+
scale_y_continuous("Predicted Execution Time [s]",breaks=seq(0,1000,100),limits=c(0,1000))+
scale_x_continuous("Actual Execution Time [s]",breaks=seq(0,1000,100),limits=c(0,1000))+
ggtitle("Neural Net Fit (TEST DATA):\nActual vs. Predicted Execution Time")+
geom_abline(intercept=0,slope=1)+
theme_light()
grid.arrange(p01,p02,nrow=1)
#EARLY STOPPING TEST
i = 1000
summary_data = data.frame(matrix(rep(0,4*i),ncol=4))
colnames(summary_data) = c("threshold","epochs","train_mse","test_mse")
for (j in 1:i){
a = runif(1,min=0.01,max=10)
#Train the model
model_nn2 = neuralnet(formula=time~instructions+nodes+machine_num+app_num,data=train_data,hidden=3,stepmax=1000000,threshold=a)
#Calculate Min and Max Response for rescaling
max_time = max(data2$time)
min_time = min(data2$time)
#Predict test data from trained nn
pred_nn2_scaled = compute(model_nn2,test_data[,c(1,2,4,5)])
#Rescale test prediction
pred_test_data_time = pred_nn2_scaled$net.result*(max_time-min_time)+min_time
#Rescale test actual
test_data_time = test_data$time*(max_time-min_time)+min_time
#Rescale train prediction
pred_train_data_time = model_nn2$net.result[[1]][,1]*(max_time-min_time)+min_time
#Rescale train actual
train_data_time = train_data$time*(max_time-min_time)+min_time
#Calculate mse
test_mse = mean((pred_test_data_time-test_data_time)^2)
train_mse = mean((pred_train_data_time-train_data_time)^2)
#Summarize
summary_data[j,1] = a
summary_data[j,2] = model_nn2$result.matrix[3,]
summary_data[j,3] = round(train_mse,3)
summary_data[j,4] = round(test_mse,3)
print(summary_data[j,])
}
plot(summary_data$epochs,summary_data$test_mse,pch=19,xlim=c(0,2000),ylim=c(0,300000),cex=0.5,xlab="Training Steps",ylab="MSE",main="Early Stopping Test: Comparing MSE : TEST=BLACK TRAIN=RED")
points(summary_data$epochs,summary_data$train_mse,pch=19,col=2,cex=0.5)

I would guess that it is overfitting. The network is learning to reproduce the data like a dictionary instead of learning the underlying function in the data. There are various things which can cause this and ways to address them.
Things which cause overfitting are:
The network could be training for too long.
The network could have far more weights than training examples.
Ways to reduce overfitting are:
Create a validation dataset and stop training the network as soon as the validation set's loss starts increasing. This is a necessity (a sketch of this appears at the end of this answer).
Reducing the network size (fewer weights).
Using a regularization technique like weight decay or dropout.
Also, it may be possible that the problem is too difficult for a neural network to solve based on the data it is given. Reproducing training data does not prove that the network can solve the problem; it only proves that the network can memorize things like a dictionary.
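As a rough illustration of the validation-based early stopping mentioned above, here is a minimal sketch of the idea as a generic PyTorch training loop (the model, data loaders, loss function, and patience value are placeholders; the same loop structure applies to any framework, including the R neuralnet workflow in the question):
import copy
import torch

def train_with_early_stopping(model, train_loader, val_loader, loss_fn,
                              optimizer, max_epochs=500, patience=10):
    """Stop training once validation loss has not improved for `patience` epochs."""
    best_val = float("inf")
    best_state = None
    epochs_without_improvement = 0
    for epoch in range(max_epochs):
        model.train()
        for xb, yb in train_loader:
            optimizer.zero_grad()
            loss_fn(model(xb), yb).backward()
            optimizer.step()
        model.eval()
        with torch.no_grad():  # evaluate on the held-out validation set
            val_loss = sum(loss_fn(model(xb), yb).item() for xb, yb in val_loader)
        if val_loss < best_val:
            best_val = val_loss
            best_state = copy.deepcopy(model.state_dict())
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                break  # validation loss stopped improving: stop early
    if best_state is not None:
        model.load_state_dict(best_state)  # keep the best weights, not the last ones
    return model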

Related

What is the standard way to train a PyTorch script until convergence?

What is the standard way to detect whether a model has converged? I was going to record 5 losses, each with a 95% confidence interval, and if they all agreed then I'd halt the script. I assume training until convergence must already be implemented somewhere in PyTorch or PyTorch Lightning. I don't need a perfect solution, just the standard way to do this automatically - i.e. halt when converged.
My solution is easy to implement. Create a criterion and change the reduction to 'none'; it will then output a tensor of size [B]. Every time you log, record that loss and its 95% confidence interval (or the std if you prefer, but that is much less accurate). Then, each time you add a new loss with its confidence interval, make sure the window stays at size 5 (or 10) and that the 5 losses are within a 95% CI of each other. If that is true, halt (a sketch of this check is given after the criterion below).
You can compute the CI with this:
from typing import Tuple

import scipy.stats
from torch import Tensor

def torch_compute_confidence_interval(data: Tensor,
                                      confidence: float = 0.95
                                      ) -> Tuple[Tensor, Tensor]:
    """
    Computes the mean and the confidence-interval half-width for a given sample of a data set.
    """
    n = len(data)
    mean: Tensor = data.mean()
    # se: Tensor = scipy.stats.sem(data)  # alternative: compute standard error with scipy
    # std, mean = torch.std_mean(data, unbiased=True)  # alternative: torch.std_mean
    se: Tensor = data.std(unbiased=True) / (n ** 0.5)  # standard error of the mean
    t_p: float = float(scipy.stats.t.ppf((1 + confidence) / 2., n - 1))  # t critical value
    ci = t_p * se
    return mean, ci
and you can create the criterion as follows:
loss: nn.Module = nn.CrossEntropyLoss(reduction='none')
so the train loss is now of size [B].
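A minimal sketch of the halting check described above might then look like this (the window size of 5 and the "all means inside each other's CI" test are the heuristic from this post, not a standard PyTorch API; torch_compute_confidence_interval is the function defined above):
from collections import deque

window = deque(maxlen=5)  # rolling window of the last 5 (mean, ci) pairs

def converged(window) -> bool:
    """Halt when every recorded mean lies inside every other recorded confidence interval."""
    if len(window) < window.maxlen:
        return False
    return all(abs(m_i - m_j) <= ci_j
               for m_i, _ in window
               for m_j, ci_j in window)

# inside the training loop, after computing the per-sample loss of size [B]:
#   mean, ci = torch_compute_confidence_interval(batch_loss)
#   window.append((mean.item(), ci.item()))
#   if converged(window):
#       break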
Note that I know how to train with a fixed number of epochs, so I am not really looking for that - just the halting criterion for when to stop once the model looks converged, i.e. what a person would do by looking at the learning curve, but done automatically.
ref:
https://forums.pytorchlightning.ai/t/what-is-the-standard-way-to-halt-a-script-when-it-has-converged/1415
Set an EarlyStopping (https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.EarlyStopping.html#pytorch_lightning.callbacks.EarlyStopping) callback in your trainer by
callbacks = [
    EarlyStopping(
        monitor="val_f1_score",
        min_delta=0.01,
        patience=10,  # NOTE: counted in validation epochs, not training epochs
        verbose=False,
        mode="min",
    ),
]
trainer = pl.Trainer(callbacks=callbacks)
This will monitor changes in val_f1_score during training (note that you have to log this value with self.log("val_f1_score", val_f1) in your pl.LightningModule). It will stop training once the monitored quantity has failed to improve by at least min_delta for more than the number of validation epochs specified by patience.
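For completeness, a minimal sketch of where that logging call would live (compute_f1 is a hypothetical helper; the only part EarlyStopping needs is the self.log("val_f1_score", ...) call):
import torch
import pytorch_lightning as pl

class MyModel(pl.LightningModule):
    # ... __init__, forward, training_step, configure_optimizers omitted ...

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        val_loss = torch.nn.functional.cross_entropy(logits, y)
        val_f1 = self.compute_f1(logits, y)  # hypothetical helper returning a scalar tensor
        self.log("val_loss", val_loss)
        self.log("val_f1_score", val_f1)  # the quantity monitored by EarlyStopping above
        return val_loss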

Non-linear multivariate time-series response prediction using RNN

I am trying to predict the hygrothermal response of a wall, given the interior and exterior climate. Based on literature research, I believe this should be possible with RNN but I have not been able to get good accuracy.
The dataset has 12 input features (time-series of exterior and interior climate data) and 10 output features (time-series of hygrothermal response), both containing hourly values for 10 years. This data was created with hygrothermal simulation software, there is no missing data.
[Figures omitted: plots of the dataset features and the dataset targets.]
Unlike most time-series prediction problems, I want to predict the response for the full length of the input features time-series at each time-step, rather than the subsequent values of a time-series (eg financial time-series prediction). I have not been able to find similar prediction problems (in similar or other fields), so if you know of one, references are very welcome.
I think this should be possible with RNN, so I am currently using LSTM from Keras. Before training, I preprocess my data the following way:
Discard first year of data, as the first time steps of the hygrothermal response of the wall is influenced by the initial temperature and relative humidity.
Split into training and testing set. Training set contains the first 8 years of data, the test set contains the remaining 2 years.
Normalise the training set (zero mean, unit variance) using StandardScaler from Sklearn. Normalise the test set analogously using the mean and variance from the training set (a rough sketch of this step is shown below).
This results in: X_train.shape = (1, 61320, 12), y_train.shape = (1, 61320, 10), X_test.shape = (1, 17520, 12), y_test.shape = (1, 17520, 10)
As these are long time-series, I use stateful LSTM and cut the time-series as explained here, using the stateful_cut() function. I only have 1 sample, so batch_size is 1. For T_after_cut I have tried 24 and 120 (24*5); 24 appears to give better results. This results in X_train.shape = (2555, 24, 12), y_train.shape = (2555, 24, 10), X_test.shape = (730, 24, 12), y_test.shape = (730, 24, 10).
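For concreteness, the scaling and reshaping described above might be implemented roughly like this (X_raw and y_raw are placeholder arrays holding the 10 years of hourly data; the split index is chosen only to match the shapes reported above):
import numpy as np
from sklearn.preprocessing import StandardScaler

# X_raw: (87600, 12) and y_raw: (87600, 10) -- 10 years of hourly values (placeholders)
X, y = X_raw[8760:], y_raw[8760:]             # discard the first year (8760 hours)
n_train = 61320                               # chosen to match the shapes reported above
X_train_2d, X_test_2d = X[:n_train], X[n_train:]
y_train_2d, y_test_2d = y[:n_train], y[n_train:]

scaler = StandardScaler().fit(X_train_2d)     # fit on the training set only
X_train_2d = scaler.transform(X_train_2d)
X_test_2d = scaler.transform(X_test_2d)       # reuse training mean/variance for the test set

# add the single-sample dimension used before the stateful cut
X_train, y_train = X_train_2d[np.newaxis, ...], y_train_2d[np.newaxis, ...]  # (1, 61320, 12/10)
X_test, y_test = X_test_2d[np.newaxis, ...], y_test_2d[np.newaxis, ...]      # (1, 17520, 12/10)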
Next, I build and train the LSTM model as follows:
from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed, Dense
from keras.optimizers import Adam

model = Sequential()
model.add(LSTM(128,
               batch_input_shape=(batch_size, T_after_cut, features),
               return_sequences=True,
               stateful=True,
               ))
model.add(TimeDistributed(Dense(targets)))
model.compile(loss='mean_squared_error', optimizer=Adam())
model.fit(X_train, y_train, epochs=100, batch_size=batch_size, verbose=2, shuffle=False)
Unfortunately, I don't get accurate prediction results; not even for the training set, thus the model has high bias.
[Figure omitted: prediction results of the LSTM model for all targets.]
How can I improve my model? I have already tried the following:
Not discarding the first year of the dataset -> no significant difference
Differencing the input feature time-series (subtracting the previous value from the current value) -> slightly worse results
Up to four stacked LSTM layers, all with the same hyperparameters -> no significant difference in results but longer training time
Dropout layer after LSTM layer (though this is usually used to reduce variance and my model has high bias) -> slightly better results, but difference might not be statistically significant
Am I doing something wrong with the stateful LSTM? Do I need to try different RNN models? Should I preprocess the data differently?
Furthermore, training is very slow: about 4 hours for the model above. Hence I am reluctant to do an extensive hyperparameter gridsearch...
In the end, I managed to solve this the following way:
Using more samples to train instead of only 1 (I used 18 samples to train and 6 to test)
Keep the first year of data, as the output time-series for all samples have the same 'starting point' and the model needs this information to learn
Standardise both input and output features (zero mean, unit variance). I found this improved prediction accuracy and training speed
Use stateful LSTM as described here, but add reset states after epoch (see below for code). I used batch_size = 6 and T_after_cut = 1460. If T_after_cut is longer, training is slower; if T_after_cut is shorter, accuracy decreases slightly. If more samples are available, I think using a larger batch_size will be faster.
Use CuDNNLSTM instead of LSTM; this sped up training by about 4x!
I found that more units resulted in higher accuracy and faster convergence (shorter training time). I also found that the GRU is as accurate as the LSTM, though it converged faster for the same number of units.
Monitor validation loss during training and use early stopping
The LSTM model is built and trained as follows:
from keras.callbacks import Callback, EarlyStopping
from keras.models import Sequential
from keras import layers
from keras.optimizers import RMSprop

def define_reset_states_batch(nb_cuts):
    class ResetStatesCallback(Callback):
        def __init__(self):
            self.counter = 0

        def on_batch_begin(self, batch, logs={}):
            # reset states when nb_cuts batches are completed
            if self.counter % nb_cuts == 0:
                self.model.reset_states()
            self.counter += 1

        def on_epoch_end(self, epoch, logs={}):
            # reset states after each epoch
            self.model.reset_states()
    return ResetStatesCallback

model = Sequential()
model.add(layers.CuDNNLSTM(256, batch_input_shape=(batch_size, T_after_cut, features),
                           return_sequences=True,
                           stateful=True))
model.add(layers.TimeDistributed(layers.Dense(targets, activation='linear')))
optimizer = RMSprop(lr=0.002)
model.compile(loss='mean_squared_error', optimizer=optimizer)
earlyStopping = EarlyStopping(monitor='val_loss', min_delta=0.005, patience=15, verbose=1, mode='auto')
ResetStatesCallback = define_reset_states_batch(nb_cuts)
model.fit(X_dev, y_dev, epochs=n_epochs, batch_size=n_batch, verbose=1, shuffle=False,
          validation_data=(X_eval, y_eval), callbacks=[ResetStatesCallback(), earlyStopping])
This gave me very satisfying accuracy (R2 over 0.98):
This figure shows the temperature (left) and relative humidity (right) in the wall over 2 years (data not used in training), prediction in red and true output in black. The residuals show that the error is very small and that the LSTM learns to capture the long-term dependencies to predict the relative humidity.

How to find if a data set can train a neural network?

I'm a newbie to machine learning, and this is one of the first real-world ML tasks I have been challenged with.
Some experimental data contains 512 independent boolean features and a boolean result.
There are about 1e6 real experiment records in the provided data set.
In the classic XOR example, all 4 out of 4 possible input states are required to train the NN. In my case, the 1e6 records cover only about 2^20 / 2^512 = 2^-492 of the possible input space, which is essentially zero.
I have no more information about the data nature, just these (512 + 1) * 1e6 bits.
I tried a NN with 1 hidden layer on the available data. The output of the trained NN, even on samples from the training set, is always close to 0; not a single output is close to 1. I played with the weight initialization and the gradient descent learning rate.
My code uses TensorFlow 1.3 and Python 3. Model excerpt:
with tf.name_scope("Layer1"):
#W1 = tf.Variable(tf.random_uniform([512, innerN], minval=-2/512, maxval=2/512), name="Weights_1")
W1 = tf.Variable(tf.zeros([512, innerN]), name="Weights_1")
b1 = tf.Variable(tf.zeros([1]), name="Bias_1")
Out1 = tf.sigmoid( tf.matmul(x, W1) + b1)
with tf.name_scope("Layer2"):
W2 = tf.Variable(tf.random_uniform([innerN, 1], minval=-2/512, maxval=2/512), name="Weights_2")
#W2 = tf.Variable(tf.zeros([innerN, 1]), name="Weights_2")
b2 = tf.Variable(tf.zeros([1]), name="Bias_2")
y = tf.nn.sigmoid( tf.matmul(Out1, W2) + b2)
with tf.name_scope("Training"):
y_ = tf.placeholder(tf.float32, [None,1])
cross_entropy = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(
labels = y_, logits = y)
)
train_step = tf.train.GradientDescentOptimizer(0.005).minimize(cross_entropy)
with tf.name_scope("Testing"):
# Test trained model
correct_prediction = tf.equal( tf.round(y), tf.round(y_))
# ...
# Train
for step in range(500):
batch_xs, batch_ys = Datasets.train.next_batch(300, shuffle=False)
_, my_y, summary = sess.run([train_step, y, merged_summaries],
feed_dict={x: batch_xs, y_: batch_ys})
I suspect two cases:
my fault – bad NN implementation, wrong architecture;
bad data. Compared to the XOR example, incomplete training data would result in a failing NN. Still, the training examples fed to the trained NN should give the right predictions, shouldn't they?
How can I evaluate whether it is possible at all to train a neural network (a 2-layer perceptron) on the provided data to forecast the result? An example of an acceptable data set would be the XOR case, as opposed to pure random noise.
There are only ad hoc ways to know if it is possible to learn a function with a differentiable network from a dataset. That said, these ad hoc ways do usually work. For example, the network should be able to overfit the training set without any regularisation.
A common technique to gauge this is to only fit the network on a subset of the full dataset. Check that the network can overfit to that, then increase the size of the subset, and increase the size of the network as well. Unfortunately, deciding whether to add extra layers or add more units in a hidden layer is an arbitrary decision you'll have to make.
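As a rough sketch, that sanity check could be wired around your existing code like this (train_fn and eval_fn are hypothetical wrappers around the training loop and accuracy op you already have):
def can_overfit_subset(train_fn, eval_fn, X, Y, sizes=(100, 1000, 10000)):
    """Sanity check: a network that cannot memorise a small subset without
    regularisation is unlikely to learn anything from the full data set."""
    for k in sizes:
        X_sub, Y_sub = X[:k], Y[:k]
        train_fn(X_sub, Y_sub)           # train to convergence on the subset only
        acc = eval_fn(X_sub, Y_sub)      # accuracy on the SAME subset (training accuracy)
        print("subset size %d: train accuracy %.3f" % (k, acc))
        if acc < 0.99:
            return False                 # cannot even memorise k examples
    return True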
However, looking at your code, there are a few things that could be going wrong here:
Are your outputs balanced? By that I mean, do you have the same number of 1s as 0s in the dataset targets?
Your initialisation in the first layer is all zeros; the gradient through it will be zero, so it can't learn anything (although you have a real initialisation above it, commented out).
Sigmoid nonlinearities are more difficult to optimise than simpler nonlinearities, such as ReLUs.
I'd recommend using the built-in definitions for layers in Tensorflow to not worry about initialisation, and switching to ReLUs in any hidden layers (you need sigmoid at the output for your boolean target).
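A minimal sketch of that suggestion in TF 1.x might look like the following (x, y_, and innerN are from the question; tf.layers.dense handles the weight initialisation, the hidden layer uses ReLU, and the single boolean output is trained with sigmoid cross-entropy on the logits, which matches a boolean target):
# hidden layer with ReLU; tf.layers.dense provides a sensible default initialisation
hidden = tf.layers.dense(x, innerN, activation=tf.nn.relu, name="Layer1")
# single output unit, kept as logits (no activation here)
logits = tf.layers.dense(hidden, 1, activation=None, name="Layer2")

# sigmoid cross-entropy for a single boolean target
cross_entropy = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=logits))
train_step = tf.train.GradientDescentOptimizer(0.005).minimize(cross_entropy)

# prediction and accuracy
y = tf.nn.sigmoid(logits)
correct_prediction = tf.equal(tf.round(y), tf.round(y_))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))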
Finally, deep learning isn't actually very good at most "bag of features" machine learning problems because they lack structure. For example, the order of the features doesn't matter. Other methods often work better, but if you really want to use deep learning then you could look at this recent paper, showing improved performance by just using a very specific nonlinearity and weight initialisation (change 4 lines in your code above).

Tensorflow low train/test accuracy

I restored a pre-trained model in Tensorflow 1.2 to do some testing work. I assumed the model was well-trained since the loss decreased to a very low value (0.0001). However, with either the testing samples or the training samples, the accuracy op gives me a value that is almost 0. Is this because I'm using the wrong accuracy function, or is the model itself the problem?
Here is the accuracy function, the test_image below is a batch with a single test sample, test_image_label is a single label:
correct_prediction = tf.equal(tf.argmax(GoogleNet(test_image), 1), tf.argmax(test_image_label, 0))
accuracy = tf.cast(correct_prediction, tf.float32)
with tf.Session() as sess:
    accuracy_vector = []
    for num in range(len(testnames)):
        accuracy_vector.append(sess.run(accuracy, feed_dict={keep_prob: 1.0}))
    print(accuracy_vector)
    mean_accuracy = sess.run(tf.divide(tf.add_n(accuracy_vector), len(testnames)))
    print("test accuracy %g"%mean_accuracy)
The model is defined as GoogleNet(data) above; it is a function that returns the logits of the input batch. The training was done like this:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=train_label_batch, logits=GoogleNet(train_batch)))
train_step = tf.train.MomentumOptimizer(learning_rate, 0.9).minimize(cost, global_step=global_step)
The train_step is run in every iteration. I think it is worth noting that after restoring the model, I cannot run print(GoogleNet(test_image).eval(feed_dict={keep_prob: 1.0})) in the session, with which I intended to take a look at the output of the model. It returns the error FailedPreconditionError (see above for traceback): Attempting to use uninitialized value Variable_213
[[Node: Variable_213/read = Identity[T=DT_FLOAT, _class=["loc:#Variable_213"], _device="/job:localhost/replica:0/task:0/cpu:0"](Variable_213)]]

How to handle gradients when training two sub-graphs simultaneously

The general idea I am trying to realize is a seq2seq-model (taken from the translate.py-example in the models, based on the seq2seq-class). This trains well.
Furthermore I am using the hidden state of the rnn after all the encoding is done, right before decoding starts (I call it the “hidden state at end of encoding”). I use this hidden state at end of encoding to feed it into a further sub-graph which I call “prices” (see below). The training gradients of this sub-graph backprop not only through this additional sub-graph, but also back into the encoder-part of the rnn (which is what I want and need).
The plan is to add more such sub-graphs to the hidden state at end of encoding, as I want to analyze the input phrases in a variety of ways.
Now during training when I evaluate and train both sub-graphs (encoder+prices AND encoder+decoder) at the same time, the net does NOT converge. However, if I train by executing the training in the following way (pseudo-code):
if global_step % 10 == 0:
execute-the-price-training_code
else:
execute-the-decoder-training_code
So I am not training both sub-graphs simultaneously. Now it does converge, but the encoder+decoder-part converges MUCH slower than if I ONLY train this part and never train the prices-sub-graph.
My question is: I should be able to train both sub-graphs simultaneously, but I probably have to rescale the gradients flowing back into the hidden state at end of encoding, since it receives gradients from the prices sub-graph AND from the decoder sub-graph. How should this rescaling be done? I didn't find any papers describing such an undertaking, but maybe I am searching with the wrong keywords.
Here is the training-part of the code:
This is the (almost original) training-op-preparation:
if not forward_only:
    self.gradient_norms = []
    self.updates = []
    opt = tf.train.AdadeltaOptimizer(self.learning_rate)
    for bucket_id in xrange(len(buckets)):
        tf.scalar_summary("seq2seq loss", self.losses[bucket_id])
        gradients = tf.gradients(self.losses[bucket_id], var_list_seq2seq)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
        self.gradient_norms.append(norm)
        self.updates.append(opt.apply_gradients(zip(clipped_gradients, var_list_seq2seq), global_step=self.global_step))
Now, additionally, I am running a second sub-graph that takes the hidden state at end of encoding as input:
with tf.name_scope('prices') as scope:
    #First layer
    W_price_first_layer = tf.Variable(tf.random_normal([num_layers*size, self.prices_hidden_layer_size], stddev=0.35), name="W_price_first_layer")
    B_price_first_layer = tf.Variable(tf.zeros([self.prices_hidden_layer_size]), name="B_price_first_layer")
    self.output_price_first_layer = tf.add(tf.matmul(self.hidden_state, W_price_first_layer), B_price_first_layer)
    self.activation_price_first_layer = tf.nn.sigmoid(self.output_price_first_layer)
    #self.activation_price_first_layer = tf.nn.Relu(self.output_price_first_layer)

    #Second layer to softmax (price ranges)
    W_price = tf.Variable(tf.random_normal([self.prices_hidden_layer_size, self.prices_bit_size], stddev=0.35), name="W_price")
    W_price_t = tf.transpose(W_price)
    B_price = tf.Variable(tf.zeros([self.prices_bit_size]), name="B_price")
    self.output_price_second_layer = tf.add(tf.matmul(self.activation_price_first_layer, W_price), B_price)
    self.price_prediction = tf.nn.softmax(self.output_price_second_layer)
    self.label_price = tf.placeholder(tf.int32, shape=[self.batch_size], name="price_label")

    #Remember the prices trainables
    var_list_prices = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "prices")
    var_list_all = tf.trainable_variables()

    #Backprop
    self.loss_price = tf.nn.sparse_softmax_cross_entropy_with_logits(self.output_price_second_layer, self.label_price)
    self.loss_price_scalar = tf.reduce_mean(self.loss_price)
    self.optimizer_price = tf.train.AdadeltaOptimizer(self.learning_rate_prices)
    self.training_op_price = self.optimizer_price.minimize(self.loss_price, var_list=var_list_all)
Thx a bunch
I expect that running two optimizers simultaneously will lead to inconsistent gradient updates on the common variables, and this might be causing your training not to converge.
Instead, if you add the scalar loss from each sub-network to the "losses collection" (e.g. via tf.contrib.losses.add_loss() or tf.add_to_collection(tf.GraphKeys.LOSSES, ...)), you can use tf.contrib.losses.get_total_loss() to get a single loss value that can be passed to a single standard TensorFlow tf.train.Optimizer subclass. TensorFlow will derive the appropriate back-prop computation for your split network.
The get_total_loss() method simply computes an unweighted sum of the values that have been added to the losses collection. I'm not familiar with the literature on how or if you should scale these values, but you can use any arbitrary (differentiable) TensorFlow expression to combine the losses and pass the result to a single optimizer.
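A minimal sketch of that suggestion, using the collection-based API the answer mentions (seq2seq_loss stands in for whichever bucket loss is being trained; the other names follow the question's code):
# add each sub-graph's scalar loss to the standard losses collection
tf.add_to_collection(tf.GraphKeys.LOSSES, seq2seq_loss)            # encoder+decoder loss
tf.add_to_collection(tf.GraphKeys.LOSSES, self.loss_price_scalar)  # encoder+prices loss

# one combined loss, one optimizer, one training op; TensorFlow back-props
# the summed gradients into the shared encoder automatically
total_loss = tf.contrib.losses.get_total_loss()
optimizer = tf.train.AdadeltaOptimizer(self.learning_rate)
train_op = optimizer.minimize(total_loss, global_step=self.global_step)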
