How to write an optimiser for StyleGAN2 interpolation? - machine-learning

I would like to interpolate two images using StyleGAN2-ADA-PyTorch from NVLabs. For the sake of simplicity, it can be said that with two images of different persons I want to create a third image depicting a third person, with a body from the first image, and their head from the second. I also have corresponding w-vectors for the two images ready at hand.
# G is a generative model in line with StyleGAN2, trained to output 512x512 images.
# Latents shape is [1, 16, 512]
G = G.eval().requires_grad_(False).to(device) # type: ignore
num_ws = G.mapping.num_ws # 16
w_dim = G.mapping.w_dim # 512
# Segmentation network is used to extract important parts from images
segmentation_dnn = segmentation_dnn.to(device)
# Source images are represented as latent vectors. I use G to generate actual images:
image_body = image_from_output(G.synthesis(w_body, noise_mode='const'))
image_head = image_from_output(G.synthesis(w_head, noise_mode='const'))
# Custom function is applied to source images, creating masked images.
# In masked images, only head or body is present (and the rest is filled with white pixels)
image_body_masked = apply_segmentation_mask(image_body, segmentation_dnn, select='body')
image_head_masked = apply_segmentation_mask(image_head, segmentation_dnn, select='head')
In order to compare similarity of any two images, I use VGGLos
# VGG16 is used as a feature extractor to evaluate image similarity
url = 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/vgg16.pt'
with dnnlib.util.open_url(url) as f:
vgg16 = torch.jit.load(f).eval().to(device)
class VGGLoss(nn.Module):
def __init__(self, device, vgg):
super().__init__()
for param in self.parameters():
param.requires_grad = False
self.criterion = nn.L1Loss().to(device)
def forward(self, source, target):
loss = 0
source_features = self.vgg(source, resize_images=False, return_lpips=True)
target_features = self.vgg(target, resize_images=False, return_lpips=True)
loss += self.criterion(source, target)
return loss
vgg_loss = VGGLoss(device, vgg=vgg16)
Now, I want to interpolate image_body and image_head, creating image_target.
To do this, I need to find latent representation of image_target in the latent space of StyleGAN2
Crudely, we can use optimize for a coefficient query_opt to partially include latents from image_body and image_head: w_target = w_body + (query_opt * (w_head - w_person))
query_opt = torch.randn([1, num_ws, 1], dtype=torch.float32, device=device, requires_grad=True)
optimizer = torch.optim.Adam(query_opt, betas=(0.9, 0.999), lr=initial_learning_rate)
w_out = []
for step in num_steps:
# Learning rate schedule.
t = step / num_steps
lr_ramp = min(1.0, (1.0 - t) / lr_rampdown_length)
lr_ramp = 0.5 - 0.5 * np.cos(lr_ramp * np.pi)
lr_ramp = lr_ramp * min(1.0, t / lr_rampup_length)
lr = initial_learning_rate * lr_ramp
for param_group in optimizer.param_groups:
param_group['lr'] = lr
# Synth image from w_target using query_opt.
# This interpolation formula is an important step, and I think my math might be out of order up here
w_target = w_body + (query_opt * (w_head - w_person))
image_target = image_from_output(G.synthesis(ws, noise_mode='const'))
image_target_body_masked = apply_segmentation_mask(image_target, segmentation_dnn, select='body')
image_target_head_masked = apply_segmentation_mask(image_target, segmentation_dnn, select='head')
loss = vgg_loss(image_body_masked, image_target_body_masked) + vgg_loss(image_head_masked, image_target_head_masked)
# Step
optimizer.zero_grad(set_to_none=True)
loss.backward()
optimizer.step()
logprint(f'step {step+1:>4d}/{num_steps}: loss {float(loss):<5.2f}')
# Save current w_target
w_out[step] = w_target.detach()
I can't figure out how to make my optimizer actually target query_opt in such a way that combined VGGLoss is actually optimized for. I must be missing something in my PyTorch code, or maybe even in the main interpolation formula.

Related

Why my feature map seems incorrect when the prediction of the class is correct

from torchvision.models.feature_extraction import create_feature_extractor
# Data processing
preprocess = transforms.Compose([
transforms.Resize((224, 224)),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
)])
image_path = './data/test_images/anemone.jpg'
image = Image.open(image_path).convert('RGB')
img_processed = preprocess(image)
batch_img_cat_tensor = torch.unsqueeze(img_processed, 0)
# Model initialization
resnet50_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
# Eval model for predictions
resnet50_model.eval()
# Creating feature extractor (Detailed example here: https://pytorch.org/blog/FX-feature-extraction-torchvision/)
feature_extractor = create_feature_extractor(resnet50_model,
return_nodes=['layer4.2.conv3', 'fc'])
# Forward pass
out = feature_extractor(batch_img_cat_tensor)
pred = torch.argmax(out['fc'])
# Transforming last conv output to numpy and reshaping it so that the channels would be last
last_conv_output = torch.squeeze(out['layer4.2.conv3'])
last_conv_output = torch.reshape(last_conv_output, (7, 7, -1))
last_conv_output = last_conv_output.detach().numpy()
last_conv_output = last_conv_output.astype(np.uint8)
Calculating the upscale factors for last conv output
width_factor = int(image.size[0] / last_conv_output.shape[0])
height_factor = int(image.size[1] / last_conv_output.shape[1])
# Getting the shapes of the last conv output
last_conv_w, last_conv_h, n_channels = last_conv_output.shape
# Calculate the
upscaled_h = last_conv_h * height_factor
upscaled_w = last_conv_w * width_factor
# Upscaling the last_conv_output so that it could be "masked" with original image
upsampled_last_conv_output = np.zeros((upscaled_h, upscaled_w, n_channels))
upsampled_last_conv_output = []
for x in range(0, n_channels, 512):
upsampled_last_conv_output.append(cv2.resize(last_conv_output[:, :, x:x+512], (upscaled_w, upscaled_h), cv2.INTER_CUBIC))
upsampled_last_conv_output = np.concatenate(upsampled_last_conv_output, axis=2)
# Getting the weights of the predicted class
last_layer_weights = resnet50_model.fc.weight.T
last_layer_weights_for_pred = last_layer_weights[:, pred]
# Dot multiplying the upsampled_last_conv_output with last_layer_weights_for_pred
upsampled_last_conv_output = upsampled_last_conv_output.reshape((-1, 2048))
heat_map = np.dot(upsampled_last_conv_output,
last_layer_weights_for_pred.detach().numpy()).reshape(upscaled_h, upscaled_w)
# Plotting the results
fig, ax = plt.subplots()
ax.imshow(image)
ax.imshow(heat_map, cmap='jet', alpha=0.5)
ax.set_title(prediction)
I have followed the tutorial from here: https://www.youtube.com/watch?v=GiyldmoYe_M&t=665s&ab_channel=DigitalSreeni
The main problem with this is that I get the feature map that looks like this:
As you see it looks like the model reacts to multiple areas on the image and no matter what image I use it always has the biggest reaction in the middle.
PS. If you think this question should be posted on the AI stack exchange please notify me
I have found an error I made. It was that after creating a
heat_map = np.dot(upsampled_last_conv_output, last_layer_weights_for_pred.detach().numpy()).reshape(upscaled_h, upscaled_w)
I had to apply this as well:
heat_map = heat_map - np.min(heat_map)
heat_map = heat_map / np.max(heat_map)
Since I normalized the image, the generated heatmap was also normalized, so I needed to "denormalize" it back to it's original values.

How to feed previous time-stamp prediction as additional input to the next time-stamp?

This question might have been asked, but I got confused.
I am trying to apply one of RNN types, e.g. LSTM for time-series forecasting. I have inputs, y (stock returns). For each timestamp, I'd like to get the predictions. Q1 - Am I correct choosing seq2seq approach?
I also want to use predictions from previous timestamp (initializing initial values with some constant) as additional (still using my existing inputs) input in the form of squared residuals, i.e. using
eps_{t-1} = (y_{t-1} - y^_{t-1})^2 as additional input at t (as well as previous inputs).
So, how can I do this in tensorflow or in pytorch?
I tried to depict what I want on the attached graph. The graph
p.s. Sorry, it the question is poorly formulated
Let say your input if of dimension (32,10,1) with batch_size 32, time steps of length 10 and dimension of 1. Same for your target (stock return). This code make use of the tf.scan function, which is usefull when implementing custom recurrent networks (it will iterate over the timesteps). It remains to use the residual of t-1 in t somewhere, as you would like to.
ps: it is a very basic implementation of lstm from scratch, without any bias or output activation.
import tensorflow as tf
import numpy as np
tf.reset_default_graph()
BS = 32
TS = 10
inputs_dim = 1
target_dim = 1
inputs = tf.placeholder(shape=[BS, TS, inputs_dim], dtype=tf.float32)
stock_returns = tf.placeholder(shape=[BS, TS, target_dim], dtype=tf.float32)
state_size = 16
# initial hidden state
init_state = tf.placeholder(shape=[2, BS, state_size],
dtype=tf.float32, name='initial_state')
# initializer
xav_init = tf.contrib.layers.xavier_initializer
# params
W = tf.get_variable('W', shape=[4, state_size, state_size],
initializer=xav_init())
U = tf.get_variable('U', shape=[4, inputs_dim, state_size],
initializer=xav_init())
W_out = tf.get_variable('W_out', shape=[state_size, target_dim],
initializer=xav_init())
#the function to feed tf.scan with
def step(prev, inputs_):
#unpack all inputs and previous outputs
st_1, ct_1 = prev[0][0], prev[0][1]
x = inputs_[0]
target = inputs_[1]
#get previous squared residual
eps = prev[1]
"""
here do whatever you want with eps_t-1
like x += eps if x if of the same dimension
or include it somewhere in your graph
"""
# lstm gates (add bias if needed)
#
# input gate
i = tf.sigmoid(tf.matmul(x,U[0]) + tf.matmul(st_1,W[0]))
# forget gate
f = tf.sigmoid(tf.matmul(x,U[1]) + tf.matmul(st_1,W[1]))
# output gate
o = tf.sigmoid(tf.matmul(x,U[2]) + tf.matmul(st_1,W[2]))
# gate weights
g = tf.tanh(tf.matmul(x,U[3]) + tf.matmul(st_1,W[3]))
ct = ct_1*f + g*i
st = tf.tanh(ct)*o
"""
make prediction, compute residual in t
and pass it to t+1
Normaly, we would compute prediction outside the scan function,
but as we need it here, we could just keep it and return it back
as an output of the scan function
"""
prediction_t = tf.matmul(st, W_out) # + bias
eps = (target - prediction_t)**2
return [tf.stack((st, ct), axis=0), eps, prediction_t]
states, eps, preds = tf.scan(step, [tf.transpose(inputs, [1,0,2]),
tf.transpose(stock_returns, [1,0,2])], initializer=[init_state,
tf.zeros((32,1), dtype=tf.float32),
tf.zeros((32,1),dtype=tf.float32)])
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
out = sess.run(preds, feed_dict=
{inputs:np.random.rand(BS,TS,inputs_dim),
stock_returns:np.random.rand(BS,TS,target_dim),
init_state:np.zeros((2,BS,state_size))})
out = tf.transpose(out,[1,0,2])
print(out)
And the output :
Tensor("transpose_2:0", shape=(32, 10, 1), dtype=float32)
Base code from here

Tensorflow - Weight Noise Regularization

I'm trying to implement weight noise regularization like Alex Graves made in his PhD Thesis, but I have several issues in how I should implement that. The algorithm should look like
while stopping criteria not met do
Randomize training set order
for each example in the training set do
Add zero mean Gaussian Noise to weights
Run forward and backward pass to calculate the gradient
Restore original weights
Update weights with gradient descent algorithm
Could anyone shed some light?
Edit 09/16/16
Here is my code:
# e.g: log filter bank or MFCC features
# Has size [batch_size, max_stepsize, num_features], but the
# batch_size and max_stepsize can vary along each step
inputs = tf.placeholder(tf.float32, [None, None, num_features])
# Here we use sparse_placeholder that will generate a
# SparseTensor required by ctc_loss op.
targets = tf.sparse_placeholder(tf.int32)
# 1d array of size [batch_size]
seq_len = tf.placeholder(tf.int32, [None])
# Defining the cell
# Can be:
# tf.nn.rnn_cell.RNNCell
# tf.nn.rnn_cell.GRUCell
cell = tf.nn.rnn_cell.LSTMCell(num_hidden, state_is_tuple=True)
# Stacking rnn cells
stack = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers,
state_is_tuple=True)
# The second output is the last state and we will no use that
outputs, _ = tf.nn.dynamic_rnn(cell, inputs, seq_len, dtype=tf.float32)
shape = tf.shape(inputs)
batch_s, max_timesteps = shape[0], shape[1]
# Reshaping to apply the same weights over the timesteps
outputs = tf.reshape(outputs, [-1, num_hidden])
# Truncated normal with mean 0 and stdev=0.1
# Tip: Try another initialization
# see https://www.tensorflow.org/versions/r0.9/api_docs/python/contrib.layers.html#initializers
W = tf.Variable(tf.truncated_normal([num_hidden,
num_classes],
stddev=0.1))
# Zero initialization
# Tip: Is tf.zeros_initializer the same?
b = tf.Variable(tf.constant(0., shape=[num_classes]))
# Doing the affine projection
logits = tf.matmul(outputs, W) + b
# Reshaping back to the original shape
logits = tf.reshape(logits, [batch_s, -1, num_classes])
# Time major
logits = tf.transpose(logits, (1, 0, 2))
loss = tf.contrib.ctc.ctc_loss(logits, targets, seq_len)
cost = tf.reduce_mean(loss)
optimizer = tf.train.MomentumOptimizer(initial_learning_rate,
0.9).minimize(cost)
# Option 2: tf.contrib.ctc.ctc_beam_search_decoder
# (it's slower but you'll get better results)
decoded, log_prob = tf.contrib.ctc.ctc_greedy_decoder(logits, seq_len)
# Inaccuracy: label error rate
ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
targets))
Edit 09/27/16
I realized that I must change my optimizer in order to add noise weight regularizer. But, I have no idea how to insert this on my code.
variables = tf.trainable_variables()
with tf.variable_scope(self.name or "OptimizeLoss", [loss, global_step]):
update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
# Make sure update ops are ran before computing loss.
if update_ops:
loss = control_flow_ops.with_dependencies(list(update_ops), loss)
add_noise_ops = [tf.no_op()]
if self.weights_noise_scale is not None:
add_noise_ops, remove_noise_ops = self._noise_ops(variables, self.weights_noise_scale)
# Make sure add noise to weights before computing loss.
loss = control_flow_ops.with_dependencies(add_noise_ops, loss)
# Compute gradients.
gradients = self._opt.compute_gradients(loss, variables, colocate_gradients_with_ops=self.colocate_gradients_with_ops)
# Optionally add gradient noise.
if self.gradient_noise_scale is not None:
gradients = self._add_scaled_noise_to_gradients(gradients, self.gradient_noise_scale)
# Optionally clip gradients by global norm.
if self.clip_gradients_by_global_norm is not None:
gradients = self._clip_gradients_by_global_norm(gradients, self.clip_gradients_by_global_norm)
# Optionally clip gradients by value.
if self.clip_gradients_by_value is not None:
gradients = self._clip_gradients_by_value(gradients, self.clip_gradients_by_value)
# Optionally clip gradients by norm.
if self.clip_gradients_by_norm is not None:
gradients = self._clip_gradients_by_norm(gradients, self.clip_gradients_by_norm)
self._grads = [g[0] for g in gradients]
self._vars = [g[1] for g in gradients]
# Create gradient updates.
# Make sure that the noise of weights will be removed before the gradient update rule
grad_updates = self._opt.apply_gradients(gradients,
global_step=global_step,
name="train")
# Ensure the train_tensor computes grad_updates.
train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)
Could anyone shed some light on me?
Thanks :)
To solve that I would build 2 graphs: one for training and another for evaluation. The latter won't sum the noise to the weights. To sum a random noise to the weights, you can just do:
W = tf.Variable(tf.truncated_normal([num_hidden,
num_classes],
stddev=0.1))
noise = tf.truncated_normal([num_hidden, num_classes],
stddev=0.001))
W = W + noise
The tensor tf.truncated_normal will add a small amount of random noise to your weights.

Tensorflow cross-entropy NaN, and changing learning rate doesn't seem to have an impact

TL;DR
Trying to build a bidirectional RNN for sequence tagging using tensorflow.
The goal is to take inputs "I like New York" and produce outputs "O O LOC_START LOC"
The graph compiles and runs, but the loss becomes NaN after 1 or 2 batches. I understand this could be a problem with the learning rate, but changing the learning rate seems to have no impact. Using AdamOptimizer at the moment.
Any help would be appreciated.
Here is my code:
Code:
# The input and output: a sequence of words, embedded, and a sequence of word classifications, one-hot
self.input_x = tf.placeholder(tf.float32, [None, n_sequence_length, n_embedding_dim], name="input_x")
self.input_y = tf.placeholder(tf.float32, [None, n_sequence_length, n_output_classes], name="input_y")
# New shape: [sequence_length, batch_size (None), embedding_dim]
inputs = tf.transpose(self.input_x, [1, 0, 2])
# New shape: [sequence_length * batch_size (None), embedding_dim]
inputs = tf.reshape(inputs, [-1, n_embedding_dim])
# Define weights
w_hidden = tf.Variable(tf.random_normal([n_embedding_dim, 2 * n_hidden_states]))
b_hidden = tf.Variable(tf.random_normal([2 * n_hidden_states]))
w_out = tf.Variable(tf.random_normal([2 * n_hidden_states, n_output_classes]))
b_out = tf.Variable(tf.random_normal([n_output_classes]))
# Linear activation for the input; this will make it fit to the hidden size
inputs = tf.nn.xw_plus_b(inputs, w_hidden, b_hidden)
# Split up the batches into a Python list
inputs = tf.split(0, n_sequence_length, inputs)
# Now we define our cell. It takes one word as input, a vector of embedding_size length
cell_forward = rnn_cell.BasicLSTMCell(n_hidden_states, forget_bias=0.0)
cell_backward = rnn_cell.BasicLSTMCell(n_hidden_states, forget_bias=0.0)
# And we add a Dropout Wrapper as appropriate
if is_training and prob_keep < 1:
cell_forward = rnn_cell.DropoutWrapper(cell_forward, output_keep_prob=prob_keep)
cell_backward = rnn_cell.DropoutWrapper(cell_backward, output_keep_prob=prob_keep)
# And we make it a few layers deep
cell_forward_multi = rnn_cell.MultiRNNCell([cell_forward] * n_layers)
cell_backward_multi = rnn_cell.MultiRNNCell([cell_backward] * n_layers)
# returns outputs = a list T of tensors [batch, 2*hidden]
outputs = rnn.bidirectional_rnn(cell_forward_multi, cell_backward_multi, inputs, dtype=dtypes.float32)
# [sequence, batch, 2*hidden]
outputs = tf.pack(outputs)
# [batch, sequence, 2*hidden]
outputs = tf.transpose(outputs, [1, 0, 2])
# [batch * sequence, 2 * hidden]
outputs = tf.reshape(outputs, [-1, 2 * n_hidden_states])
# [batch * sequence, output_classes]
self.scores = tf.nn.xw_plus_b(outputs, w_out, b_out)
# [batch * sequence, output_classes]
inputs_y = tf.reshape(self.input_y, [-1, n_output_classes])
# [batch * sequence]
self.predictions = tf.argmax(self.scores, 1, name="predictions")
# Now calculate the cross-entropy
losses = tf.nn.softmax_cross_entropy_with_logits(self.scores, inputs_y)
self.loss = tf.reduce_mean(losses, name="loss")
if not is_training:
return
# Training
self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.loss)
# Evaluate model
correct_pred = tf.equal(self.predictions, tf.argmax(inputs_y, 1))
self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name="accuracy")
Could there be an example in the training data where something is wrong with the labels? Then when it hits that example the cost become NaN. I'm suggesting this because it seems like it still happens when the learning rate is zero and after just a few batches.
Here is how I would debug:
Set the batch size to 1
set the learning rate to 0.0
when you run a batch have tensorflow output the intermediate values not just the cost
run until you get a NaN and then check to see what the input was and by examining the intermediate outputs determine at which point there is a NaN

Stacked RNN model setup in TensorFlow

I'm kind of lost in building up a stacked LSTM model for text classification in TensorFlow.
My input data was something like:
x_train = [[1.,1.,1.],[2.,2.,2.],[3.,3.,3.],...,[0.,0.,0.],[0.,0.,0.],
...... #I trained the network in batch with batch size set to 32.
]
y_train = [[1.,0.],[1.,0.],[0.,1.],...,[1.,0.],[0.,1.]]
# binary classification
The skeleton of my code looks like:
self._input = tf.placeholder(tf.float32, [self.batch_size, self.max_seq_length, self.vocab_dim], name='input')
self._target = tf.placeholder(tf.float32, [self.batch_size, 2], name='target')
lstm_cell = rnn_cell.BasicLSTMCell(self.vocab_dim, forget_bias=1.)
lstm_cell = rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=self.dropout_ratio)
self.cells = rnn_cell.MultiRNNCell([lstm_cell] * self.num_layers)
self._initial_state = self.cells.zero_state(self.batch_size, tf.float32)
inputs = tf.nn.dropout(self._input, self.dropout_ratio)
inputs = [tf.reshape(input_, (self.batch_size, self.vocab_dim)) for input_ in
tf.split(1, self.max_seq_length, inputs)]
outputs, states = rnn.rnn(self.cells, inputs, initial_state=self._initial_state)
# We only care about the output of the last RNN cell...
y_pred = tf.nn.xw_plus_b(outputs[-1], tf.get_variable("softmax_w", [self.vocab_dim, 2]), tf.get_variable("softmax_b", [2]))
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y_pred, self._target))
correct_pred = tf.equal(tf.argmax(y_pred, 1), tf.argmax(self._target, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)
init = tf.initialize_all_variables()
with tf.Session() as sess:
initializer = tf.random_uniform_initializer(-0.04, 0.04)
with tf.variable_scope("model", reuse=True, initializer=initializer):
sess.run(init)
# generate batches here (omitted for clarity)
print sess.run([train_op, loss, accuracy], feed_dict={self._input: batch_x, self._target: batch_y})
The problem is that no matter how large the dataset is, the loss and accuracy has no sign of improvement (looks completely stochastic). Am I doing anything wrong?
Update:
# First, load Word2Vec model in Gensim.
model = Doc2Vec.load(word2vec_path)
# Second, build the dictionary.
gensim_dict = Dictionary()
gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
w2indx = {v: k + 1 for k, v in gensim_dict.items()}
w2vec = {word: model[word] for word in w2indx.keys()}
# Third, read data from a text file.
for fname in fnames:
i = 0
with codecs.open(fname, 'r', encoding='utf8') as fr:
for line in fr:
tmp = []
for t in line.split():
tmp.append(t)
X_train.append(tmp)
i += 1
if i is samples_count:
break
# Fourth, convert words into vectors, and pad each sentence with ZERO arrays to a fixed length.
result = np.zeros((len(data), self.max_seq_length, self.vocab_dim), dtype=np.float32)
for rowNo in xrange(len(data)):
rowLen = len(data[rowNo])
for colNo in xrange(rowLen):
word = data[rowNo][colNo]
if word in w2vec:
result[rowNo][colNo] = w2vec[word]
else:
result[rowNo][colNo] = [0] * self.vocab_dim
for colPadding in xrange(rowLen, self.max_seq_length):
result[rowNo][colPadding] = [0] * self.vocab_dim
return result
# Fifth, generate batches and feed them to the model.
... Trivias ...
Here are few reasons it may not be training and suggestions to try:
You are not allowing to update word vectors, space of pre-learned vectors may be not working properly.
RNNs really need gradient clipping when trained. You can try adding something like this.
Unit scale initialization seems to work better, as it accounts for the size of the layer and allows gradient to be scaled properly as it goes deeper.
You should try removing dropout and second layer - just to check if your data passing is correct and your loss is going down at all.
I also can recommend trying this example with your data: https://github.com/tensorflow/skflow/blob/master/examples/text_classification.py
It trains word vectors from scratch, already has gradient clipping and uses GRUCells which usually are easier to train. You can also see nice visualizations for loss and other things by running tensorboard logdir=/tmp/tf_examples/word_rnn.

Resources