Using WeightedRandomSampler in PyTorch - machine-learning

I need to implement a multi-label image classification model in PyTorch. However my data is not balanced, so I used the WeightedRandomSampler in PyTorch to create a custom dataloader. But when I iterate through the custom dataloader, I get the error : IndexError: list index out of range
Implemented the following code using this link :https://discuss.pytorch.org/t/balanced-sampling-between-classes-with-torchvision-dataloader/2703/3?u=surajsubramanian
def make_weights_for_balanced_classes(images, nclasses):
    """Return one sampling weight per image, inversely proportional to class size.

    Args:
        images: sequence of ``(item, class_index)`` pairs; only the class
            index (element 1) is read.
        nclasses: total number of classes. Every class index found in
            ``images`` must be smaller than this value, otherwise an
            ``IndexError`` is raised.

    Returns:
        A list of ``len(images)`` floats where entry ``i`` is
        ``N / count[class of image i]`` and ``N`` is the number of images.
    """
    count = [0] * nclasses
    for item in images:
        count[item[1]] += 1
    total = float(sum(count))
    # A class without any image gets weight 0.0 instead of raising
    # ZeroDivisionError; no image ever looks that entry up.
    weight_per_class = [total / c if c else 0.0 for c in count]
    return [weight_per_class[item[1]] for item in images]
# Per-image weights, converted to the double tensor handed to the sampler.
weights = torch.DoubleTensor(
    make_weights_for_balanced_classes(
        train_dataset.imgs, len(full_dataset.classes)
    )
)
# Draw as many samples per epoch as there are weights.
sampler = WeightedRandomSampler(weights, num_samples=len(weights))
train_loader = DataLoader(
    train_dataset,
    batch_size=4,
    sampler=sampler,
    pin_memory=True,
)
Based on the answer in https://stackoverflow.com/a/60813495/10077354, the following is my updated code. However, when I then create a dataloader with loader = DataLoader(full_dataset, batch_size=4, sampler=sampler), len(loader) returns 1.
# Number of images in each class, indexed by class id.
class_counts = [1691, 743, 2278, 1271]
# Built-in sum keeps this a plain Python int, which is what the sampler's
# num_samples argument expects (np.sum would return a NumPy integer).
num_samples = sum(class_counts)
labels = [tag for _, tag in full_dataset.imgs]
# Rarer classes get proportionally larger weights.
class_weights = [num_samples / count for count in class_counts]
# One weight per dataset item, in dataset order.
weights = [class_weights[label] for label in labels]
sampler = WeightedRandomSampler(torch.DoubleTensor(weights), num_samples)
Thanks a lot in advance !
I included an utility function based on the accepted answer below :
def sampler_(dataset):
    """Build a WeightedRandomSampler that balances the classes of ``dataset``.

    Args:
        dataset: iterable of ``(image, class_index)`` pairs. Class indices
            must be smaller than the module-level ``n_classes``.

    Returns:
        A ``WeightedRandomSampler`` holding one weight per dataset item
        (item count divided by the size of the item's class) that draws as
        many samples as the dataset has items.
    """
    # Collect the labels in a single pass; iterating the dataset can be
    # expensive, so counts and weights are both derived from this list.
    labels = [tag for _, tag in dataset]
    dataset_counts = [0] * n_classes
    for tag in labels:
        dataset_counts[tag] += 1
    num_samples = len(labels)
    # An empty class gets weight 0.0 rather than raising ZeroDivisionError.
    class_weights = [num_samples / c if c else 0.0 for c in dataset_counts]
    weights = [class_weights[tag] for tag in labels]
    return WeightedRandomSampler(torch.DoubleTensor(weights), num_samples)
The imageCount function finds number of images of each class in the dataset. Each row in the dataset contains the image and the class, so we take the second element in the tuple into consideration.
def imageCount(dataset, num_classes=None):
    """Count how many items of ``dataset`` belong to each class.

    Args:
        dataset: iterable of rows whose second element is the class index.
        num_classes: number of classes to count. When omitted, the
            module-level ``n_classes`` is used.

    Returns:
        A list of length ``num_classes`` where entry ``k`` is the number of
        rows labelled ``k``.
    """
    if num_classes is None:
        num_classes = n_classes
    image_count = [0] * num_classes
    for img in dataset:
        image_count[img[1]] += 1
    return image_count

That code looks a bit complex... You can try the following:
# Let there be 9 samples in class 0 and 1 sample in class 1.
class_counts = [9.0, 1.0]
num_samples = sum(class_counts)
# Corresponding labels of the samples, spelled out explicitly: a literal
# `...` inside the list would be an Ellipsis object and could not be used
# as an index into class_weights.
labels = [0] * 9 + [1]
class_weights = [num_samples / count for count in class_counts]
# One weight per sample, looked up through the sample's label.
weights = [class_weights[label] for label in labels]
sampler = WeightedRandomSampler(torch.DoubleTensor(weights), int(num_samples))

Here is an alternative solution:
import numpy as np
from torch.utils.data.sampler import WeightedRandomSampler
# Number of samples observed for each integer label (index == label).
counts = np.bincount(y)
# Inverse frequency: the rarer a label, the larger its weight.
labels_weights = 1. / counts
# Index by label to obtain one weight per sample.
weights = labels_weights[y]
# NOTE(review): the sampler is constructed here but not bound to a name;
# assign it before passing it to a DataLoader.
WeightedRandomSampler(weights, len(weights))
where y is a list of labels, one per sample, with shape (n_samples,) and encoded as integers in [0, ..., n_classes - 1].
weights won't add up to 1, which is ok according to the official docs.

Related

Why my feature map seems incorrect when the prediction of the class is correct

from torchvision.models.feature_extraction import create_feature_extractor
# Data processing
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])
image_path = './data/test_images/anemone.jpg'
image = Image.open(image_path).convert('RGB')
img_processed = preprocess(image)
batch_img_cat_tensor = torch.unsqueeze(img_processed, 0)

# Model initialization
resnet50_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
# Eval model for predictions
resnet50_model.eval()
# Creating feature extractor (Detailed example here: https://pytorch.org/blog/FX-feature-extraction-torchvision/)
# NOTE(review): 'layer4.2.conv3' is read before that block's batch norm,
# residual add and ReLU, whereas `fc` consumes the pooled output of
# `layer4` - confirm this is the node intended for a class activation map.
feature_extractor = create_feature_extractor(
    resnet50_model, return_nodes=['layer4.2.conv3', 'fc'])

# Forward pass
out = feature_extractor(batch_img_cat_tensor)
pred = torch.argmax(out['fc'])

# Move the channels last: (C, H, W) -> (H, W, C). permute reorders the
# axes; reshape would only reinterpret the flat buffer and mix channel
# values with spatial positions.
last_conv_output = torch.squeeze(out['layer4.2.conv3'])
last_conv_output = last_conv_output.permute(1, 2, 0).contiguous()
# Keep the float activations: casting to uint8 would truncate them and
# wrap negative values.
last_conv_output = last_conv_output.detach().numpy()

# Getting the shapes of the last conv output (height, width, channels)
last_conv_h, last_conv_w, n_channels = last_conv_output.shape
# Calculating the upscale factors for last conv output
# (PIL's image.size is (width, height))
width_factor = int(image.size[0] / last_conv_w)
height_factor = int(image.size[1] / last_conv_h)
# Calculate the upscaled size
upscaled_h = last_conv_h * height_factor
upscaled_w = last_conv_w * width_factor

# Upscaling the last_conv_output so that it could be "masked" with original image.
# The channels are resized in slices of 512 and concatenated again.
upsampled_last_conv_output = []
for x in range(0, n_channels, 512):
    # interpolation must be passed by keyword: the third positional
    # argument of cv2.resize is `dst`, not the interpolation flag.
    upsampled_last_conv_output.append(cv2.resize(
        last_conv_output[:, :, x:x+512], (upscaled_w, upscaled_h),
        interpolation=cv2.INTER_CUBIC))
upsampled_last_conv_output = np.concatenate(upsampled_last_conv_output, axis=2)

# Getting the weights of the predicted class
last_layer_weights = resnet50_model.fc.weight.T
last_layer_weights_for_pred = last_layer_weights[:, pred]

# Dot multiplying the upsampled_last_conv_output with last_layer_weights_for_pred
upsampled_last_conv_output = upsampled_last_conv_output.reshape((-1, n_channels))
heat_map = np.dot(upsampled_last_conv_output,
                  last_layer_weights_for_pred.detach().numpy()).reshape(upscaled_h, upscaled_w)

# Plotting the results
fig, ax = plt.subplots()
ax.imshow(image)
ax.imshow(heat_map, cmap='jet', alpha=0.5)
# `pred` is the predicted class index computed above.
ax.set_title(str(pred.item()))
I have followed the tutorial from here: https://www.youtube.com/watch?v=GiyldmoYe_M&t=665s&ab_channel=DigitalSreeni
The main problem with this is that I get the feature map that looks like this:
As you see it looks like the model reacts to multiple areas on the image and no matter what image I use it always has the biggest reaction in the middle.
PS. If you think this question should be posted on the AI stack exchange please notify me
I have found an error I made. It was that after creating a
heat_map = np.dot(upsampled_last_conv_output, last_layer_weights_for_pred.detach().numpy()).reshape(upscaled_h, upscaled_w)
I had to apply this as well:
heat_map = heat_map - np.min(heat_map)
heat_map = heat_map / np.max(heat_map)
Since I normalized the image, the generated heatmap was also on a normalized scale, so I needed to rescale it to the [0, 1] range before overlaying it on the original image.

Removing Softmax from last layer yields a lot better results

I was solving an nlp task, of converting English sentences to German in Keras. But the model was not learning... But as soon as I removed the softmax from the last layer, it started working! Is this a bug in Keras, or it has to do with something else?
optimizer = Adam()
# from_logits=True: the loss applies the softmax itself, so the values
# passed as `pred` must be raw, un-normalised scores.
# reduction='none' keeps one loss value per element so padding can be
# masked out below.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss_function(real, pred):
    """Return the mean cross-entropy with padded positions zeroed out.

    Args:
        real: integer target ids; the id 0 is treated as padding.
        pred: raw scores (logits) for the corresponding positions.

    Returns:
        The mean of the per-element losses after the losses at padded
        positions have been multiplied by zero.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    # Cast so the boolean mask can be multiplied with the loss values.
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)
# Training schedule.
EPOCHS = 20
batch_size = 64
# Whole batches per epoch; a trailing partial batch is not counted.
batch_per_epoch = int(train_x1.shape[0] / batch_size)
# Layer sizes.
embed_dim = 256
units = 1024
attention_units = 10
# Separate embeddings for the source (English) and target (German) vocabularies.
encoder_embed = Embedding(english_vocab_size, embed_dim)
decoder_embed = Embedding(german_vocab_size, embed_dim)
# Both GRUs return the full output sequence and their final state.
encoder = GRU(units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
decoder = GRU(units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
# Output projection without an activation: it produces one raw score per German word.
dense = Dense(german_vocab_size)
# Attention scoring layers: two projections that are summed, then a
# projection to a single score per encoder position.
attention1 = Dense(attention_units)
attention2 = Dense(attention_units)
attention3 = Dense(1)
def train_step(english_input, german_target):
    """Run one teacher-forced training step and return the batch loss.

    Args:
        english_input: batch of source token ids.
        german_target: batch of target token ids; column 0 is the start
            token and columns 1.. are predicted one step at a time.

    Returns:
        The accumulated loss divided by the target sequence length.
    """
    loss = 0
    # Size the start-token batch from the data itself so a batch that is
    # smaller than the global batch_size still works.
    current_batch = german_target.shape[0]
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(encoder_embed(english_input))
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims(
            [german_tokenizer.word_index['startseq']] * current_batch, 1)
        for i in range(1, german_target.shape[1]):
            # Additive attention score for every encoder position.
            attention_weights = attention1(enc_output) + attention2(tf.expand_dims(dec_hidden, axis=1))
            attention_weights = tanh(attention_weights)
            attention_weights = attention3(attention_weights)
            # Normalise the scores over the encoder time axis.
            attention_weights = Softmax(axis=1)(attention_weights)
            Context_Vector = tf.reduce_sum(enc_output * attention_weights, axis=1)
            Context_Vector = tf.expand_dims(Context_Vector, axis=1)
            x = decoder_embed(dec_input)
            x = Concatenate(axis=-1)([x, Context_Vector])
            # NOTE(review): the decoder GRU is called without
            # initial_state, so dec_hidden only reaches it through the
            # attention context - confirm this is intended.
            dec_output, dec_hidden = decoder(x)
            output = tf.reshape(dec_output, (-1, dec_output.shape[2]))
            # Raw scores; loss_object is configured with from_logits=True.
            prediction = dense(output)
            loss += loss_function(german_target[:, i], prediction)
            # Teacher forcing: feed the true token as the next input.
            dec_input = tf.expand_dims(german_target[:, i], 1)
    batch_loss = (loss / int(german_target.shape[1]))
    variables = (
        encoder_embed.trainable_variables
        + decoder_embed.trainable_variables
        + encoder.trainable_variables
        + decoder.trainable_variables
        + dense.trainable_variables
        + attention1.trainable_variables
        + attention2.trainable_variables
        + attention3.trainable_variables
    )
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss
Code Summary
The code just take the English sentence and German Sentence as input (It takes German sentence as input to implement Teacher-Forcing Method), and predicts the translated German sentence.
The loss function is SparseCategoricalCrossentropy, but it subtracts the loss of the 0. For example, lets say, we have a sentence, that is : 'StartSeq This is Stackoverflow 0 0 0 0 0 EndSeq' (The sentence also has a zero padding to make all the input sentences of the same length). Now, we would calculate loss for every word but not for the 0's. Doing this makes the model better.
Note - this model implementation implements Bahdanau Attention
Question
When I apply softmax to the output of the last layer, the model doesn't learn anything. But it learns properly without softmax in the last layer. Why is this happening?

Neural Network does not perform well on the CIFAR-10 dataset

I have been trying to implement a CNN on the CIFAR-10 dataset for a few days, but my test set accuracy does not go beyond 10% and the error just hangs around 69.07733. I have been tweaking the model for a few days, but in vain. I haven't been able to spot where I am going wrong. Please help me find the fault in the model. Here is the code for it:
import os
import sys
import pickle
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
data_root = './cifar-10-batches-py'
# Pre-allocated buffers: one flattened 3072-value row per image.
train_data = np.ndarray(shape=(50000,3072), dtype=np.float32)
train_labels = np.ndarray(shape=(50000), dtype=np.float32)
num_images = 0
test_data = np.ndarray(shape=(10000,3072),dtype = np.float32)
test_labels = np.ndarray(shape=(10000),dtype=np.float32)
meta_data = {}
for file in os.listdir(data_root):
    # Only unpickle the files this script understands; anything else in
    # the folder (for example an HTML readme) is not a pickle.
    is_meta = file == 'batches.meta'
    is_train = 'data_batch_' in file
    is_test = 'test_batch' in file
    if not (is_meta or is_train or is_test):
        continue
    file_path = os.path.join(data_root,file)
    with open(file_path,'rb') as f:
        temp = pickle.load(f,encoding ='bytes')
    if is_meta:
        for i,j in enumerate(temp[b'label_names']):
            meta_data[i] = j
    if is_train:
        # Copy the whole batch at once, appending after the rows already
        # filled by earlier training batches.
        batch_rows = len(temp[b'labels'])
        train_data[num_images:num_images + batch_rows,:] = temp[b'data']
        train_labels[num_images:num_images + batch_rows] = temp[b'labels']
        num_images += batch_rows
    if is_test:
        batch_rows = len(temp[b'labels'])
        test_data[:batch_rows,:] = temp[b'data']
        test_labels[:batch_rows] = temp[b'labels']
'''
print('meta: \n',meta_data)
train_data = train_data.reshape(50000,3,32,32).transpose(0,2,3,1)
print('\ntrain data: \n',train_data.shape,'\nLabels: \n',train_labels[0])
print('\ntest data: \n',test_data[0].shape,'\nLabels: \n',train_labels[0])'''
#accuracy function acc = (no. of correct prediction/total attempts) * 100
def accuracy(predictions, labels):
    """Return the percentage of rows whose arg-max matches the label's arg-max.

    Args:
        predictions: 2-D array of per-class scores, one row per sample.
        labels: 2-D one-hot array with the same number of rows.

    Returns:
        A float percentage (0 to 100).
    """
    correct = np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
    return 100 * (correct / predictions.shape[0])
#reformat the data
def reformat(data, labels, num_labels=10):
    """Reshape flat image rows to NHWC and one-hot encode the labels.

    Args:
        data: array of shape ``(n, 3072)``; each row holds three
            consecutive 32x32 channel planes.
        labels: 1-D array of ``n`` class indices.
        num_labels: width of the one-hot encoding (defaults to 10).

    Returns:
        ``(data, labels)`` where data is float32 with shape
        ``(n, 32, 32, 3)`` and labels is float32 with shape
        ``(n, num_labels)``.
    """
    # Split each row into (channel, row, col), then move channels last.
    data = data.reshape(data.shape[0], 3, 32, 32).transpose(0, 2, 3, 1).astype(np.float32)
    # Broadcasting the comparison yields one one-hot row per label.
    labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32)
    return data, labels
# Convert both splits to NHWC images and one-hot labels.
train_data, train_labels = reformat(train_data,train_labels)
test_data, test_labels = reformat(test_data, test_labels)
print ('Train ',train_data[0][1])
plt.axis("off")
# The pixels are floats in 0..255; imshow clips float RGB data to [0, 1],
# so show an unsigned 8-bit view of the image instead.
plt.imshow(train_data[1].astype(np.uint8), interpolation = 'nearest')
plt.savefig("1.png")
plt.show()
'''
print("Train: \n",train_data.shape,test_data[0],"\nLabels: \n",train_labels.shape,train_labels[:11])
print("Test: \n",test_data.shape,test_data[0],"\nLabels: \n",test_labels.shape,test_labels[:11])'''
image_size = 32
num_channels = 3
batch_size = 30
patch_size = 5
depth = 64
num_hidden = 256
num_labels = 10
graph = tf.Graph()
with graph.as_default():
    #input data and labels
    train_input = tf.placeholder(tf.float32,shape=(batch_size,image_size,image_size,num_channels))
    train_output = tf.placeholder(tf.float32,shape=(batch_size,num_labels))
    test_input = tf.constant(test_data)

    #layer weights and biases
    # stddev=0.1: truncated_normal defaults to stddev=1.0, which makes the
    # initial activations very large for layers of this size.
    layer_1_weights = tf.Variable(tf.truncated_normal([patch_size,patch_size,num_channels,depth], stddev=0.1))
    layer_1_biases = tf.Variable(tf.zeros([depth]))
    layer_2_weights = tf.Variable(tf.truncated_normal([patch_size,patch_size,depth,depth], stddev=0.1))
    layer_2_biases = tf.Variable(tf.constant(0.1, shape=[depth]))
    # Two stride-2 'SAME' poolings shrink each spatial side by a factor of 4.
    flat_size = (image_size // 4) * (image_size // 4) * depth
    layer_3_weights = tf.Variable(tf.truncated_normal([flat_size, num_hidden], stddev=0.1))
    layer_3_biases = tf.Variable(tf.constant(0.1, shape=[num_hidden]))
    layer_4_weights = tf.Variable(tf.truncated_normal([num_hidden, num_labels], stddev=0.1))
    layer_4_biases = tf.Variable(tf.constant(0.1, shape=[num_labels]))

    def convnet(data):
        """Return the class logits for a batch of NHWC images."""
        conv_1 = tf.nn.conv2d(data, layer_1_weights,[1,1,1,1], padding = 'SAME')
        hidden_1 = tf.nn.relu(conv_1+layer_1_biases)
        norm_1 = tf.nn.lrn(hidden_1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
        pool_1 = tf.nn.max_pool(norm_1,[1,2,2,1],[1,2,2,1], padding ='SAME')
        conv_2 = tf.nn.conv2d(pool_1,layer_2_weights,[1,1,1,1], padding = 'SAME')
        hidden_2 = tf.nn.relu(conv_2+layer_2_biases)
        norm_2 = tf.nn.lrn(hidden_2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
        pool_2 = tf.nn.max_pool(norm_2,[1,2,2,1],[1,2,2,1], padding ='SAME')
        # Flatten every image to a single feature vector.
        shape = pool_2.get_shape().as_list()
        hidd2_trans = tf.reshape(pool_2,[shape[0],shape[1]*shape[2]*shape[3]])
        hidden_3 = tf.nn.relu(tf.matmul(hidd2_trans,layer_3_weights) + layer_3_biases)
        # No ReLU on the output layer: once every logit is clamped to 0
        # the softmax is uniform, the gradient through the ReLU is zero,
        # and the summed loss sits at batch_size * ln(num_labels)
        # (30 * ln 10 is about 69.08).
        return tf.matmul(hidden_3,layer_4_weights) + layer_4_biases

    logits = convnet(train_input)
    loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels=train_output, logits = logits))
    optimizer = tf.train.AdamOptimizer(1e-4).minimize(loss)
    train_prediction = tf.nn.softmax(logits)
    test_prediction = tf.nn.softmax(convnet(test_input))
num_steps = 100000
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized \n')
    for step in range(num_steps):
        # Slide a batch-sized window over the training set, wrapping
        # around before the window would run past the end.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch = train_data[offset:(offset+batch_size),:,:,:]
        batch_labels = train_labels[offset:(offset+batch_size),:]
        feed_dict ={train_input: batch, train_output: batch_labels}
        _,l,prediction = session.run([optimizer, loss, train_prediction], feed_dict = feed_dict)
        # Report progress every 500 steps.
        if (step % 500 == 0):
            print("Loss at step %d: %f" %(step, l))
            print("Accuracy: %f" %(accuracy(prediction, batch_labels)))
            print("Test accuracy: %f" %(accuracy(session.run(test_prediction), test_labels)))
On a first glance I would say the initialization of the CNN is the culprit. A convnet is an optimization algorithm in a highly non-convex space and therefore depends a lot on careful initialization to not get stuck on local minima or saddle points. Look at xavier initialization for an example on how to fix that.
Example Code:
W = tf.get_variable("W", shape=[784, 256],
initializer=tf.contrib.layers.xavier_initializer())
Problem is your network is having very high depth(number of filters = 64 for both layers). Also, you are training the network from scratch. And your dataset of CIFAR10 (50000 images) is very little. Moreover, each CIFAR10 image is only 32x32x3 size.
Couple of alternatives what I can suggest you is to retrain a pre-trained model, i.e do transfer learning.
Other better alternative is to reduce the number of filters in each layer. In this way, you will be able to train the model from scratch and also it will be faster. (Assuming you don't have GPU).
Next you are making use of local response normalization. I would suggest you to remove this layer and do mean normalization in pre-processing step.
Next, if you feel the learning is not picking up at all, try increasing the learning rate a little and see.
Lastly, just to reduce some operation in your code, you are reshaping your tensor and then doing transpose in many places like this:
data.reshape(data.shape[0],3,32,32).transpose(0,2,3,1)
Why not directly reshape it to something like this?
data.reshape(data.shape[0], 32, 32, 3)
Hope the answer helps you.

Tensorflow Grid3LSTMCell visualization

I'm having a difficult time visualizing what this Tensorflow class creates. I want to implement a LSTM RNN that handles 3D data.
class Grid3LSTMCell(GridRNNCell):
    """3D BasicLSTM cell
    This creates a 2D cell which receives input and gives output in the first dimension.
    The first dimension can optionally be non-recurrent if `non_recurrent_fn` is specified.
    The second and third dimensions are LSTM.
    """

    def __init__(self, num_units, tied=False, non_recurrent_fn=None,
                 use_peepholes=False, forget_bias=1.0):
        # Configure the generic grid cell with three dimensions; input,
        # output and priority are all assigned to dimension 0, which is
        # marked non-recurrent only when a non_recurrent_fn is supplied.
        super(Grid3LSTMCell, self).__init__(
            num_units=num_units, num_dims=3,
            input_dims=0, output_dims=0, priority_dims=0, tied=tied,
            non_recurrent_dims=None if non_recurrent_fn is None else 0,
            cell_fn=lambda n, i: rnn_cell.LSTMCell(
                num_units=n, input_size=i, forget_bias=forget_bias,
                use_peepholes=use_peepholes),
            non_recurrent_fn=non_recurrent_fn)
The class is found in `from tensorflow.contrib.grid_rnn.python.ops import grid_rnn_cell`.
This is difficult to explain, so I've provided a drawing. Here is what I want it to do...
However the comment sounds like it isn't doing this. The comment makes it sound like the RNN is still a flat RNN, where the first dimension is outputting to, what is commonly called, the outputs variable (see below). The second dimension is outputting to the next step in the RNN, and the third dimension is outputting to the next hidden layer.
outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)
If this is the case, what is the point in having the first and second dimensions? Aren't they essentially the same thing? The BasicLSTMCell sends the output to the next step into outputs -- in other words they are one in the same.
Clarity?
For reference, here is my example code...
import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
from tensorflow.contrib.grid_rnn.python.ops import grid_rnn_cell
import numpy as np
#define parameters
learning_rate = 0.01
batch_size = 2
n_input_x = 10
n_input_y = 10
n_input_z = 10
n_hidden = 128
n_classes = 2
n_output = n_input_x * n_classes
x = tf.placeholder("float", [n_input_x, n_input_y, n_input_z])
y = tf.placeholder("float", [n_input_x, n_input_y, n_input_z, n_classes])
# One output projection per (y, z) position.
weights = {}
biases = {}
for i in range(n_input_y * n_input_z):
    weights[i] = tf.Variable(tf.random_normal([n_hidden, n_output]))
    biases[i] = tf.Variable(tf.random_normal([n_output]))

#generate random data
input_data = np.random.rand(n_input_x, n_input_y, n_input_z)
ground_truth = np.random.rand(n_input_x, n_input_y, n_input_z, n_classes)


#build GridLSTM
def GridLSTM_network(x):
    """Run the grid LSTM over ``x`` and project each step's output.

    Returns:
        A list with one tensor per (y, z) position, each produced by that
        position's own weight matrix and bias.
    """
    x = tf.reshape(x, [-1,n_input_x])
    # One sequence element per (y, z) position.
    x = tf.split(0, n_input_y * n_input_z, x)
    lstm_cell = grid_rnn_cell.Grid3LSTMCell(n_hidden)
    outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)
    output = []
    for i in range(n_input_y * n_input_z):
        output.append(tf.matmul(outputs[i], weights[i]) + biases[i])
    return output


#initialize network, cost, optimizer and all variables
pred = GridLSTM_network(x)
# import pdb
# pdb.set_trace()
pred = tf.pack(pred)
pred = tf.transpose(pred,[1,0,2])
pred= tf.reshape(pred, [-1, n_input_x, n_input_y, n_input_z, n_classes])
# Flatten predictions and targets to (positions, classes) for the loss.
temp_pred = tf.reshape(pred, [-1,n_classes])
temp_y = tf.reshape(y,[-1, n_classes])
cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(temp_pred, temp_y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Evaluate model
# NOTE(review): casting the difference to int32 maps every value strictly
# between -1 and 1 to 0, so almost every element counts as "correct" -
# confirm this is the intended metric.
correct_pred = tf.equal(0,tf.cast(tf.sub(tf.nn.sigmoid(temp_pred),temp_y), tf.int32))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.initialize_all_variables()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    step = 0
    # Trains until interrupted; there is no stopping condition.
    while 1:
        print(step)
        step = step + 1
        # pdb.set_trace
        sess.run(optimizer, feed_dict={x: input_data, y: ground_truth})

tf.IndexedSlicesValue when returned from tf.gradients()

I'm having the following problem, I have four embedding matrices and want to get the gradients of my loss function with respect to those matrices.
When I run the session to return the values for the gradients, two of those returned objects are of type tensorflow.python.framework.ops.IndexedSlicesValue, the other two are numpy arrays. Now for the numpy arrays, their shape corresponds to the shape of their corresponding embedding matrix, but I'm having problems with the IndexedSlicesValue objects.
If I call .values on one of those objects, I get an array whose shape does not match that of the gradient, the shape of the embedding matrix is [22,30], but calling .values on the IndexedSlicesValue object I get an array with shape [4200,30] ( The shape of my input tensor had dimensions of [30,20,7], the product of those dimensions equals 4200, not sure if this is relevant).
The IndexedSlicesValue object has an attribute called dense_shape, which is an array that holds the dimensions the gradient should have, i.e. array([22,30]) is value returned by .dense_shape.
I don't really understand the docs here: https://www.tensorflow.org/versions/r0.7/api_docs/python/state_ops.html#IndexedSlices
It says:
An IndexedSlices is typically used to represent a subset of a
larger tensor dense of shape [LARGE0, D1, .. , DN] where LARGE0 >> D0.
The values in indices are the indices in the first dimension of the
slices that have been extracted from the larger tensor.
So this array of shape (4200,30) is extracted from an array corresponding to an even larger, dense tensor?
What exactly is the gradient in this IndexedSlicesValue object and why does tensorflow automatically use this type for some gradients returned by tf.gradients()?
Here is my code:
input_tensor = tf.placeholder(tf.int32, shape = [None, memory_size, max_sent_length], name = 'Input')
q_tensor = tf.placeholder(tf.int32, shape = [None,max_sent_length], name = 'Question')
a_tensor = tf.placeholder(tf.float32, shape = [None,V+1], name = 'Answer')
# Embedding matrices
A_prior = tf.get_variable(name = 'A', shape = [V+1,d], initializer = tf.random_normal_initializer(stddev = 0.1))
# A is A_prior with its first row replaced by zeros.
A = tf.concat(0,[tf.zeros(shape = tf.pack([1,tf.shape(A_prior)[1]])),tf.slice(A_prior,[1,0],[-1,-1])])
B = tf.get_variable(name = 'B', shape = [V+1,d], initializer = tf.random_normal_initializer(stddev = 0.1))
C = tf.get_variable(name = 'C', shape = [V+1,d], initializer = tf.random_normal_initializer(stddev = 0.1))
W = tf.get_variable(name = 'W', shape = [V+1,d], initializer= tf.random_normal_initializer(stddev = 0.1))
embeddings = tf.reduce_sum(tf.nn.embedding_lookup(A,input_tensor),2)
u = tf.reshape(tf.reduce_sum(tf.nn.embedding_lookup(B,q_tensor),1),[-1,1,d])
test = tf.transpose(embeddings, perm = [0,2,1])
test_batch_mul = tf.squeeze(tf.batch_matmul(u,test))
# Push exact-zero scores to a large negative value before the softmax.
cond = tf.not_equal(test_batch_mul,0.0)
tt = tf.fill(tf.shape(test_batch_mul),-1000.0)
softmax_in = tf.select(cond, test_batch_mul, tt)
p_values = tf.nn.softmax(softmax_in)
c_values = tf.reduce_sum(tf.nn.embedding_lookup(C,input_tensor),2)
o = tf.squeeze(tf.batch_matmul(tf.expand_dims(p_values,1),c_values))
# Keep the raw scores separately: softmax_cross_entropy_with_logits
# applies the softmax itself, so it must receive the logits rather than
# the already normalised a_pred.
a_logits = tf.matmul(tf.squeeze(u)+o,tf.transpose(W))
a_pred = tf.nn.softmax(a_logits)
loss = tf.nn.softmax_cross_entropy_with_logits(a_logits, a_tensor, name = 'loss')
cost = tf.reduce_mean(loss)
global_step = tf.Variable(0,name = 'global_step', trainable= False)
#optimizer = tf.train.MomentumOptimizer(0.01,0.9)
vars_list = tf.trainable_variables()
# NOTE(review): B and C are passed straight to tf.nn.embedding_lookup, so
# presumably theirs are the gradients returned as IndexedSlices (one
# `.values` row per looked-up index) - confirm against grad_results.
grads = tf.gradients(cost, vars_list)
#train_op = optimizer.minimize( cost, global_step, vars_list, name = 'train_op')
sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)
input_feed = {input_tensor : phrases, q_tensor : questions, a_tensor : answers}
grad_results = sess.run(grads, feed_dict = input_feed)
I had the same issue, apparently IndexedSlices objects are automatically created for some Embedding matrices when computing their gradients,
If you want to access the gradients of the trainable variables of the Embedding, you need to convert the IndexedSlices to a tensor, by simply using:
tf.convert_to_tensor(gradients_of_the_embedding_layer)

Resources