I merged a Vgg model and a Resnet model. The merged model will be used to extract features from frames of videos. The model is merged correctly, but it shows "AssertionError: Exception encountered when calling layer" while extracting features from frames. I am not finding any mistakes.
The code for extracting features from frames of video is mentioned below
def prepare_all_videos(df, root_dir):
num_samples = len(df)
video_paths = df["video_name"].values.tolist()
##take all classlabels from train_df column named 'tag' and store in labels
labels = df["tag"].values
#convert classlabels to label encoding
labels = label_processor(labels[..., None]).numpy()
# `frame_masks` and `frame_features` are what we will feed to our sequence model.
# `frame_masks` will contain a bunch of booleans denoting if a timestep is
# masked with padding or not.
frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
frame_features = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")
# For each video.
for idx, path in enumerate(video_paths):
# Gather all its frames and add a batch dimension.
frames = load_video(os.path.join(root_dir, path))
frames = frames[None, ...]
# Initialize placeholders to store the masks and features of the current video.
temp_frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
temp_frame_features = np.zeros(
shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
# Extract features from the frames of the current video.
for i, batch in enumerate(frames):
video_length = batch.shape[0]
length = min(MAX_SEQ_LENGTH, video_length)
for j in range(length):
temp_frame_features[i, j, :] = feature_extractor.predict(
batch[None, j, :]
temp_frame_mask[i, :length] = 1 # 1 = not masked, 0 = masked
frame_features[idx,] = temp_frame_features.squeeze()
frame_masks[idx,] = temp_frame_mask.squeeze()
return (frame_features, frame_masks), labels
train_data, train_labels = prepare_all_videos(train_df, "train")
test_data, test_labels = prepare_all_videos(test_df, "test")
print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")
print(f"train_labels in train set: {train_labels.shape}")
print(f"test_labels in train set: {test_labels.shape}")
The error shows
AssertionError: in user code:
AssertionError: Exception encountered when calling layer 'merged_model' (type Functional).
Could not compute output KerasTensor(type_spec=TensorSpec(shape=(None, 2049), dtype=tf.float32, name=None), name='concatenated_layer/concat:0', description="created by layer 'concatenated_layer'")
Call arguments received by layer 'merged_model' (type Functional):
• inputs=tf.Tensor(shape=(None, 224, 224, 3), dtype=float32)
• training=False
• mask=None
The main error is in following line
temp_frame_features[i, j, :] = feature_extractor.predict(
batch[None, j, :]
Please let me know where am I going wrong? I need a solution as soon as possible.


Resource Exhausted Error while Creating Image captioning model

I have used pre_trained vgg16 for cnn_part to get features of image (which I am not training) and defining the decoder class, which is trained through model. I don't know how resources are getting exhausted in just training decoder part, which I think is not too complex as vgg16. Here I am attaching all the relevant code .
Here is code for vgg16 -->
image_model = tf.keras.applications.VGG16(include_top=False,weights='imagenet' )
image_model.trainable = False
new_input = image_model.input # Any arbitrary shapes with 3 channels
hidden_layer = image_model.layers[-1].output
image_features_extract_model = tf.keras.Model(new_input, hidden_layer)
# https://www.tensorflow.org/tutorials/text/image_captioning
class VGG16_Encoder(tf.keras.Model):
# This encoder passes the features through a Fully connected layer
def __init__(self , cnn_model ):
super(VGG16_Encoder, self).__init__()
# shape after fc : (batch_size, 49, embedding_dim)
self.conv_base = cnn_model
#self.fc = tf.keras.layers.Dense(embedding_dim)
#self.dropout = tf.keras.layers.Dropout(0.5, noise_shape=None, seed=None)
def call(self, x):
#x = self.fc(x)
#x = tf.nn.relu(x)
x = self.conv_base(x)
x = tf.reshape(x , (BATCH_SIZE, 49 , 512))
return x
Here is the code of decoder --->
def rnn_type(units):
# If you have a GPU, we recommend using CuDNNGRU(provides a 3x speedup than GRU)
# the code automatically does that.
if tf.test.is_gpu_available():
return tf.compat.v1.keras.layers.CuDNNGRU(units,
return tf.keras.layers.GRU(units,
'''The encoder_output(i.e. 'features'), hidden_state(initialized to 0)(i.e. 'hidden') and
the decoder_input (which is the start token)(i.e. 'x') is passed to the decoder.'''
class Rnn_Local_Decoder(tf.keras.Model):
def __init__(self, embedding_dim, units, vocab_size):
self.units = units
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
self.gru = tf.keras.layers.GRU(self.units,
self.fc1 = tf.keras.layers.Dense(self.units)
self.dropout = tf.keras.layers.Dropout(0.5, noise_shape=None, seed=None)
self.batchnormalization = tf.keras.layers.BatchNormalization(axis=-1, momentum=0.99, epsilon=0.001, center=True, scale=True, beta_initializer='zeros', gamma_initializer='ones', moving_mean_initializer='zeros', moving_variance_initializer='ones', beta_regularizer=None, gamma_regularizer=None, beta_constraint=None, gamma_constraint=None)
self.fc2 = tf.keras.layers.Dense(vocab_size)
# Implementing Attention Mechanism
self.U_attn = tf.keras.layers.Dense(units)
self.W_attn = tf.keras.layers.Dense(units)
self.V_attn = tf.keras.layers.Dense(1)
def call(self, x, features, hidden):
# features : (batch_size,49,512) (Output from ENCODER)
# hidden : (batch_size, hidden_size) <==> (64,512)
# hidden_with_time_axis : (batch_size, 1, hidden_size) <==> (64,1,512)
hidden_with_time_axis = tf.expand_dims(hidden, 1)
# score shape : (64, 49, 1)
# Attention Function
'''e_ij = f( s_(t-1) , h_j )
e_ij = V_attn(T)*tanh(U_attn * h_j + W_attn * s_t )'''
score = self.V_attn(tf.nn.tanh(self.U_attn(features) + self.W_attn(hidden_with_time_axis)))
# self.Uattn(features) : (64,49,512)
# self.Wattn(hidden_with_time_axis) : (64,1,512)
# tf.nn.tanh(self.Uattn(features) + self.Wattn(hidden_with_time_axis)) : (64,49,512)
# self.Vattn(tf.nn.tanh(self.Uattn(features) + self.Wattn(hidden_with_time_axis))) : (64,49,1) ==> score
# you get 1 at the last axis because you are applying score to self.Vattn
# Then find Probability using Softmax
'''attention_weights(alpha_ij) = softmax(e_ij)'''
attention_weights = tf.nn.softmax(score, axis=1)
# attention_weights : (64, 49, 1)
# Give weights to the different pixels in the image
''' C(t) = Summation(j=1 to T) (attention_weights * VGG-16 features) '''
context_vector = attention_weights * features
context_vector = tf.reduce_sum(context_vector, axis=1)
# Context Vector(64,256) = AttentionWeights(64,49,1) * features(64,49,256)
# context_vector shape after sum : (64, 256) ---> doing ele_wise sum of features_vec (axis=1)
# x shape after passing through embedding : (64, 1, 256)
x = self.embedding(x)
# x shape after concatenation : (64, 1, 512)
x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
# passing the concatenated vector to the GRU
output, state = self.gru(x)
# shape == (batch_size, max_length, hidden_size)
x = self.fc1(output)
# x : (batch_size * max_length, hidden_size)
x = tf.reshape(x, (-1, x.shape[2]))
# Adding Dropout and BatchNorm Layers
x= self.dropout(x)
x= self.batchnormalization(x)
# output : (64 * 512)
x = self.fc2(x)
# shape : (64 * 8329(vocab))
return x, state, attention_weights
def reset_state(self, batch_size):
return tf.zeros((batch_size, self.units)) recurrent_initializer='glorot_uniform')
encoder = VGG16_Encoder(image_features_extract_model)
decoder = Rnn_Local_Decoder(embedding_dim, units, vocab_size)
Here is the training code --->
def train_step(img_tensor, target):
loss = 0
# initializing the hidden state for each batch
# because the captions are not related from image to image
hidden = decoder.reset_state(batch_size=target.shape[0])
dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * BATCH_SIZE, 1)
features = encoder(img_tensor)
with tf.GradientTape() as tape:
for i in range(1, max_len):
# passing the features through the decoder
predictions, hidden, _ = decoder(dec_input, features, hidden)
loss += loss_function(target[:, i], predictions)
# using teacher forcing
dec_input = tf.expand_dims(target[:, i], 1)
total_loss = (loss / int(target.shape[1]))
trainable_variables = decoder.trainable_variables
gradients = tape.gradient(loss, trainable_variables)
optimizer.apply_gradients(zip(gradients, trainable_variables))
return loss, total_loss
Here is the error --->
Node: 'gradient_tape/rnn__local__decoder_1/dense_6/MatMul_3/MatMul_1'
Sorry for uploading so much of code , but I feel that all is necessary to sort this issue.
Thanks in advance !!!
I tried to reduce the data from 40000 images to just 500 images , but then also same error stayed. I even tried to reduce batch size, embedding dim of decoder (512-->128) but all in vain.
Kindly help me fix this issue.

ValueError: 'logits' and 'labels' must have the same shape for NLP sentiment multi-class classifier

I am trying to make a NLP multi-class sentiment classifier where it takes in sentences as input and classifies them into three classes (negative, neutral and positive). However, when training the model, I run into the error where my logits (None, 3) are not the same size as my labels (None, 1) and the model can't begin training.
My model is a multi-class classifier and not a multi-label classifier since it is only predicting one label per object. I made sure that my last layer had an output of 3 and had the activation = 'softmax'. This should be correct from what I have searched online so I think that the problem lies with my labels.
Currently, my labels have a dimension of (None, 1) since I mapped each class to a unique integer and passed this as my test and train y values (which are in the form of one dimensional numpy array.
Right now I am confused if I have change the dimensions of this array to match the output dimensions and how to go about doing it.
import os
import sys
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.optimizers import SGD
device_name = tf.test.gpu_device_name()
if len(device_name) > 0:
print("Found GPU at: {}".format(device_name))
device_name = "/device:CPU:0"
print("No GPU, using {}.".format(device_name))
# Load dataset into a dataframe
train_data_path = "/content/drive/MyDrive/ML Datasets/tweet_sentiment_analysis/train.csv"
test_data_path = "/content/drive/MyDrive/ML Datasets/tweet_sentiment_analysis/test.csv"
train_df = pd.read_csv(train_data_path, encoding='unicode_escape')
test_df = pd.read_csv(test_data_path, encoding='unicode_escape').dropna()
sentiment_types = ('neutral', 'negative', 'positive')
train_df['sentiment'] = train_df['sentiment'].astype('category')
test_df['sentiment'] = test_df['sentiment'].astype('category')
train_df['sentiment_cat'] = train_df['sentiment'].cat.codes
test_df['sentiment_cat'] = test_df['sentiment'].cat.codes
train_y = np.array(train_df['sentiment_cat'])
test_y = np.array(test_df['sentiment_cat'])
# Function to convert df into a list of strings
def convert_to_list(df, x):
selected_text_list = []
labels = []
for index, row in df.iterrows():
return np.array(selected_text_list), np.array(labels)
train_sentences, train_labels = convert_to_list(train_df, 'selected_text')
test_sentences, test_labels = convert_to_list(test_df, 'text')
# Instantiate tokenizer and create word_index
tokenizer = Tokenizer(num_words=1000, oov_token='<oov>')
word_index = tokenizer.word_index
# Convert sentences into a sequence
train_sequence = tokenizer.texts_to_sequences(train_sentences)
test_sequence = tokenizer.texts_to_sequences(test_sentences)
# Padding sequences
pad_test_seq = pad_sequences(test_sequence, padding='post')
max_len = pad_test_seq[0].size
pad_train_seq = pad_sequences(train_sequence, padding='post', maxlen=max_len)
model = tf.keras.Sequential([
tf.keras.layers.Embedding(10000, 64, input_length=max_len),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(3, activation='softmax')
with tf.device(device_name):
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
num_epochs = 10
with tf.device(device_name):
history = model.fit(pad_train_seq, train_y, epochs=num_epochs, validation_data=(pad_test_seq, test_y), verbose=2)
Here is the error:
ValueError: `logits` and `labels` must have the same shape, received ((None, 3) vs (None, 1)).
my logits (None, 3) are not the same size as my labels (None, 1)
I made sure that my last layer had an output of 3 and had the activation = 'softmax'
my labels have a dimension of (None, 1) since I mapped each class to a unique integer
The key concept you are missing is that you need to one-hot encode your labels (after assigning integers to them - see below).
So your model, after the softmax, is spitting out three values: how probable each of your labels is. E.g. it might say A is 0.6, B is 0.1, and C is 0.3. If the correct answer is C, then it needs to see that correct answer as 0, 0, 1. It can then say that its prediction for A is 0.6 - 0 = +0.6 wrong, B is 0.1 - 0 = +0.1 wrong, and C is 0.3 - 1 = -0.7 wrong.
Theoretically you can go from a string label directly to a one-hot encoding. But it seems Tensorflow needs the labels to first be encoded as integers, and then that is one-hot encoded.
https://www.tensorflow.org/api_docs/python/tf/keras/layers/CategoryEncoding#examples says to use:
tf.keras.layers.CategoryEncoding(num_tokens=3, output_mode="one_hot")
Also see https://stackoverflow.com/a/69791457/841830 (the higher-voted answer there is from 2019, so applies to TensorFlow v1 I think). And searching for "tensorflow one-hot encoding" will bring up plenty of tutorials and examples.
The issue here was indeed due to the shape of my labels not being the same as logits. Logits were of shape (3) since they contained a float for the probability of each of the three classes that I wanted to predict. Labels were originally of shape (1) since it only contained one int.
To solve this, I used one-hot encoding which turned all labels into a shape of (3) and this solved the problem. Used the keras.utils.to_categorical() function to do so.
sentiment_types = ('negative', 'neutral', 'positive')
train_df['sentiment'] = train_df['sentiment'].astype('category')
test_df['sentiment'] = test_df['sentiment'].astype('category')
# Turning labels from strings to int
train_sentiment_cat = train_df['sentiment'].cat.codes
test_sentiment_cat = test_df['sentiment'].cat.codes
# One-hot encoding
train_y = to_categorical(train_sentiment_cat)
test_y = to_categorical(test_sentiment_cat)

Pytorch: "KeyError: Caught KeyError in DataLoader worker process 0."

Problem Description:
I am trying to load image data using Pytorch custom dataset. I did a little dive deep and found that my images set consist of 2 types of shape (512,512,3) and (1024,1024) . My assumption is, because of the above reason, it is throwing the below error.
Note: The code is able to read some of the images but, it is throwing the below error message for few of them. This was the reason to do a little EDA on the image data and found that there were 2 different shapes of images in the dataset.
Q1. How to preprocess such image data for training?
Q2. Is there any other reasons why I might be seeing the below error message?
Error message:
KeyError: 16481
from torchvision.io import read_image
import torch
from torchvision import transforms
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
class CustomImageDataset(Dataset):
# init
def __init__(self,dataset,transforms=None,target_transforms=None):
#self.train_data = pd.read_csv("Data/train_data.csv")
self.image_ids = dataset.image_id
self.image_labels = dataset.label
self.img_dir = 'Data/images'
self.transforms = transforms
self.target_transforms = target_transforms
# len
def __len__(self):
return len(self.image_ids)
# getitem
def __getitem__(self,idx):
# image path
img_path =os.path.join(self.img_dir,self.image_ids[idx])
# image
image = read_image(img_path)
label = self.image_labels[idx]
# transform image
if self.transforms:
image = self.transforms(image)
# transform target
if self.target_transforms:
label = self.target_transforms(label)
return image, label
Code: train_data is the pandas object of the csv file which has the image id, labesl information.
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(train_data, test_size=0.1, random_state=42)
train_df = CustomImageDataset(X_train)
train_dataloader = torch.utils.data.DataLoader(
found the issue with the code.
Pytorch Custom Dataloader function "getitem" uses idx to retrieve data and my guess is, it know the range of idx from len function, ex: 0, till len(rows in dataset).
In my case, I already had a panda dataset (train_data) with idx as one of the column. When I randomly split it into X_train and X_test, few of the data rows were moved to X_test along with the idx.
Now, when I send X_train to the custom dataloader, it is trying to get row's image_id with an idx and that idx just happens to be in X_test dataset. This lead to error as keyerror: 16481 i.e row with idx=16481 is not present in the X_train dataset. It was moved to X_test during split.
I got the same error while fine-tuning the DistilBertModel transformers-based model in PyTorch while replacing its head.
I've forgotten to reset the indices of train_dataframe and test_dataframe after train_test_split that caused my CustomDatasetto index improperly.

How can I fix the “TypeError: 'Tensor' object is not callable” error in Pytorch?

I am trying to compute a linear function an image's pixels, followed by log softmax (it's for a classification task). I am not sure how to do this without getting errors. Here is the relevant code:
torch.nn.functional.nll_loss(output, target) # error happens here
def __init__(self):
super(NetLin, self).__init__()
self.in_out = torch.nn.Linear(28, 2)
def forward(self, input):
out_sum = self.in_out(input)
output = torch.nn.LogSoftmax(out_sum)
return output
and the full error message I get is:
I have tried a few different solutions to this based on other answers online but they just result in different error messages. Clearly I am doing something fundamentally wrong here but I haven't used Pytorch before so I'm not sure what it is. Thank you
My code is now:
def train(args, model, device, train_loader, optimizer, epoch):
if args.net == 'lin':
model = NetLin()
loss = nn.NLLLoss()
for batch_idx, (data, target) in enumerate(train_loader):
data.requires_grad = True
data, target = data.to(device), target.to(device)
output = loss(model(input), target)
F.nll_loss(output, target)
if batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
class NetLin(nn.Module):
def __init__(self):
super(NetLin, self).__init__()
self.in_out = torch.nn.Linear(28 * 28, 2)
def forward(self, input):
input = input.view(-1, 28 * 28)
out_sum = self.in_out(input)
output = torch.nn.LogSoftmax(out_sum, dim=1)
return output
and my error message is now:
As you can kind of see the data and target are read in from a file (they are from KMNIST actually) so I can't control their format exactly, but I do know the image sizes are all [1,28,28], i.e. a 28*28 greyscale image. Also the batch size is 64 in case that matters.
Did you remember to set your model to training mode in your train loop with model.train()? Also, nll_loss takes in 2 tensors, but the first entry (the input tensor) needs to have requires_grad=True before it goes through the model, which is also why you need to set model.train() before training.
So you would have something like this:
model = NetLin()
loss = nn.NLLLoss()
input = torch.randn(7, 4, requires_grad=True) # your input image (tensor)
target = torch.tensor([1, 0]) # image label for image belonging to first class
output = loss(model(input), target)
I am also a bit concerned about your self.in_out = torch.nn.Linear(28, 2). This says that your linear layer is expecting 28 features, implying that your input images are either 7x4, 14x2 or 28x1, which doesn't seem right in my opinion? Aren't you using images of size 28x28 (very typical size in this context)? In which case, you would have your linear layer modified as self.in_out = torch.nn.Linear(28*28, 2), and your forward pass will have to be modified as follows:
def forward(self, input):
input = input.view(-1, 28*28)
out_sum = self.in_out(input)
output = torch.nn.LogSoftmax(out_sum)
return output

Invalid Argument error Expected begin[0] = 0

I am currently developing a neural network, and I got all the data and I got the code to the point that an image is being fed to the CNN for training. However, in the training process, for the first image an error pops up with the following code.
def convolutional_neural_network(x):
weights = {'W_conv1':tf.Variable(tf.random_normal([5,5,1,32])),
'out':tf.Variable(tf.random_normal([1024, n_classes]))}
biases = {'b_conv1':tf.Variable(tf.random_normal([32])),
x = tf.reshape(x, shape=[-1, 28, 28, 1])
conv1 = tf.nn.relu(conv2d(x, weights['W_conv1']) + biases['b_conv1'])
conv1 = maxpool2d(conv1)
conv2 = tf.nn.relu(conv2d(conv1, weights['W_conv2']) + biases['b_conv2'])
conv2 = maxpool2d(conv2)
fc = tf.reshape(conv2,[-1, 7*7*64])
fc = tf.nn.relu(tf.matmul(fc, weights['W_fc'])+biases['b_fc'])
fc = tf.nn.dropout(fc, keep_rate)
output = tf.matmul(fc, weights['out'])+biases['out']
return output
def shuffle_unison(images, labels):
shuffleLabel = []
shuffleImage = []
shuffleVector = []
for i in range(0, len(images)-1):
for i in range(0, len(shuffleVector)-1):
return shuffleImage, shuffleLabel
def train_neural_network(x):
prediction = convolutional_neural_network(x)
cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(prediction,y) )
optimizer = tf.train.AdamOptimizer().minimize(cost)
hm_epochs = 10
# step 4: Batching
with tf.Session() as sess:
init = tf.initialize_all_variables()
#array of strings and corresponding values
image_list, label_list = readImageLables()
for epoch in range(hm_epochs):
epoch_loss = 0
#shuffle every epoch
shuffle_image_list, shuffle_label_list = shuffle_unison(image_list, label_list)
sampleList = ['/home/sciencefair/Desktop/OrchardData/MachineLearningTesting/RottenOranges/result1.jpg']
for i in range(0,7683):
#filename_queue = tf.train.string_input_producer(sampleList)
file_contents = tf.read_file(shuffle_image_list[i])
image = tf.image.decode_jpeg(file_contents, channels=1)
resized_image = tf.image.resize_images(image, [28,28])
#image_batch, label_batch = tf.train.batch([resized_image, shuffle_label_list[i]], batch_size=batch_size) # does train.batch take individual images or final tensors
a = tf.reshape(resized_image,[1, 784])
_, c = sess.run([optimizer, cost], feed_dict={x: tf.reshape(resized_image,[1, 784]).eval(), y: shuffle_label_list[i]})
epoch_loss += c
print('Epoch', epoch, 'completed out of',hm_epochs,'loss:',epoch_loss)
The stack trace looked like this
InvalidArgumentError (see above for traceback): Expected begin[0] == 0 (got -1) and size[0] == 0 (got 1) when input.dim_size(0) == 0
[[Node: Slice_1 = Slice[Index=DT_INT32, T=DT_INT32, _device="/job:localhost/replica:0/task:0/cpu:0"](Shape_2, Slice_1/begin, Slice_1/size)]]
This error seems to originate from the data causing some confliction with the softmax function. However I have absolutely no idea what is causing this problem.
I followed this tutorial: Sentdex,
First pass through Data w/ 3D ConvNet
to build a 3D CNN and got the same error as yours here.
This error occurs because the dimension of the label vector of my input data (for example, the location of the first label vector in Sentdex's train data is train_data[0][1]) should be the same number as n_classes which in the tutorial is 2.
In my wrong try, I just use a binary value 0 or 1 to represent it, whose dimension is 1 where should be 2. So the tf.nn.softmax_cross_entropy_with_logits() function was confused by the wrong size of label vector.
Try expand your label vectors' dimension to be equal to your n_classes.
