Shape error when passed custom LSTM - machine-learning

I have been trying to write a custom LSTM layer so that I can extend it later. However, a shape error is raised at the layer applied right after my custom LSTM (the `TimeDistributed(Dense(200))` call shown in the traceback below).
My environment is:
win 10
keras 2.2.0
python 3.6
Traceback (most recent call last):
File "E:/PycharmProjects/dialogResearch/dialog/", line 60, in
model = build_model(word_dict, args.max_len, args.max_sents, args.embedding_dim)
File "E:\PycharmProjects\dialogResearch\dialog\model\",
line 177, in build_model
l_dense = TimeDistributed(Dense(200))(l_lstm)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\",
line 592, in call[0])
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\layers\",
line 162, in build
assert len(input_shape) >= 3
The code of my custom LSTM is:
class CustomLSTM(Layer):
    """Hand-written LSTM-style recurrent layer driven by ``K.rnn``.

    The layer owns four input kernels (``Wi``, ``Wf``, ``Wo``, ``Wu``), four
    recurrent kernels (``Ui``, ``Uf``, ``Uo``, ``Uu``) and four biases
    (``bi``, ``bf``, ``bo``, ``bu``), all created in ``build``.

    Args:
        output_dim: Size of the hidden state ``h`` and cell state ``c``.
        return_sequences: If true, ``call`` returns the per-timestep outputs;
            otherwise only the last output.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, output_dim, return_sequences, **kwargs):
        self.init = initializers.get('normal')
        # self.input_spec = [InputSpec(ndim=3)]
        self.output_dim = output_dim
        self.return_sequences = return_sequences
        super(CustomLSTM, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the kernels and biases; the input must be rank 3."""
        assert len(input_shape) == 3
        self.original_shape = input_shape
        # Input kernels: (input features, output_dim).
        self.Wi = self.add_weight('Wi', (input_shape[-1], self.output_dim), initializer=self.init, trainable=True)
        self.Wf = self.add_weight('Wf', (input_shape[-1], self.output_dim), initializer=self.init, trainable=True)
        self.Wo = self.add_weight('Wo', (input_shape[-1], self.output_dim), initializer=self.init, trainable=True)
        self.Wu = self.add_weight('Wu', (input_shape[-1], self.output_dim), initializer=self.init, trainable=True)
        # Recurrent kernels: (output_dim, output_dim).
        self.Ui = self.add_weight('Ui', (self.output_dim, self.output_dim), initializer=self.init, trainable=True)
        self.Uf = self.add_weight('Uf', (self.output_dim, self.output_dim), initializer=self.init, trainable=True)
        self.Uo = self.add_weight('Uo', (self.output_dim, self.output_dim), initializer=self.init, trainable=True)
        self.Uu = self.add_weight('Uu', (self.output_dim, self.output_dim), initializer=self.init, trainable=True)
        # Biases: (output_dim,).
        self.bi = self.add_weight('bi', (self.output_dim,), initializer=self.init, trainable=True)
        self.bf = self.add_weight('bf', (self.output_dim,), initializer=self.init, trainable=True)
        self.bo = self.add_weight('bo', (self.output_dim,), initializer=self.init, trainable=True)
        self.bu = self.add_weight('bu', (self.output_dim,), initializer=self.init, trainable=True)
        super(CustomLSTM, self).build(input_shape)

    def step_op(self, step_in, states):
        """Run one timestep.

        Args:
            step_in: Input slice for the current timestep.
            states: ``[h_prev, c_prev]`` from the previous step.

        Returns:
            ``(h, [h, c])`` in the form ``K.rnn`` expects from a step function.
        """
        # NOTE(review): the gates use softmax as written by the author; a
        # textbook LSTM uses sigmoid here - confirm this is intentional.
        i = K.softmax(K.dot(step_in, self.Wi) + K.dot(states[0], self.Ui) + self.bi)
        f = K.softmax(K.dot(step_in, self.Wf) + K.dot(states[0], self.Uf) + self.bf)
        o = K.softmax(K.dot(step_in, self.Wo) + K.dot(states[0], self.Uo) + self.bo)
        u = K.tanh(K.dot(step_in, self.Wu) + K.dot(states[0], self.Uu) + self.bu)
        c = i * u + f * states[1]
        h = o * K.tanh(c)
        return h, [h, c]

    def call(self, x, mask=None):
        """Unroll ``step_op`` over ``x`` starting from all-zero ``h`` and ``c``."""
        init_states = [K.zeros((K.shape(x)[0], self.output_dim)),
                       K.zeros((K.shape(x)[0], self.output_dim))]
        outputs = K.rnn(self.step_op, x, init_states)
        if self.return_sequences:
            # Index 1 of the K.rnn result holds the outputs for every timestep.
            return outputs[1]
        # Index 0 holds the output of the last timestep only.
        return outputs[0]

    def compute_output_shape(self, input_shape):
        # NOTE(review): kept exactly as posted in the question. This always
        # reports a rank-2 shape, even when return_sequences=True and call()
        # returns the whole sequence, and it uses the input feature size
        # rather than output_dim. The following TimeDistributed layer asserts
        # len(input_shape) >= 3, which matches the traceback.
        return input_shape[0], input_shape[-1]
The model is:
def build_model(words, max_len, max_sents, embedding_dim):
sentence_input = Input(shape=(max_len,), dtype='int32')
embedding_layer = Embedding(len(words) + 1,
embedded_sequences = embedding_layer(sentence_input)
l_lstm = CustomLSTM(200, return_sequences=True)(embedded_sequences)
l_dense = TimeDistributed(Dense(200))(l_lstm)
l_att = AttLayer()(l_dense)
sentEncoder = Model(sentence_input, l_att)
review_input = Input(shape=(max_sents, max_len), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = CustomLSTM(200, return_sequences=True)(review_encoder)
l_dense_sent = TimeDistributed(Dense(200))(l_lstm_sent)
l_att_sent = AttLayer()(l_dense_sent)
preds = Dense(3, activation='softmax')(l_att_sent)
model = Model(review_input, preds)
optimizer = Nadam(lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004)
metrics=[precision, recall, f1, 'acc'])
return model
Thanks for your help.

I presume the error occurs because the shape returned by compute_output_shape is incorrect when return_sequences=True: it drops the time dimension, so the next layer sees a rank-2 input. I would try the following:
def compute_output_shape(self, input_shape):
    """Report the shape this layer produces for a rank-3 ``input_shape``.

    Args:
        input_shape: ``(batch, timesteps, features)`` shape tuple of the input.

    Returns:
        ``(batch, timesteps, output_dim)`` when ``self.return_sequences`` is
        true, otherwise ``(batch, output_dim)``.
    """
    if self.return_sequences:
        # Keep the time axis, but the feature axis is the layer's own
        # output_dim, not the input feature size.
        return (input_shape[0], input_shape[1], self.output_dim)
    return (input_shape[0], self.output_dim)


Converting generative transformer model from keras to PyTorch

I would like to re-create the following keras model in PyTorch.
vocab_size = 22
maxlen = 200
embed_dim = 256
num_heads = 2
feed_forward_dim = 256
batch_size = 128
decoders = 5
def create_model():
inputs = layers.Input(shape=(maxlen,), dtype=tf.int32)
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
decoder_blocks = []
for i in range(decoders):
decoder_blocks.append(DecoderBlock(embed_dim, num_heads, feed_forward_dim))
for i in range(len(decoder_blocks)):
x = decoder_blocks[i](x)
outputs = layers.Dense(vocab_size)(x)
model = keras.Model(inputs=inputs, outputs=[outputs, x])
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
loss=[loss_fn, None],
return model
model = create_model()
Here are the Decoder and the TokenAndPositionEmbedding layers along with the Causal Attention Mask
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """Build a causal attention mask and tile it once per batch element.

    Entry ``(i, j)`` of the mask is true when ``i >= j - n_src + n_dest``;
    for ``n_dest == n_src`` that is the lower triangle including the diagonal.

    Args:
        batch_size: Number of copies of the mask to produce.
        n_dest: Number of rows of the mask.
        n_src: Number of columns of the mask.
        dtype: Type the boolean comparison result is cast to.

    Returns:
        The mask tiled to ``(batch_size, n_dest, n_src)``.
    """
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest
    mask = tf.cast(m, dtype)
    mask = tf.reshape(mask, [1, n_dest, n_src])
    # Tile multiples [batch_size, 1, 1]: repeat along the batch axis only.
    mult = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(mask, mult)
class DecoderBlock(layers.Layer):
    """Self-attention block with a causal mask followed by a feed-forward net.

    Args:
        embed_dim: Passed as the second argument of ``MultiHeadAttention`` and
            used as the width of the last feed-forward ``Dense`` layer.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden (ReLU) feed-forward layer.
        rate: Dropout rate applied after attention and after the feed-forward net.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(DecoderBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads, embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        """Apply masked self-attention and the feed-forward net to ``inputs``."""
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        # Square boolean mask so each position only attends to itself and
        # earlier positions.
        causal_mask = causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        attention_output = self.att(inputs, inputs, attention_mask=causal_mask)
        attention_output = self.dropout1(attention_output)
        # Add the block input back, then normalise.
        out1 = self.layernorm1(inputs + attention_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position embedding table.
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed ``x`` and add the embedding of each position index."""
        # Position ids 0 .. L-1, where L is the size of the last axis of x.
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        # Position lookup runs before the token lookup, as in the original.
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors
For reference, this code is copied directly from:
I have tried to create an equivalent architecture in PyTorch using nn.TransformerDecoderLayer. Apologies for not including my own code; my attempts so far have been completely unsuccessful.

'Not callable' error when calculating integrated gradient interpretability with captum

I’m trying to perform model interpretability with captum but running into an error. Specifically, it says:
/usr/lib/python3.7/ in _signature_from_callable(obj, follow_wrapper_chains, skip_bound_arg, sigcls)
2207 if not callable(obj):
-> 2208 raise TypeError('{!r} is not a callable object'.format(obj))
2210 if isinstance(obj, types.MethodType):
I’m not certain how to resolve this. Here’s the definition of my model, for reference:
class dvib(nn.Module):
def __init__(self,k,out_channels, hidden_size):
super(dvib, self).__init__()
self.conv = torch.nn.Conv2d(in_channels=1,
out_channels = out_channels,
kernel_size = (1,20),
self.rnn = torch.nn.GRU(input_size = out_channels,
hidden_size = hidden_size,
num_layers = 2,
bidirectional = True,
batch_first = True,
dropout = 0.2
self.fc1 = nn.Linear(hidden_size*4, hidden_size*4)
self.enc_mean = nn.Linear(hidden_size*4+578,k)
self.enc_std = nn.Linear(hidden_size*4+578,k)
self.dec = nn.Linear(k, 2)
nn.init.constant_(self.fc1.bias, 0.0)
nn.init.constant_(self.enc_mean.bias, 0.0)
nn.init.constant_(self.enc_std.bias, 0.0)
nn.init.constant_(self.dec.bias, 0.0)
def cnn_gru(self,x,lens):
x = x.unsqueeze(1)
print('after first unsqueeze: ', x.shape)
x = self.conv(x)
print('after conv: ', x.shape)
x = torch.nn.ReLU()(x)
print('shape after relu: ', x.shape,type(x))
x = x.squeeze(3)
print('shape after squeeze: ', x.shape)
x = x.view(x.size(0),-1)
x = x.permute(0,2,1)
print('shape after permute: ', x.shape)
gru_input = pack_padded_sequence(x,lens,batch_first=True, enforce_sorted=False)
output, hidden = self.rnn(gru_input)
print('hidden layer: ', hidden.shape)
output_all =[hidden[-1],hidden[-2],hidden[-3],hidden[-4]],dim=1)
return output_all
def forward(self, pssm, lengths, FEGS):
cnn_vectors = self.cnn_gru(pssm, lengths)
feature_vec =[cnn_vectors, FEGS], dim = 1)
enc_mean, enc_std = self.enc_mean(feature_vec), f.softplus(self.enc_std(feature_vec)-5)
eps = torch.randn_like(enc_std)
latent = enc_mean + enc_std*eps
outputs = f.sigmoid(self.dec(latent))
return outputs, enc_mean, enc_std, latent
I load pretrained weights into the model as well, prior to passing it to captum with the relevant arguments:
ig = IntegratedGradients(model(test_pssm_small, test_len_small, test_FEGS_small))
attr = ig.attribute(test_FEGS_small, n_steps=5)

Dimension error in neural network model for classification

Below is the code for a Hierarchical Attention Network, taken from an external source (the link is missing here). The only difference between the code at the link and mine is that I have 3 classes for classification, whereas they use 2.
maxlen = 100
max_sentences = 15
max_words = 20000
embedding_dim = 100
validation_split = 0.2
#class defining the custom attention layer
class HierarchicalAttentionNetwork(Layer):
    """Attention pooling over the time axis of a rank-3 input.

    Each timestep is scored with ``tanh(x W + b) u``; the scores are
    exponentiated, optionally masked, normalised over the time axis, and used
    to form a weighted sum of the timesteps.

    Args:
        attention_dim: Width of the hidden attention projection.
    """

    def __init__(self, attention_dim):
        # Call the base constructor first so that it cannot overwrite the
        # attributes set below (notably supports_masking).
        super(HierarchicalAttentionNetwork, self).__init__()
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim

    def build(self, input_shape):
        """Create the attention weights; the input must be rank 3."""
        assert len(input_shape) == 3
        # add_weight registers the variables with the layer so the optimizer
        # actually trains them.
        self.W = self.add_weight(name='W', shape=(input_shape[-1], self.attention_dim),
                                 initializer=self.init, trainable=True)
        self.b = self.add_weight(name='b', shape=(self.attention_dim,),
                                 initializer=self.init, trainable=True)
        self.u = self.add_weight(name='u', shape=(self.attention_dim, 1),
                                 initializer=self.init, trainable=True)
        super(HierarchicalAttentionNetwork, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        # NOTE(review): the incoming mask is passed on unchanged even though
        # call() sums away axis 1 - confirm downstream layers expect that.
        return mask

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # size of x :[batch_size, sel_len, attention_dim]
        # size of u :[batch_size, attention_dim]
        # uit = tanh(xW+b)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        ait = K.exp(K.squeeze(K.dot(uit, self.u), -1))
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting
            ait *= K.cast(mask, K.floatx())
        # Normalise over the time axis; epsilon guards against division by zero.
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        weighted_input = x * K.expand_dims(ait)
        output = K.sum(weighted_input, axis=1)
        return output

    def compute_output_shape(self, input_shape):
        # The time axis is summed out in call(): (batch, features).
        return input_shape[0], input_shape[-1]
# building Hierachical Attention network
# Start from a random matrix with one row per word index (plus row 0).
embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Words missing from the embedding index keep their random row.
        continue
    embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(
    len(word_index) + 1,
    embedding_dim,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=True,
    mask_zero=True,
)

# Word level: encode one sentence of maxlen word ids into a single vector.
sentence_input = Input(shape=(maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
lstm_word = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
attn_word = HierarchicalAttentionNetwork(100)(lstm_word)
sentenceEncoder = Model(sentence_input, attn_word)

# Sentence level: run the sentence encoder over every sentence of a review,
# then attend over the resulting sentence vectors.
review_input = Input(shape=(max_sentences, maxlen), dtype='int32')
review_encoder = TimeDistributed(sentenceEncoder)(review_input)
lstm_sentence = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
attn_sentence = HierarchicalAttentionNetwork(100)(lstm_sentence)

# Three-way softmax classifier on top of the review vector.
preds = Dense(3, activation='softmax')(attn_sentence)
model = Model(review_input, preds)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
print("model fitting - Hierachical attention network")
Following is the error I get. Please help me understand what the error means and how I can possibly resolve it.

dropout(): argument 'input' (position 1) must be Tensor, not tuple when using XLNet with Hugging Face

I get an error saying that the input should be of type Tensor, not tuple. I do not know how to work around this problem, as I am already implementing the return_dict=False method as stated in the migration plan.
My model is as follows:
# Classifier head on top of a pretrained XLNet encoder, exactly as posted in the question.
class XLNetClassifier(torch.nn.Module):
def __init__(self, dropout_rate=0.1):
super(XLNetClassifier, self).__init__()
# Encoder is loaded with return_dict=False.
self.XLNet = XLNetModel.from_pretrained('xlnet-base-cased', return_dict=False)
self.d1 = torch.nn.Dropout(dropout_rate)
self.l1 = torch.nn.Linear(768, 64)
# Despite the "bn" name this is a LayerNorm over the 64 features.
self.bn1 = torch.nn.LayerNorm(64)
self.d2 = torch.nn.Dropout(dropout_rate)
self.l2 = torch.nn.Linear(64, 3)
def forward(self, input_ids, attention_mask):
# NOTE(review): the keyword is spelled attention_masks while the parameter is attention_mask - verify which name the encoder expects.
x = self.XLNet(input_ids=input_ids, attention_masks = attention_mask)
# The whole encoder result is handed to dropout; the question reports the error on this call.
# NOTE(review): with return_dict=False the encoder presumably returns a tuple, not a tensor - confirm.
x = self.d1(x)
x = self.l1(x)
x = self.bn1(x)
x = torch.nn.Tanh()(x)
x = self.d2(x)
x = self.l2(x)
return x
The error occurs when calling the dropout.
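XLNetModel returns more than one output value; with return_dict=False they come back as a tuple.
That means `x` is a tuple rather than a single tensor, exactly as the error message says, so you have to pick the element you need (the first one, the last hidden state) before calling dropout. Your class definition should therefore be: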
from transformers import XLNetModel, XLNetTokenizerFast
import torch
class XLNetClassifier(torch.nn.Module):
    """Three-way classifier head on top of a pretrained XLNet encoder.

    The encoder is loaded with ``return_dict=False``, so its result is
    indexed with ``[0]`` in ``forward``.

    Args:
        dropout_rate: Dropout probability used before both linear layers.
    """

    def __init__(self, dropout_rate=0.1):
        super(XLNetClassifier, self).__init__()
        self.XLNet = XLNetModel.from_pretrained('xlnet-base-cased', return_dict=False)
        self.d1 = torch.nn.Dropout(dropout_rate)
        self.l1 = torch.nn.Linear(768, 64)
        # Despite the "bn" name this is a LayerNorm over the 64 features.
        self.bn1 = torch.nn.LayerNorm(64)
        self.d2 = torch.nn.Dropout(dropout_rate)
        self.l2 = torch.nn.Linear(64, 3)

    def forward(self, input_ids, attention_mask):
        """Encode the inputs and map the encoder output to 3 logits.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Mask forwarded to the encoder.

        Returns:
            Output of the final linear layer (last dimension 3).
        """
        # The encoder's keyword is attention_mask; a misspelled keyword would
        # not reach the model as the attention mask.
        x = self.XLNet(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the returned tuple; the keyed variant of this
        # model reads it as .last_hidden_state.
        x = self.d1(x[0])
        x = self.l1(x)
        x = self.bn1(x)
        x = torch.nn.Tanh()(x)
        x = self.d2(x)
        x = self.l2(x)
        return x
tokenizer = XLNetTokenizerFast.from_pretrained('xlnet-base-cased')
model = XLNetClassifier()
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt", return_token_type_ids=False)
outputs = model(**inputs)
or even better without return_dict=False
class XLNetClassifier(torch.nn.Module):
    """Three-way classifier head on top of a pretrained XLNet encoder.

    The encoder is loaded with its default output type, so ``forward`` reads
    the hidden states through the ``last_hidden_state`` attribute.

    Args:
        dropout_rate: Dropout probability used before both linear layers.
    """

    def __init__(self, dropout_rate=0.1):
        super(XLNetClassifier, self).__init__()
        self.XLNet = XLNetModel.from_pretrained('xlnet-base-cased')
        self.d1 = torch.nn.Dropout(dropout_rate)
        self.l1 = torch.nn.Linear(768, 64)
        # Despite the "bn" name this is a LayerNorm over the 64 features.
        self.bn1 = torch.nn.LayerNorm(64)
        self.d2 = torch.nn.Dropout(dropout_rate)
        self.l2 = torch.nn.Linear(64, 3)

    def forward(self, input_ids, attention_mask):
        """Encode the inputs and map the encoder output to 3 logits.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Mask forwarded to the encoder.

        Returns:
            Output of the final linear layer (last dimension 3).
        """
        # The encoder's keyword is attention_mask; a misspelled keyword would
        # not reach the model as the attention mask.
        x = self.XLNet(input_ids=input_ids, attention_mask=attention_mask)
        x = self.d1(x.last_hidden_state)
        x = self.l1(x)
        x = self.bn1(x)
        x = torch.nn.Tanh()(x)
        x = self.d2(x)
        x = self.l2(x)
        return x

Attribute Error : Can't pickle local object

I am building a chatbot with PyTorch. I developed it on a Linux system, where it worked flawlessly, but when I run the same model on Windows 10 I get the following error:
Traceback (most recent call last):
return self.wsgi_app(environ, start_response)
return super(_SocketIOMiddleware, self).call(environ,
return self.wsgi_app(environ, start_response)
response = self.handle_exception(e)
return cors_after_request(app.make_response(f(*args, **kwargs)))
reraise(exc_type, exc_value, tb)
raise value
response = self.full_dispatch_request()
rv = self.handle_user_exception(e)
return cors_after_request(app.make_response(f(*args, **kwargs)))
reraise(exc_type, exc_value, tb)
raise value
rv = self.dispatch_request()
return self.view_functionsrule.endpoint > load(candidateSkillset.get(name))
for (words, labels_net) in train_loader:
return _MultiProcessingDataLoaderIter(self)
self._popen = self._Popen(self)
return _default_context.get_context().Process._Popen(process_obj)
return Popen(process_obj)
reduction.dump(process_obj, to_child)
ForkingPickler(file, protocol).dump(obj)
AttributeError: Can't pickle local object 'load..ChatDataset'
exitcode = _main(fd, parent_sentinel)
self = reduction.pickle.load(from_parent)
EOFError: Ran out of input
I call the function with the dataset I want it to train on. Here is my code:
def load(inp):
global words,labels,doc_x,doc_y,questionsP1,questionsP2,questionsP3,questionsP4,model,questionTag, all_words, tags, xy, questions_list
questions_list = []
print( dataFileNames.get(inp) )
with open(dataFileNames.get(inp)) as file:
data = json.load(file)
for intent in data["intents"]:
for proficiency in intent["proficiency"]:
for questions in proficiency["questions"]:
for responses in questions["responses"]:
wrds = tokenize(responses)
xy.append((wrds, questions["question"]))
if questions["tag"] in tags:
if questions["tag"] not in tags:
if proficiency["level"] == "P1":
if proficiency["level"] == "P2":
if proficiency["level"] == "P3":
if proficiency["level"] == "P4":
#PyTorch Implementation
ignore_words = ['?', '!', '.', ',']
all_words = [stem(x) for x in all_words if x not in ignore_words]
all_words = sorted(set(all_words))
tags = sorted(set(tags))
X_train = []
y_train = []
for tokenized_response, question in xy:
bag = bag_of_words(tokenized_response, all_words)
label = questions_list.index( question)
X_train = np.array(X_train)
y_train = np.array(y_train)
class ChatDataset(Dataset):
def __init__(self):
self.n_samples = len(X_train)
self.x_data = X_train
self.y_data = y_train
def __getitem__(self, index):
return self.x_data[index], self.y_data[index]
def __len__(self):
return self.n_samples
batch_size = 8
hidden_size = 8
output_size = len(tags)
input_size = len(X_train[0])
learning_rate = 0.001
num_epochs = 1000
dataset = ChatDataset()
train_loader = DataLoader(dataset = dataset, batch_size=batch_size, shuffle = True, num_workers = 2)
device = 'cpu'
# print(device)
global model_main
model_main = NeuralNet(input_size, hidden_size, output_size).to(device)
print("Inside Try")
data = torch.load(modelNames.get(inp))
input_size = data["input_size"]
hidden_size = data["hidden_size"]
output_size = data["output_size"]
all_words = data["all_words"]
questions_list = data["questions_list"]
model_state = data["model_state"]
model_main = NeuralNet(input_size, hidden_size, output_size).to(device)
#loss and optimizer
print("Inside Except")
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_main.parameters(), lr = learning_rate)
for epoch in range(num_epochs):
for (words, labels_net) in train_loader:
words =
labels_net =
outputs = model_main(words)
loss = criterion(outputs, labels_net)
# print(loss)
#backward and optimizer step
if (epoch + 1) % 100 == 0:
print(f'epoch {epoch + 1}/ {num_epochs}, loss={loss.item(): .4f}')
print(f'final loss, loss={loss.item(): .4f}')
############### Accuracy Calculation ##############
correct = 0
total = 0
# print(device)
with torch.no_grad():
for words, labels_net in train_loader:
outputs = model_main(words)
_, predicted = torch.max(, 1)
total += labels_net.size(0)
correct += (predicted == labels_net).sum().item()
print('Accuracy : %d %%' % ( 100 * correct / total ))
#Saving the model
data = {
"model_state": model_main.state_dict(),
"input_size": input_size,
"output_size": output_size,
"hidden_size": hidden_size,
"all_words": all_words,
"questions_list": questions_list
FILE = modelNames.get(inp), FILE)
