I am trying to build a CNN+RNN model for my project, but I got an error after concatenating the layers to give it as input to LSTM.
The model I try to build:
The error:
ValueError: Input 0 is incompatible with layer lstm_1: expected ndim=3, found ndim=2
with the following code:
# Hyper-parameters for the CNN + LSTM model assembled in build_model().

# Convolution settings.
nb_filter = 64
kernel_size1 = 3
kernel_size2 = 5

# Regularisation and optimisation.
dropout = 0.2
learning_rate = 1e-3

# Kernel initializer shared by the dense layers; the seed is fixed.
weights = initializers.TruncatedNormal(
    mean=0.0,
    stddev=0.1,
    seed=2,
)

# Recurrent and dense layer widths.
rnn_output_size = 128
hidden_dims = 512

# Architecture switches.
wider = True
deeper = True
# Build the two-input Keras model: a news-text branch (embedding, 1-D
# convolutions, dense layers) and a price-feature input, followed by an LSTM,
# dense layers and a single-unit output layer.
# NOTE(review): the Embedding, Input, Convolution1D, Dense and LSTM calls below
# are cut off after their first argument(s) in this paste; their remaining
# arguments and closing parentheses are not visible.
def build_model():
# News input: max_daily_length values per sample.
input_news = Input(shape=(max_daily_length,), name='News_Input')
embedding = Embedding(input_dim=vocabulary_size, # size of the vocabulary
# Price input: one value per selected feature.
input_price = Input(shape=(len(selected_features),),
# Text branch: dropout, two conv/max-pool stages, flatten, two dense layers.
x = Dropout(dropout)(embedding)
x = Convolution1D(filters=nb_filter,
x = MaxPooling1D(pool_size=2)(x)
x = Convolution1D(filters=nb_filter,
x = MaxPooling1D(pool_size=2)(x)
x = Flatten(name='flate_0')(x)
x = Dense(units=1024,
x = Dense(units=1024,
# NOTE(review): this joins the two raw Input tensors; the text-branch output x
# built above is not used here. Both inputs are 2-D (batch, features), so the
# result is 2-D as well, which would match the reported "expected ndim=3,
# found ndim=2" error if the LSTM below consumes model_concat - TODO confirm,
# the LSTM call's input is not visible.
model_concat = concatenate(inputs=[input_price, input_news], axis=-1)
lstm = LSTM(rnn_output_size,
model_concat = Dense(hidden_dims, kernel_initializer=weights)(lstm)
model_concat = Dropout(dropout)(model_concat)
# Optional extra dense stage. NOTE(review): indentation was lost, so which of
# the following lines belong to this branch cannot be determined from here.
if deeper == True:
model_concat = Dense(hidden_dims//2, kernel_initializer=weights)(model_concat)
model_concat = Dropout(dropout)(model_concat)
# Single-unit output; the model takes both inputs.
model_output = Dense(1, kernel_initializer=weights, name='output')(model_concat)
model = Model(inputs=[input_news, input_price], outputs=[model_output])
I would like to re-create the following keras model in PyTorch.
# Model configuration for the miniature GPT-style network.
vocab_size = 22  # token vocabulary size; also the width of the output logits
maxlen = 200  # number of tokens per input sequence
embed_dim = 256  # embedding width for tokens and positions
num_heads = 2  # attention heads in each decoder block
feed_forward_dim = 256  # hidden width of each block's feed-forward network
batch_size = 128  # not referenced by the model code shown here
decoders = 5  # how many DecoderBlock layers are stacked
def create_model():
    """Build and compile the decoder-only Transformer language model.

    Token ids of shape ``(maxlen,)`` are embedded (token + position), passed
    through ``decoders`` stacked ``DecoderBlock`` layers and projected onto
    ``vocab_size`` logits.

    Returns:
        A compiled ``keras.Model`` with two outputs: the logits and the
        hidden states produced by the last decoder block.
    """
    inputs = layers.Input(shape=(maxlen,), dtype=tf.int32)
    x = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(inputs)
    for _ in range(decoders):
        x = DecoderBlock(embed_dim, num_heads, feed_forward_dim)(x)
    outputs = layers.Dense(vocab_size)(x)
    model = keras.Model(inputs=inputs, outputs=[outputs, x])
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # Only the logits are trained against; the second output (hidden states)
    # gets no loss.
    # NOTE(review): the "adam" optimizer follows the Keras miniature-GPT
    # example this code is based on - confirm it matches the intended setup.
    model.compile("adam", loss=[loss_fn, None])
    return model


model = create_model()
Here are the Decoder and the TokenAndPositionEmbedding layers along with the Causal Attention Mask
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """Build a batched causal self-attention mask.

    Entry ``[b, i, j]`` is true when ``i >= j - n_src + n_dest``, i.e. when
    destination position ``i`` may attend to source position ``j``. With
    ``n_dest == n_src`` this is the lower triangle, so no position can see
    the positions after it.

    Args:
        batch_size: Scalar int32 tensor; how many times the mask is tiled.
        n_dest: Number of destination (query) positions.
        n_src: Number of source (key) positions.
        dtype: dtype the mask is cast to.

    Returns:
        Tensor of shape ``(batch_size, n_dest, n_src)``.
    """
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest
    mask = tf.cast(m, dtype)
    mask = tf.reshape(mask, [1, n_dest, n_src])
    # Tile multiples are [batch_size, 1, 1]; they are assembled with concat so
    # that batch_size can be a tensor rather than a Python int.
    mult = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(mask, mult)
class DecoderBlock(layers.Layer):
    """Transformer decoder block: causal self-attention then a feed-forward net.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    Args:
        embed_dim: Feature width of the block's input and output; also passed
            to the attention layer as its second positional argument.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads, embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        """Apply the block to ``inputs`` of shape ``(batch, seq_len, embed_dim)``."""
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        # Boolean mask restricting each position to itself and earlier ones.
        causal_mask = causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        attention_output = self.att(inputs, inputs, attention_mask=causal_mask)
        attention_output = self.dropout1(attention_output)
        # Residual connection + normalization around the attention sub-layer.
        out1 = self.layernorm1(inputs + attention_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        # Residual connection + normalization around the feed-forward sub-layer.
        return self.layernorm2(out1 + ffn_output)
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of distinct positions the position table can hold.
        vocab_size: Number of distinct token ids.
        embed_dim: Width of both embedding tables.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x``; positions run along the last axis of ``x``."""
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        # Position table is looked up before the token table, as in the
        # original, so the layers are built in the same order.
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        # The position vectors broadcast over the batch dimension.
        return token_vectors + position_vectors
For reference, this code is copied directly from: https://keras.io/examples/generative/text_generation_with_miniature_gpt/
I have tried to create equivalent architecture in PyTorch using nn.TransformerDecoderLayer. Apologies for not including my own code, but I have been completely unsuccessful.
Below is the code for Hierarchical Attention Networks, taken from https://github.com/arunarn2/HierarchicalAttentionNetworks. The only difference in the code on the link and mine is that I have 3 classes for classification, whereas they are using 2
# Data and embedding settings for the hierarchical attention network.
maxlen = 100  # words per sentence fed to the sentence encoder
max_sentences = 15  # sentences per review
max_words = 20000  # NOTE(review): presumably the vocabulary cap; unused in the code shown
embedding_dim = 100  # width of each word vector
validation_split = 0.2  # NOTE(review): presumably the held-out fraction; unused in the code shown
#class defining the custom attention layer
class HierarchicalAttentionNetwork(Layer):
    """Attention pooling over the time axis of a 3-D sequence tensor.

    For input ``x`` of shape ``(batch, seq_len, features)`` the layer computes
    ``u_t = tanh(x_t W + b)``, scores each step with ``exp(u_t . u)``,
    normalizes the scores over the sequence and returns the score-weighted
    sum of ``x`` with shape ``(batch, features)``.

    Args:
        attention_dim: Width of the hidden attention representation.
        **kwargs: Forwarded to the base ``Layer`` (e.g. ``name``).
    """

    def __init__(self, attention_dim, **kwargs):
        # Initialise the base Layer before assigning any attribute.
        super(HierarchicalAttentionNetwork, self).__init__(**kwargs)
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim

    def build(self, input_shape):
        """Create W, b and u for a 3-D ``input_shape``."""
        assert len(input_shape) == 3
        # Create the parameters through add_weight so Keras tracks them as
        # trainable weights; backend variables kept on an ad-hoc attribute are
        # not picked up by the optimizer.
        self.W = self.add_weight(
            name='W',
            shape=(input_shape[-1], self.attention_dim),
            initializer=self.init,
            trainable=True,
        )
        self.b = self.add_weight(
            name='b',
            shape=(self.attention_dim,),
            initializer=self.init,
            trainable=True,
        )
        self.u = self.add_weight(
            name='u',
            shape=(self.attention_dim, 1),
            initializer=self.init,
            trainable=True,
        )
        super(HierarchicalAttentionNetwork, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        # NOTE(review): the incoming mask is passed on unchanged although
        # call() sums away the sequence axis - confirm downstream layers
        # expect this.
        return mask

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # x: (batch, seq_len, features); W: (features, attention_dim);
        # b: (attention_dim,); u: (attention_dim, 1)
        # uit = tanh(xW + b)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        # Unnormalized attention score per step: (batch, seq_len)
        ait = K.exp(K.squeeze(K.dot(uit, self.u), -1))

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting
            ait *= K.cast(mask, K.floatx())
        # Normalize over the sequence; epsilon guards against division by zero.
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        weighted_input = x * K.expand_dims(ait)
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """The sequence axis is reduced: ``(batch, features)``."""
        return input_shape[0], input_shape[-1]
# building Hierachical Attention network
# Embedding matrix: every row starts with random values and the rows of words
# that have a pretrained vector are overwritten with that vector.
embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector: the row keeps its random initial values.
        continue
    embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(
    len(word_index) + 1,
    embedding_dim,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=True,
    mask_zero=True,
)

# Word level: encode one sentence of maxlen word ids into a single vector.
sentence_input = Input(shape=(maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
lstm_word = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
attn_word = HierarchicalAttentionNetwork(100)(lstm_word)
sentenceEncoder = Model(sentence_input, attn_word)

# Sentence level: apply the sentence encoder to each of the max_sentences
# sentences of a review, then encode the resulting sequence the same way.
review_input = Input(shape=(max_sentences, maxlen), dtype='int32')
review_encoder = TimeDistributed(sentenceEncoder)(review_input)
lstm_sentence = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
attn_sentence = HierarchicalAttentionNetwork(100)(lstm_sentence)

# Three-way softmax classifier on top of the review vector.
preds = Dense(3, activation='softmax')(attn_sentence)
model = Model(review_input, preds)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
print("model fitting - Hierachical attention network")
Following is the error I get. Please help me understand what the error means and how I can possibly resolve it.
I am trying to implement ResNet50 on a custom dataset using transfer learning, but I get this error:
ValueError: Input 0 of layer global_average_pooling2d_2 is incompatible with the layer: expected ndim=4, found ndim=2. Full shape received: [None, 2048]
Here's my code:
# Transfer-learning script: ImageNet-pretrained ResNet50 backbone with a new
# dense classification head, trained from a directory-based image generator.
img_height, img_width = (224, 224)
batch_size = 32
# NOTE(review): no directory argument is visible in this call - presumably it
# was lost in the paste.
train_generator = train_datagen.flow_from_directory(
target_size = (img_height, img_width),
batch_size = batch_size,
class_mode = 'categorical',
subset = 'training')
# pooling='avg' is requested from the backbone itself; per the reported error
# its output already has shape [None, 2048], i.e. it is 2-D.
base_model = ResNet50(include_top = False, weights = 'imagenet', pooling='avg')
x = base_model.output
# NOTE(review): this is the layer named in the reported ValueError - it expects
# a 4-D input but x is already 2-D. Either drop this line or remove
# pooling='avg' above.
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation = 'relu')(x)
# One softmax unit per class reported by the generator.
predictions = Dense(train_generator.num_classes, activation = 'softmax')(x)
model = Model(inputs = base_model.input, outputs = predictions)
# Mark every backbone layer as non-trainable before compiling.
# NOTE(review): the loop body below lost its indentation in the paste.
for layer in base_model.layers:
layer.trainable = False
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.fit(train_generator, epochs = 10)
I've set the include_top to False as suggested in some other answers. Where am I going wrong and how do I fix this?
We don't need GlobalAveragePooling2D in this case, try this code:
# pooling='avg' is already requested from the backbone, so its output is fed
# straight into the dense head without a separate pooling layer.
base_model = ResNet50(include_top=False, weights='imagenet', pooling='avg')
x = Dense(1024, activation='relu')(base_model.output)
predictions = Dense(
    train_generator.num_classes,
    activation='softmax',
)(x)
model = Model(inputs=base_model.input, outputs=predictions)
Image Transformation and Batch
# NOTE(review): the list of transforms is cut off in this paste - its entries
# and the closing "])" are not visible.
transform = transforms.Compose([
# Image dataset read from the given root folder, with `transform` applied.
data_set = datasets.ImageFolder(root="/content/drive/My Drive/models/pokemon/dataset",transform=transform)
# Shuffled batches of 10 samples, loaded with 6 worker processes.
train_loader = DataLoader(data_set,batch_size=10,shuffle=True,num_workers=6)
Below is my Model
class pokimonClassifier(nn.Module):
    """Small CNN classifier: two conv/pool stages followed by four linear layers.

    ``fc1`` is sized for ``23 * 23 * 18`` features. Each 3x3 convolution
    (stride 1, no padding) trims 2 pixels and each 2x2 pooling halves the
    size, so this matches e.g. 100x100 RGB inputs
    (100 -> 98 -> 49 -> 47 -> 23). The network outputs 149 class scores.
    """

    def __init__(self):
        # nn.Module must be initialised before sub-modules are assigned;
        # otherwise the assignments below raise AttributeError.
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 3, 1)
        self.conv2 = nn.Conv2d(6, 18, 3, 1)
        self.fc1 = nn.Linear(23 * 23 * 18, 520)
        self.fc2 = nn.Linear(520, 400)
        self.fc3 = nn.Linear(400, 320)
        self.fc4 = nn.Linear(320, 149)

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, 149)``."""
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        # Flatten per sample. Keeping the batch dimension explicit makes a
        # wrongly sized input fail in fc1 with a shape error instead of being
        # folded into a different batch size.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return F.log_softmax(self.fc4(x), dim=1)
Creating Instance of model, Use GPU, Set Criterion and optimizer
Here I first set lr = 0.001, then later changed it to 0.0001.
# The training loop moves every batch to the GPU, so the model's parameters
# have to live there as well; it is moved before the optimizer is created.
model = pokimonClassifier().to('cuda')
# Classification loss on the model's class scores and integer class labels.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
Training Dataset
for e in range(epochs):
    # Running count of correct predictions in this epoch.
    train_crt = 0
    for b, (train_x, train_y) in enumerate(train_loader):
        train_x, train_y = train_x.to('cuda'), train_y.to('cuda')

        # Forward pass and loss.
        y_preds = model(train_x)
        loss = criterion(y_preds, train_y)

        # Predicted class = index of the largest score per sample.
        predicted = torch.max(y_preds, 1)[1]
        correct = (predicted == train_y).sum()
        train_crt += correct

        # Report the loss every 50 batches.
        if b % 50 == 0:
            print(f'Epoch {e} batch{b} loss:{loss.item()} ')

        # Update weights and biases: clear the gradients left over from the
        # previous batch, backpropagate, then take one optimizer step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
My loss value stays between 3 and 4 and is not converging to 0.
I am super new to deep learning and I don't know much about it.
The dataset I am using is here: https://www.kaggle.com/thedagger/pokemon-generation-one
Any help will be much appreciated.
Thank You
The problem with your network is that you are applying softmax() twice - once at fc4() layer and once more while using nn.CrossEntropyLoss().
According to the official documentation, Pytorch takes care of softmax() while applying nn.CrossEntropyLoss().
So in your code, please change the first of the two lines below to the second:
x = F.log_softmax(self.fc4(x), dim=1)
x = self.fc4(x)
When I use a linear activation (or no activation) in the last discriminator layer together with weight clipping, the discriminator accuracy goes to 1 and the generator accuracy goes to 0. When I remove weight clipping, the generator accuracy goes to 1 and the discriminator accuracy goes to 0 after around 300 iterations. When I instead use a sigmoid activation as the last layer of the discriminator, then with clipping the generator accuracy goes to 1, and without clipping the generator loss gets stuck while the accuracies stay around 0.5, as they should.
NOTE - in all cases results are produced, and all of them show this warning: WARNING:tensorflow:Discrepancy between trainable weights and collected trainable weights, did you set model.trainable without calling model.compile after ?
The code is given below; please excuse the indentation, which was scrambled by copying and pasting:
# GAN wrapper that stores the discriminator/generator hyper-parameters and the
# training state used by the methods below.
class WGAN():
# NOTE(review): the parameter list of __init__ is cut off in this paste; the
# assignments below show which names it is expected to provide.
def __init__(self,
# Discriminator settings: input shape plus per-layer filter/kernel/stride
# lists, dropout rate and learning rate.
self.input_dim = input_dim
self.disc_filter = disc_filter
self.disc_kernel = disc_kernel
self.disc_strides = disc_strides
self.disc_dropout = disc_dropout
self.disc_lr = disc_lr
# Generator settings: per-layer filter/kernel/stride/upsample lists and
# learning rate.
self.gen_filter = gen_filter
self.gen_kernel = gen_kernel
self.gen_strides = gen_strides
self.gen_upsample = gen_upsample
self.gen_lr = gen_lr
# Length of the noise vector fed to the generator.
self.z_dim = z_dim
self.batch_size = batch_size
# Initializer passed as kernel_initializer to the Dense layers.
self.weight_init = RandomNormal(mean=0., stddev=0.02)
# Presumably loss histories (not appended to in the code shown), and the
# epoch counter that train() starts from.
self.d_losses = []
self.g_losses = []
self.epoch = 0
def wasserstein(self, y_true, y_pred):
    """Wasserstein loss: the negated mean of ``y_true * y_pred``.

    Args:
        y_true: Target tensor.
        y_pred: Predicted scores, same shape as ``y_true``.

    Returns:
        Scalar tensor ``-mean(y_true * y_pred)``.
    """
    signed_scores = y_true * y_pred
    return -K.mean(signed_scores)
def Discriminator(self):
    """Build the discriminator network and store it on ``self.discriminator``.

    One Conv2D -> LeakyReLU -> Dropout -> BatchNormalization stage is added
    per entry of ``self.disc_filter``; the result is flattened and mapped to
    a single sigmoid unit.
    """
    # NOTE(review): indentation was lost in the original paste; the four
    # layers below are taken to form the loop body and Flatten to follow the
    # loop - confirm.
    disc_input = Input(shape=self.input_dim, name='discriminator_input')
    features = disc_input
    for i, n_filters in enumerate(self.disc_filter):
        features = Conv2D(
            filters=n_filters,
            kernel_size=self.disc_kernel[i],
            strides=self.disc_strides[i],
            padding='same',
            name='disc_' + str(i),
        )(features)
        features = LeakyReLU()(features)
        features = Dropout(self.disc_dropout)(features)
        features = BatchNormalization()(features)
    features = Flatten()(features)
    disc_output = Dense(
        1,
        activation='sigmoid',
        kernel_initializer=self.weight_init,
    )(features)
    self.discriminator = Model(disc_input, disc_output)
# Build the generator (noise vector -> dense layer -> 7x7 feature map ->
# convolutional stages) and store the model on self.generator.
def Generator(self):
gen_input = Input(shape=(self.z_dim,), name='generator_input')
x = gen_input
# NOTE(review): batch_size is used here as the channel count of the 7x7
# feature map (see the Reshape below) - confirm this is intended.
x = Dense(7*7*self.batch_size, kernel_initializer = self.weight_init)(x)
x = LeakyReLU()(x)
x = BatchNormalization()(x)
x = Reshape(target_shape=(7,7,self.batch_size))(x)
for i in range(len(self.gen_filter)):
# NOTE(review): indentation was lost and an "else:" appears to be missing -
# presumably UpSampling2D + Conv2D is used when gen_upsample[i] == 2 and
# Conv2DTranspose otherwise. As written, both convolutions are given the same
# name 'gen_<i>'.
if self.gen_upsample[i]==2:
# Under Python 3, i/2 is float division, so the name ends in e.g. '0.5'.
x = UpSampling2D(size=self.gen_upsample[i], name='upsample_'+str(i/2))(x)
x = Conv2D(filters=self.gen_filter[i], kernel_size=self.gen_kernel[i], strides=self.gen_strides[i], padding='same', name='gen_'+str(i))(x)
x = Conv2DTranspose(filters=self.gen_filter[i], kernel_size=self.gen_kernel[i], strides=self.gen_strides[i], padding='same', name='gen_'+str(i))(x)
# NOTE(review): presumably every stage except the last gets batch norm +
# LeakyReLU and the last gets tanh; the "else:" is not visible - confirm.
if i<len(self.gen_filter)-1:
x = BatchNormalization()(x)
x = LeakyReLU()(x)
x = Activation("tanh")(x)
gen_output = x
self.generator = Model(gen_input, gen_output)
# Iterates over the layers of `model`.
# NOTE(review): the loop body is missing from this paste - presumably it
# assigns l.trainable = val; confirm against the original code.
def set_trainable(self, model, val):
for l in model.layers:
# Compile the discriminator on its own, then build and compile the combined
# generator -> discriminator model (self.model) that train_generator() uses.
def full_model(self):
self.discriminator.compile(optimizer= Adam(self.disc_lr), loss = self.wasserstein, metrics=['accuracy'])
# Presumably freezes the discriminator before it is embedded in the combined
# model (set_trainable's body is not visible).
self.set_trainable(self.discriminator, False)
# NOTE(review): the discriminator is compiled a second time here, after the
# set_trainable(..., False) call - confirm this is intended, since it may leave
# the stand-alone discriminator compiled in its frozen state.
self.discriminator.compile(optimizer= Adam(self.disc_lr), loss = self.wasserstein, metrics=['accuracy'])
# Combined model: noise -> generator -> discriminator score.
model_input = Input(shape=(self.z_dim,), name='model_input')
model_output = self.discriminator(self.generator(model_input))
self.model = Model(model_input, model_output)
# NOTE(review): the combined model uses disc_lr; gen_lr is stored in __init__
# but not used in the code shown - confirm which rate is intended.
self.model.compile(optimizer= Adam(self.disc_lr), loss = self.wasserstein, metrics=['accuracy'])
# Presumably restores the discriminator's trainable flags afterwards.
self.set_trainable(self.discriminator, True)
def train_generator(self, batch_size):
    """Run one training step of the combined model on fresh noise.

    Args:
        batch_size: Number of noise vectors to sample.

    Returns:
        Whatever ``self.model.train_on_batch`` returns for a batch of
        standard-normal noise of shape ``(batch_size, self.z_dim)`` paired
        with all-ones targets of shape ``(batch_size, 1)``.
    """
    latent = np.random.normal(0, 1, (batch_size, self.z_dim))
    targets = np.ones((batch_size, 1))
    return self.model.train_on_batch(latent, targets)
def train_discriminator(self, x_train, batch_size, using_generator):
    """Train the discriminator on one real and one generated batch.

    Args:
        x_train: Either an iterator whose items carry the image batch at
            index 0 (when ``using_generator`` is true) or an array indexed
            along its first axis.
        batch_size: Number of real and of generated samples per update.
        using_generator: Selects how ``x_train`` is read (see above).

    Returns:
        ``[d_loss, d_loss_real, d_loss_fake, d_acc, d_acc_real, d_acc_fake]``
        where the combined values are the means of the real and fake ones.
    """
    valid = np.ones((batch_size, 1))
    # NOTE(review): with the wasserstein loss -mean(y_true * y_pred), all-zero
    # targets make the fake-batch loss identically zero; WGAN critics are
    # usually trained with -1 targets for generated samples - confirm intent.
    fake = np.zeros((batch_size, 1))

    if using_generator:
        true_imgs = next(x_train)[0]
        # A batch of the wrong size is replaced by the next one.
        if true_imgs.shape[0] != batch_size:
            true_imgs = next(x_train)[0]
    else:
        # Random sample (with replacement) from the in-memory array.
        idx = np.random.randint(0, x_train.shape[0], batch_size)
        true_imgs = x_train[idx]

    noise = np.random.normal(0, 1, (batch_size, self.z_dim))
    gen_imgs = self.generator.predict(noise)

    d_loss_real, d_acc_real = self.discriminator.train_on_batch(true_imgs, valid)
    d_loss_fake, d_acc_fake = self.discriminator.train_on_batch(gen_imgs, fake)
    d_loss = 0.5 * (d_loss_real + d_loss_fake)
    d_acc = 0.5 * (d_acc_real + d_acc_fake)

    # Weight clipping: get_weights() returns copies, so the clipped arrays
    # have to be written back for the clipping to take effect.
    for layer in self.discriminator.layers:
        clipped = [np.clip(w, -0.01, 0.01) for w in layer.get_weights()]
        layer.set_weights(clipped)

    return [d_loss, d_loss_real, d_loss_fake, d_acc, d_acc_real, d_acc_fake]
def train(self, x_train, batch_size, epochs, print_every_n_batches = 50, using_generator = False):
for epoch in range(self.epoch, self.epoch + epochs):
d = self.train_discriminator(x_train, batch_size, using_generator)
g = self.train_generator(batch_size)
if self.epoch % print_every_n_batches == 0:
print ("%d [D loss: (%.3f)(R %.3f, F %.3f)] [D acc: (%.3f)(%.3f, %.3f)] [G loss: %.3f] [G acc: %.3f]" % (epoch, d[0], d[1], d[2], d[3], d[4], d[5], g[0], g[1]))