I am working on a CNN for a color classification problem in PyTorch. This is the architecture of my CNN:
import torch
import torch.nn as nn
import torch.nn.functional as F2

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 15)

    def forward(self, x):
        x = self.pool(F2.relu(self.conv1(x)))
        x = self.pool(F2.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F2.relu(self.fc1(x))
        x = F2.relu(self.fc2(x))
        x = self.fc3(x)
        return x
When the images are resized to 32*32, the code works fine, but when they are resized to a different size, say 36*36 via transforms.Resize((36, 36)), it throws the following error:
RuntimeError: mat1 and mat2 shapes cannot be multiplied (4x576 and 400x120)
My question is: how do I adjust the CNN architecture (the layers and so on) when the input image size changes? Please help.
One way to achieve that is to make sure the spatial dimensions are always the same before you flatten the intermediate tensor, regardless of the input resolution, for example by using nn.AdaptiveAvgPool2d or nn.AdaptiveMaxPool2d. A concrete example would be:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16 * 5 * 5, 5)
        self.pool2 = nn.AdaptiveAvgPool2d((1, 1))  # (B, C, H, W) -> (B, C, 1, 1)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 15)

    def forward(self, x):
        x = self.pool1(F2.relu(self.conv1(x)))
        x = self.pool2(F2.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F2.relu(self.fc1(x))
        x = F2.relu(self.fc2(x))
        x = self.fc3(x)
        return x
To compensate for the information loss caused by compressing the spatial resolution (i.e. pooling), we usually need to increase the number of channels accordingly.
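As a quick sanity check (a minimal sketch; the 32*32 and 36*36 sizes are just the ones from the question), the adaptive-pooling version accepts both resolutions without changing fc1:

net = Net()
for size in (32, 36):
    dummy = torch.randn(4, 3, size, size)  # a batch of 4 RGB images at the given resolution
    out = net(dummy)
    print(size, out.shape)  # torch.Size([4, 15]) in both cases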
Below is the code for Hierarchical Attention Networks, taken from https://github.com/arunarn2/HierarchicalAttentionNetworks. The only difference between the code at that link and mine is that I have 3 classes for classification, whereas they use 2.
import numpy as np
from keras import backend as K
from keras import initializers
from keras.layers import Layer, Input, Embedding, GRU, Bidirectional, TimeDistributed, Dense
from keras.models import Model

maxlen = 100
max_sentences = 15
max_words = 20000
embedding_dim = 100
validation_split = 0.2
# word_index and embeddings_index come from the tokenizer / pretrained-embedding loading (not shown here)
# class defining the custom attention layer
class HierarchicalAttentionNetwork(Layer):
    def __init__(self, attention_dim):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim
        super(HierarchicalAttentionNetwork, self).__init__()

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
        self.b = K.variable(self.init((self.attention_dim,)))
        self.u = K.variable(self.init((self.attention_dim, 1)))
        self.trainable_weightss = [self.W, self.b, self.u]
        super(HierarchicalAttentionNetwork, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, mask=None):
        # size of x: [batch_size, sel_len, attention_dim]
        # size of u: [batch_size, attention_dim]
        # uit = tanh(xW + b)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        ait = K.exp(K.squeeze(K.dot(uit, self.u), -1))
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting
            ait *= K.cast(mask, K.floatx())
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        weighted_input = x * K.expand_dims(ait)
        output = K.sum(weighted_input, axis=1)
        return output

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]
# building Hierarchical Attention network
embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix],
                            input_length=maxlen, trainable=True, mask_zero=True)

sentence_input = Input(shape=(maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
lstm_word = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
attn_word = HierarchicalAttentionNetwork(100)(lstm_word)
sentenceEncoder = Model(sentence_input, attn_word)

review_input = Input(shape=(max_sentences, maxlen), dtype='int32')
review_encoder = TimeDistributed(sentenceEncoder)(review_input)
lstm_sentence = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
attn_sentence = HierarchicalAttentionNetwork(100)(lstm_sentence)
preds = Dense(3, activation='softmax')(attn_sentence)
model = Model(review_input, preds)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
print("model fitting - Hierarchical attention network")
Following is the error I get. Please help me understand what the error means and how I can possibly resolve it.
My VAE class looks like this:
import torch
import torch.nn as nn
import torch.nn.functional as F

# capacity and latent_dims are hyperparameters defined elsewhere (not shown here)

class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        c = capacity
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=c, kernel_size=4, stride=2, padding=1)  # out: c x 14 x 14
        self.conv2 = nn.Conv2d(in_channels=c, out_channels=c*2, kernel_size=4, stride=2, padding=1)  # out: c*2 x 7 x 7
        self.fc_mu = nn.Linear(in_features=c*2*7*7, out_features=latent_dims)
        self.fc_logvar = nn.Linear(in_features=c*2*7*7, out_features=latent_dims)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = x.view(x.size(0), -1)  # flatten batch of multi-channel feature maps to a batch of feature vectors
        x_mu = self.fc_mu(x)
        x_logvar = self.fc_logvar(x)
        return x_mu, x_logvar


class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        c = capacity
        self.fc = nn.Linear(in_features=latent_dims, out_features=c*2*7*7)
        self.conv2 = nn.ConvTranspose2d(in_channels=c*2, out_channels=c, kernel_size=4, stride=2, padding=1)
        self.conv1 = nn.ConvTranspose2d(in_channels=c, out_channels=1, kernel_size=4, stride=2, padding=1)

    def forward(self, x):
        x = self.fc(x)
        x = x.view(x.size(0), capacity*2, 7, 7)  # unflatten batch of feature vectors to a batch of multi-channel feature maps
        x = F.relu(self.conv2(x))
        x = torch.sigmoid(self.conv1(x))  # last layer before output is sigmoid, since we are using BCE as reconstruction loss
        return x


class VariationalAutoencoder(nn.Module):
    def __init__(self):
        super(VariationalAutoencoder, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()

    def forward(self, x):
        latent_mu, latent_logvar = self.encoder(x)
        latent = self.latent_sample(latent_mu, latent_logvar)
        x_recon = self.decoder(latent)
        return x_recon, latent_mu, latent_logvar

    def latent_sample(self, mu, logvar):
        if self.training:
            # the reparameterization trick
            std = logvar.mul(0.5).exp_()
            eps = torch.empty_like(std).normal_()
            return eps.mul(std).add_(mu)
        else:
            return mu
def vae_loss(recon_x, x, mu, logvar):
    # recon_x is the probability of a multivariate Bernoulli distribution p.
    # -log(p(x)) is then the pixel-wise binary cross-entropy.
    # Averaging or not averaging the binary cross-entropy over all pixels here
    # is a subtle detail with big effect on training, since it changes the weight
    # we need to pick for the other loss term by several orders of magnitude.
    # Not averaging is the direct implementation of the negative log likelihood,
    # but averaging makes the weight of the other loss term independent of the image resolution.
    recon_loss = F.binary_cross_entropy(recon_x.view(-1, 784), x.view(-1, 784), reduction='sum')
    kldivergence = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return recon_loss + variational_beta * kldivergence
I train it on the MNIST dataset.
I want to sample from it, i.e. generate an array, feed it to the decoder, and see what the output is.
The problem is that I don't really understand what my z array should look like and what shape it should have.
Here is the code for sampling:
z = ...
input = torch.FloatTensor(z).to(device)
vae.eval()
output = vae.decoder(input)
plot_gallery(output.data.cpu().numpy(), 24, 24, n_row=5, n_col=5)
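Since the decoder's first layer is nn.Linear(in_features=latent_dims, ...), z just needs to be a batch of latent vectors of shape (n_samples, latent_dims), and the prior implied by the reparameterization trick is a standard normal. A minimal sketch (the 25-sample count is only illustrative):

n_samples = 25
z = torch.randn(n_samples, latent_dims)  # one latent vector per generated image, drawn from N(0, I)
vae.eval()
with torch.no_grad():
    output = vae.decoder(z.to(device))   # shape: (n_samples, 1, 28, 28) for MNIST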
I wrote an RNN with an LSTM cell in PyCharm. The peculiarity of this network is that the output of the RNN is fed into an integration operation, computed with Runge-Kutta.
The integration takes some input and propagates it one step ahead in time. In order to do so I need to slice the feature tensor X along the batch dimension and pass the slices to the Runge-Kutta routine.
import torch

class MyLSTM(torch.nn.Module):
    def __init__(self, ni, no, sampling_interval, nh=10, nlayers=1):
        super(MyLSTM, self).__init__()
        self.device = torch.device("cpu")
        self.dtype = torch.float
        self.ni = ni
        self.no = no
        self.nh = nh
        self.nlayers = nlayers
        self.lstms = torch.nn.ModuleList(
            [torch.nn.LSTMCell(self.ni, self.nh)] + [torch.nn.LSTMCell(self.nh, self.nh) for i in range(nlayers - 1)])
        self.out = torch.nn.Linear(self.nh, self.no)
        self.do = torch.nn.Dropout(p=0.2)
        self.actfn = torch.nn.Sigmoid()
        self.sampling_interval = sampling_interval
        self.scaler_states = None

    # Options
    # description of the whole block
    def forward(self, x, h0, train=False, integrate_ode=True):
        x0 = x.clone().requires_grad_(True)
        hs = x  # initiate hidden state
        if h0 is None:
            h = torch.zeros(hs.shape[0], self.nh, device=self.device)
            c = torch.zeros(hs.shape[0], self.nh, device=self.device)
        else:
            (h, c) = h0

        # LSTM cells
        for i in range(self.nlayers):
            h, c = self.lstms[i](hs, (h, c))
            if train:
                hs = self.do(h)
            else:
                hs = h

        # Output layer
        # y = self.actfn(self.out(hs))
        y = self.out(hs)
        if integrate_ode:
            p = y
            y = self.integrate(x0, p)
        return y, (h, c)

    def integrate(self, x0, p):
        # RK4 steps per interval
        M = 4
        DT = self.sampling_interval / M
        X = x0
        # X = self.scaler_features.inverse_transform(x0)
        for b in range(X.shape[0]):
            xx = X[b, :]
            for j in range(M):
                k1 = self.ode(xx, p[b, :])
                k2 = self.ode(xx + DT / 2 * k1, p[b, :])
                k3 = self.ode(xx + DT / 2 * k2, p[b, :])
                k4 = self.ode(xx + DT * k3, p[b, :])
                xx = xx + DT / 6 * (k1 + 2 * k2 + 2 * k3 + k4)
            X_all[b, :] = xx
        return X_all

    def ode(self, x0, y):
        # Here I have a dynamic model
I get this error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor []], which is output 0 of SelectBackward, is at version 64; expected version 63 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
The problem is in the operations xx = X[b, :] and p[b, :]. I know that because if I choose a batch dimension of 1, I can replace those two expressions with xx = X and p, and then it works. How can I split the tensor without losing the gradient?
I had the same question, and after a lot of searching, I added the .detach() function after "h" and "c" in the RNN cell.
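The answer above does not show code, but one possible reading of it (an assumption on my part), applied to the forward loop in the question, is to detach the recurrent state before feeding it back into the cell, so that gradients no longer flow through h and c between steps; note that this also truncates backpropagation through the recurrent state:

# In forward(): assumed reading of the ".detach() after h and c" suggestion
for i in range(self.nlayers):
    h, c = self.lstms[i](hs, (h.detach(), c.detach()))
    hs = self.do(h) if train else h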
I have written the following neural network from scratch to classify 7-segment digits. However, the log loss doesn't drop beyond a certain point. I am a newbie; please suggest whether changes need to be made to the model, the training, or anything else. Am I missing something?
All images are 9x5 images created in OpenCV.
import cv2
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error, log_loss
from numpy import genfromtxt
import matplotlib.pyplot as plt
W1 = genfromtxt('W1.csv', delimiter=',')
W2 = genfromtxt('W2.csv', delimiter=',')
B1 = genfromtxt('B1.csv', delimiter=',')
B2 = genfromtxt('B2.csv', delimiter=',')
#initialise images as zero
zero = np.zeros((1,45))
one = np.zeros((1,45))
two = np.zeros((1,45))
three = np.zeros((1,45))
four = np.zeros((1,45))
five = np.zeros((1,45))
six = np.zeros((1,45))
seven = np.zeros((1,45))
eight = np.zeros((1,45))
nine = np.zeros((1,45))
def input_image():
    zero = cv2.imread('0.png', 0).reshape((1, 45))
    one = cv2.imread('1.png', 0).reshape((1, 45))
    two = cv2.imread('2.png', 0).reshape((1, 45))
    three = cv2.imread('3.png', 0).reshape((1, 45))
    four = cv2.imread('4.png', 0).reshape((1, 45))
    five = cv2.imread('5.png', 0).reshape((1, 45))
    six = cv2.imread('6.png', 0).reshape((1, 45))
    seven = cv2.imread('7.png', 0).reshape((1, 45))
    eight = cv2.imread('8.png', 0).reshape((1, 45))
    nine = cv2.imread('9.png', 0).reshape((1, 45))
input_image()
X_train = np.zeros((10,45))
X_train[0] = zero
X_train[1] = one
X_train[2] = two
X_train[3] = three
X_train[4] = four
X_train[5] = five
X_train[6] = six
X_train[7] = seven
X_train[8] = eight
X_train[9] = nine
Y_train = np.array([[1,0,0,0,0,0,0,0,0,0], [0,1,0,0,0,0,0,0,0,0], [0,0,1,0,0,0,0,0,0,0], [0,0,0,1,0,0,0,0,0,0], [0,0,0,0,1,0,0,0,0,0], [0,0,0,0,0,1,0,0,0,0], [0,0,0,0,0,0,1,0,0,0], [0,0,0,0,0,0,0,1,0,0], [0,0,0,0,0,0,0,0,1,0], [0,0,0,0,0,0,0,0,0,1]])
loss = []
#FF class
class FeedForward:
    def __init__(self, W1, W2, B1, B2):
        self.W1 = W1.copy()  # W1 -> (45, 7)
        self.W2 = W2.copy()  # W2 -> (7, 10)
        self.B1 = B1.copy()
        self.B2 = B2.copy()

    def sigmoid(self, X):
        return 1.0 / (1.0 + np.exp(-X))

    def softmax(self, X):
        exps = np.exp(X)
        return exps / np.sum(exps, axis=1).reshape(-1, 1)

    def forward_pass(self, X):
        self.A1 = np.matmul(X, self.W1) + self.B1  # (10, 45) * (45, 7) -> (10, 7)
        self.H1 = self.sigmoid(self.A1)  # (10, 7)
        self.A2 = np.matmul(self.H1, self.W2)  # (10, 7) * (7, 10) -> (10, 10)
        self.H2 = self.softmax(self.A2)  # (10, 10)
        return self.H2

    def grad_sigmoid(self, X):
        return X * (1 - X)

    def grad(self, X, Y):
        self.forward_pass(X)
        m = X.shape[0]
        self.dA2 = self.H2 - Y  # (10, 10) - (10, 10) -> (10, 10)
        self.dW2 = np.matmul(self.H1.T, self.dA2)  # (7, 10) * (10, 10) -> (7, 10)
        self.dB2 = np.sum(self.dA2, axis=0).reshape(1, -1)  # (10, 10) -> (1, 10)
        self.dH1 = np.matmul(self.dA2, self.W2.T)  # (10, 10) * (10, 7) -> (10, 7)
        self.dA1 = np.multiply(self.dH1, self.grad_sigmoid(self.H1))  # (10, 7) .* (10, 7) -> (10, 7)
        self.dW1 = np.matmul(X.T, self.dA1)  # (45, 10) * (10, 7) -> (45, 7)
        self.dB1 = np.sum(self.dA1, axis=0).reshape(1, -1)  # (10, 7) -> (1, 7)

    def fit(self, X, Y, epochs=1, learning_rate=1, display_loss=True):
        # if display_loss:
        #     loss = {}
        for i in range(epochs):
            self.grad(X, Y)  # X -> (10, 45), Y -> (10, 10)
            m = X.shape[0]
            self.W2 = self.W2 - learning_rate * (self.dW2)
            self.B2 = self.B2 - learning_rate * (self.dB2)
            self.W1 = self.W1 - learning_rate * (self.dW1)
            self.B1 = self.B1 - learning_rate * (self.dB1)
            if display_loss:
                Y_pred = self.predict(X)
                loss.append(log_loss(np.argmax(Y, axis=1), Y_pred))
        # if display_loss:
        #     plt.plot(loss.values())
        #     plt.xlabel('Epochs')
        #     plt.ylabel('Log Loss')
        #     plt.show()

    def predict(self, X):
        Y_pred = self.forward_pass(X)
        return np.array(Y_pred).squeeze()
p1 = FeedForward(W1, W2, B1, B2)
iter = 20
while (iter > 0):
    print("\nStarting epochs")
    p1.fit(X_train, Y_train, epochs=20, learning_rate=0.05)
    print("\nSaving Weights and Biases")
    iter = iter - 1
    np.savetxt("W1.csv", W1, delimiter=",")
    np.savetxt("W2.csv", W2, delimiter=",")
    np.savetxt("B1.csv", B1, delimiter=",")
    np.savetxt("B2.csv", B2, delimiter=",")

plt.plot(loss)
plt.xlabel('Epochs')
plt.ylabel('Log Loss')
plt.show()