PyTorch Boolean - Stop Backpropagation?

I need to create a neural network in which I use binary gates to zero out certain tensors (the outputs of disabled circuits).
To improve runtime speed, I was hoping to use torch.bool binary gates to stop backpropagation along the disabled circuits in the network. However, I created a small experiment using the official PyTorch example for the CIFAR-10 dataset, and the runtime speed is exactly the same for any values of gate_A and gate_B, which means the idea is not working:
import torch
import torch.nn as nn
import torch.nn.functional as F
from random import randint

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.pool = nn.MaxPool2d(2, 2)
        self.conv1a = nn.Conv2d(3, 6, 5)
        self.conv2a = nn.Conv2d(6, 16, 5)
        self.conv1b = nn.Conv2d(3, 6, 5)
        self.conv2b = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(32 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # Only one gate is supposed to be enabled at random
        # However, for the experiment, I fixed the values to [1,0] and [1,1]
        choice = randint(0, 1)
        gate_A = torch.tensor(choice, dtype=torch.bool)
        gate_B = torch.tensor(1 - choice, dtype=torch.bool)
        a = self.pool(F.relu(self.conv1a(x)))
        a = self.pool(F.relu(self.conv2a(a)))
        b = self.pool(F.relu(self.conv1b(x)))
        b = self.pool(F.relu(self.conv2b(b)))
        a *= gate_A
        b *= gate_B
        x = torch.cat([a, b], dim=1)
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
How can I define gate_A and gate_B in such a way that backpropagation effectively stops when they are zero?
PS. Changing the concatenation dynamically at runtime would also change which weights are assigned to each module (for example, the weights associated with a could be assigned to b in another pass, disrupting how the network operates).

You could use torch.no_grad() (the code below can probably be made more concise):
def forward(self, x):
    # Only one gate is supposed to be enabled at random
    # However, for the experiment, I fixed the values to [1,0] and [1,1]
    choice = randint(0, 1)
    gate_A = torch.tensor(choice, dtype=torch.bool)
    gate_B = torch.tensor(1 - choice, dtype=torch.bool)
    if choice:
        a = self.pool(F.relu(self.conv1a(x)))
        a = self.pool(F.relu(self.conv2a(a)))
        a *= gate_A
        with torch.no_grad():  # disable gradient computation
            b = self.pool(F.relu(self.conv1b(x)))
            b = self.pool(F.relu(self.conv2b(b)))
            b *= gate_B
    else:
        with torch.no_grad():  # disable gradient computation
            a = self.pool(F.relu(self.conv1a(x)))
            a = self.pool(F.relu(self.conv2a(a)))
            a *= gate_A
        b = self.pool(F.relu(self.conv1b(x)))
        b = self.pool(F.relu(self.conv2b(b)))
        b *= gate_B
    x = torch.cat([a, b], dim=1)
    x = torch.flatten(x, 1)  # flatten all dimensions except batch
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = self.fc3(x)
    return x
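Note that torch.no_grad() only stops autograd from recording the disabled branch; its forward computation still runs in full, so the saving is confined to the backward pass.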
On a second look, I think the following is a simpler solution to the specific problem:
def forward(self, x):
    # Only one gate is supposed to be enabled at random
    # However, for the experiment, I fixed the values to [1,0] and [1,1]
    choice = randint(0, 1)
    if choice:
        a = self.pool(F.relu(self.conv1a(x)))
        a = self.pool(F.relu(self.conv2a(a)))
        b = torch.zeros(shape_of_conv_output)  # replace shape of conv output here
    else:
        b = self.pool(F.relu(self.conv1b(x)))
        b = self.pool(F.relu(self.conv2b(b)))
        a = torch.zeros(shape_of_conv_output)  # replace shape of conv output here
    x = torch.cat([a, b], dim=1)
    x = torch.flatten(x, 1)  # flatten all dimensions except batch
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = self.fc3(x)
    return x
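One practical detail about the placeholder: torch.zeros defaults to a float32 CPU tensor, so the zeros should be created with the device and dtype of the live branch, for example (a hypothetical fill-in for shape_of_conv_output, shown for the first branch):

b = torch.zeros(a.shape, device=a.device, dtype=a.dtype)  # mirror with b.shape in the else branch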

Easy solution: simply define a tensor of zeros when a or b is disabled :)
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.pool = nn.MaxPool2d(2, 2)
        self.conv1a = nn.Conv2d(3, 6, 5)
        self.conv2a = nn.Conv2d(6, 16, 5)
        self.conv1b = nn.Conv2d(3, 6, 5)
        self.conv2b = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(32 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        if randint(0, 1):
            a = self.pool(F.relu(self.conv1a(x)))
            a = self.pool(F.relu(self.conv2a(a)))
            b = torch.zeros_like(a)
        else:
            b = self.pool(F.relu(self.conv1b(x)))
            b = self.pool(F.relu(self.conv2b(b)))
            a = torch.zeros_like(b)
        x = torch.cat([a, b], dim=1)
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
PS. I thought about this while I was having a coffee.
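To verify that the skipped branch really is cut out of the graph, here is a quick sanity check (my sketch, assuming the Net class directly above and the imports from the question):

net = Net()
x = torch.randn(4, 3, 32, 32)  # a dummy CIFAR-10-sized batch
net(x).sum().backward()

# only the branch that was actually executed received gradients, so exactly
# one of these prints True, depending on which branch randint() picked
print(net.conv1a.weight.grad is None, net.conv1b.weight.grad is None)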

Related

Adjustment of CNN Architecture when size of input image is changed

I am working on a CNN for a color classification problem in PyTorch. This is the architecture of my CNN:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 15)

    def forward(self, x):
        x = self.pool(F2.relu(self.conv1(x)))
        x = self.pool(F2.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F2.relu(self.fc1(x))
        x = F2.relu(self.fc2(x))
        x = self.fc3(x)
        return x
When images are resized to 32x32, the code works fine, but when they are resized to a different size, say 36x36 via transforms.Resize((36, 36)), it throws the following error:
RuntimeError: mat1 and mat2 shapes cannot be multiplied (4x576 and 400x120)
My question is how to adjust the CNN architecture (the layers and so on) when the input image size changes. Please help.
One way to achieve that is to make sure the spatial dimensions are always the same before you flatten the intermediate tensor, regardless of the input resolution, for example by using nn.AdaptiveAvgPool2d or nn.AdaptiveMaxPool2d. A concrete example:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16 * 5 * 5, 5)
        self.pool2 = nn.AdaptiveAvgPool2d((1, 1))  # (B, C, H, W) -> (B, C, 1, 1)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 15)

    def forward(self, x):
        x = self.pool1(F2.relu(self.conv1(x)))
        x = self.pool2(F2.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F2.relu(self.fc1(x))
        x = F2.relu(self.fc2(x))
        x = self.fc3(x)
        return x
To compensate for the information loss caused by spatial resolution compression (i.e. pooling), we usually need to increase the channel size accordingly.
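A quick way to see the effect (a sketch, assuming the Net class above and that torch, nn, and F2 are imported as in the question): whatever the input resolution, the adaptive pooling squeezes the feature map to 1x1, so the classifier head always sees 16 * 5 * 5 = 400 features.

net = Net()
for size in (32, 36, 48):  # different input resolutions
    out = net(torch.randn(4, 3, size, size))
    print(out.shape)  # torch.Size([4, 15]) every time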

Dimension error in neural network model for classification

Below is the code for Hierarchical Attention Networks, taken from https://github.com/arunarn2/HierarchicalAttentionNetworks. The only difference between the code at the link and mine is that I have 3 classes for classification, whereas they use 2.
maxlen = 100
max_sentences = 15
max_words = 20000
embedding_dim = 100
validation_split = 0.2

# class defining the custom attention layer
class HierarchicalAttentionNetwork(Layer):
    def __init__(self, attention_dim):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim
        super(HierarchicalAttentionNetwork, self).__init__()

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
        self.b = K.variable(self.init((self.attention_dim,)))
        self.u = K.variable(self.init((self.attention_dim, 1)))
        self.trainable_weights = [self.W, self.b, self.u]
        super(HierarchicalAttentionNetwork, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, mask=None):
        # size of x: [batch_size, sel_len, attention_dim]
        # size of u: [batch_size, attention_dim]
        # uit = tanh(xW + b)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        ait = K.exp(K.squeeze(K.dot(uit, self.u), -1))
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting
            ait *= K.cast(mask, K.floatx())
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        weighted_input = x * K.expand_dims(ait)
        output = K.sum(weighted_input, axis=1)
        return output

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

# building the Hierarchical Attention Network
embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in the embedding index will be all-zeros
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix],
                            input_length=maxlen, trainable=True, mask_zero=True)

sentence_input = Input(shape=(maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
lstm_word = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
attn_word = HierarchicalAttentionNetwork(100)(lstm_word)
sentenceEncoder = Model(sentence_input, attn_word)

review_input = Input(shape=(max_sentences, maxlen), dtype='int32')
review_encoder = TimeDistributed(sentenceEncoder)(review_input)
lstm_sentence = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
attn_sentence = HierarchicalAttentionNetwork(100)(lstm_sentence)
preds = Dense(3, activation='softmax')(attn_sentence)
model = Model(review_input, preds)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
print("model fitting - Hierachical attention network")
Following is the error I get. Please help me understand what the error means and how I can possibly resolve it.

Variational Autoencoder's sampling problem

My VAE class looks like this:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        c = capacity
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=c, kernel_size=4, stride=2, padding=1)    # out: c x 14 x 14
        self.conv2 = nn.Conv2d(in_channels=c, out_channels=c*2, kernel_size=4, stride=2, padding=1)  # out: c*2 x 7 x 7
        self.fc_mu = nn.Linear(in_features=c*2*7*7, out_features=latent_dims)
        self.fc_logvar = nn.Linear(in_features=c*2*7*7, out_features=latent_dims)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = x.view(x.size(0), -1)  # flatten batch of multi-channel feature maps to a batch of feature vectors
        x_mu = self.fc_mu(x)
        x_logvar = self.fc_logvar(x)
        return x_mu, x_logvar

class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        c = capacity
        self.fc = nn.Linear(in_features=latent_dims, out_features=c*2*7*7)
        self.conv2 = nn.ConvTranspose2d(in_channels=c*2, out_channels=c, kernel_size=4, stride=2, padding=1)
        self.conv1 = nn.ConvTranspose2d(in_channels=c, out_channels=1, kernel_size=4, stride=2, padding=1)

    def forward(self, x):
        x = self.fc(x)
        x = x.view(x.size(0), capacity*2, 7, 7)  # unflatten batch of feature vectors to a batch of multi-channel feature maps
        x = F.relu(self.conv2(x))
        x = torch.sigmoid(self.conv1(x))  # last layer before output is sigmoid, since we are using BCE as reconstruction loss
        return x

class VariationalAutoencoder(nn.Module):
    def __init__(self):
        super(VariationalAutoencoder, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()

    def forward(self, x):
        latent_mu, latent_logvar = self.encoder(x)
        latent = self.latent_sample(latent_mu, latent_logvar)
        x_recon = self.decoder(latent)
        return x_recon, latent_mu, latent_logvar

    def latent_sample(self, mu, logvar):
        if self.training:
            # the reparameterization trick
            std = logvar.mul(0.5).exp_()
            eps = torch.empty_like(std).normal_()
            return eps.mul(std).add_(mu)
        else:
            return mu

def vae_loss(recon_x, x, mu, logvar):
    # recon_x is the probability of a multivariate Bernoulli distribution p.
    # -log(p(x)) is then the pixel-wise binary cross-entropy.
    # Averaging or not averaging the binary cross-entropy over all pixels here
    # is a subtle detail with a big effect on training, since it changes the weight
    # we need to pick for the other loss term by several orders of magnitude.
    # Not averaging is the direct implementation of the negative log-likelihood,
    # but averaging makes the weight of the other loss term independent of the image resolution.
    recon_loss = F.binary_cross_entropy(recon_x.view(-1, 784), x.view(-1, 784), reduction='sum')
    kldivergence = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return recon_loss + variational_beta * kldivergence
I train it on the MNIST dataset.
I want to sample from it: generate an array, feed it to the decoder, and see what the output looks like.
The problem is that I don't really understand what my z array should look like and what shape it needs.
Here is the code for sampling:
z = ...
input = torch.FloatTensor(z).to(device)
vae.eval()
output = vae.decoder(input)
plot_gallery(output.data.cpu().numpy(), 24, 24, n_row=5, n_col=5)
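The latent_sample method above already pins down what z should look like: eps is drawn from a standard normal, so the prior over z is N(0, I) with latent_dims features per sample. A minimal sketch, reusing latent_dims, device, vae, and plot_gallery from the snippets above:

vae.eval()
with torch.no_grad():
    z = torch.randn(25, latent_dims, device=device)  # 25 samples from the prior
    output = vae.decoder(z)  # (25, 1, 28, 28) for the MNIST decoder above
plot_gallery(output.cpu().numpy(), 24, 24, n_row=5, n_col=5)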

PyTorch: slicing a tensor causes RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation

I wrote an RNN with LSTM cells in PyCharm. The peculiarity of this network is that the output of the RNN is fed into an integration operation, computed with Runge-Kutta.
The integration takes some input and propagates it one step ahead in time. In order to do so, I need to slice the feature tensor X along the batch dimension and pass that to the Runge-Kutta routine.
class MyLSTM(torch.nn.Module):
    def __init__(self, ni, no, sampling_interval, nh=10, nlayers=1):
        super(MyLSTM, self).__init__()
        self.device = torch.device("cpu")
        self.dtype = torch.float
        self.ni = ni
        self.no = no
        self.nh = nh
        self.nlayers = nlayers
        self.lstms = torch.nn.ModuleList(
            [torch.nn.LSTMCell(self.ni, self.nh)] +
            [torch.nn.LSTMCell(self.nh, self.nh) for i in range(nlayers - 1)])
        self.out = torch.nn.Linear(self.nh, self.no)
        self.do = torch.nn.Dropout(p=0.2)
        self.actfn = torch.nn.Sigmoid()
        self.sampling_interval = sampling_interval
        self.scaler_states = None

    # Options
    # description of the whole block
    def forward(self, x, h0, train=False, integrate_ode=True):
        x0 = x.clone().requires_grad_(True)
        hs = x  # initiate hidden state
        if h0 is None:
            h = torch.zeros(hs.shape[0], self.nh, device=self.device)
            c = torch.zeros(hs.shape[0], self.nh, device=self.device)
        else:
            (h, c) = h0
        # LSTM cells
        for i in range(self.nlayers):
            h, c = self.lstms[i](hs, (h, c))
            if train:
                hs = self.do(h)
            else:
                hs = h
        # Output layer
        # y = self.actfn(self.out(hs))
        y = self.out(hs)
        if integrate_ode:
            p = y
            y = self.integrate(x0, p)
        return y, (h, c)

    def integrate(self, x0, p):
        # RK4 steps per interval
        M = 4
        DT = self.sampling_interval / M
        X = x0
        # X = self.scaler_features.inverse_transform(x0)
        X_all = torch.zeros_like(X)  # preallocation assumed; not shown in the original snippet
        for b in range(X.shape[0]):
            xx = X[b, :]
            for j in range(M):
                k1 = self.ode(xx, p[b, :])
                k2 = self.ode(xx + DT / 2 * k1, p[b, :])
                k3 = self.ode(xx + DT / 2 * k2, p[b, :])
                k4 = self.ode(xx + DT * k3, p[b, :])
                xx = xx + DT / 6 * (k1 + 2 * k2 + 2 * k3 + k4)
            X_all[b, :] = xx
        return X_all

    def ode(self, x0, y):
        # Here I have a dynamic model
        ...
I get this error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor []], which is output 0 of SelectBackward, is at version 64; expected version 63 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
The problem is in the operations xx = X[b, :] and p[b, :]. I know that because when I choose a batch dimension of 1, I can replace those two expressions with xx = X and p, and it works. How can I split the tensor without losing the gradient?
I had the same question, and after a lot of searching, I fixed it by adding .detach() to "h" and "c" in the RNN cell.
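For reference, a minimal sketch of where that change would go in the forward() above (an illustration of the answer, not tested against the original code):

# LSTM cells: detach() returns a tensor cut off from the autograd graph,
# so gradients no longer flow back through the recurrent state
for i in range(self.nlayers):
    h, c = self.lstms[i](hs, (h.detach(), c.detach()))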

Error does not drop beyond certain limit in my Neural Network

I have written the following neural network from scratch to classify 7-segment digits. However, the log loss doesn't drop beyond a certain point. I am a newbie; please suggest whether changes need to be made to the model, the learning, or any other area. Am I missing something?
All images are 9x5 images created in OpenCV.
import cv2
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error, log_loss
from numpy import genfromtxt
import matplotlib.pyplot as plt

W1 = genfromtxt('W1.csv', delimiter=',')
W2 = genfromtxt('W2.csv', delimiter=',')
B1 = genfromtxt('B1.csv', delimiter=',')
B2 = genfromtxt('B2.csv', delimiter=',')

# initialise images as zeros
zero = np.zeros((1, 45))
one = np.zeros((1, 45))
two = np.zeros((1, 45))
three = np.zeros((1, 45))
four = np.zeros((1, 45))
five = np.zeros((1, 45))
six = np.zeros((1, 45))
seven = np.zeros((1, 45))
eight = np.zeros((1, 45))
nine = np.zeros((1, 45))

def input_image():
    # without this, the assignments below create locals and the zero arrays above are never overwritten
    global zero, one, two, three, four, five, six, seven, eight, nine
    zero = cv2.imread('0.png', 0).reshape((1, 45))
    one = cv2.imread('1.png', 0).reshape((1, 45))
    two = cv2.imread('2.png', 0).reshape((1, 45))
    three = cv2.imread('3.png', 0).reshape((1, 45))
    four = cv2.imread('4.png', 0).reshape((1, 45))
    five = cv2.imread('5.png', 0).reshape((1, 45))
    six = cv2.imread('6.png', 0).reshape((1, 45))
    seven = cv2.imread('7.png', 0).reshape((1, 45))
    eight = cv2.imread('8.png', 0).reshape((1, 45))
    nine = cv2.imread('9.png', 0).reshape((1, 45))

input_image()

X_train = np.zeros((10, 45))
X_train[0] = zero
X_train[1] = one
X_train[2] = two
X_train[3] = three
X_train[4] = four
X_train[5] = five
X_train[6] = six
X_train[7] = seven
X_train[8] = eight
X_train[9] = nine

Y_train = np.array([[1,0,0,0,0,0,0,0,0,0], [0,1,0,0,0,0,0,0,0,0], [0,0,1,0,0,0,0,0,0,0], [0,0,0,1,0,0,0,0,0,0], [0,0,0,0,1,0,0,0,0,0], [0,0,0,0,0,1,0,0,0,0], [0,0,0,0,0,0,1,0,0,0], [0,0,0,0,0,0,0,1,0,0], [0,0,0,0,0,0,0,0,1,0], [0,0,0,0,0,0,0,0,0,1]])
loss = []

# FF class
class FeedForward:
    def __init__(self, W1, W2, B1, B2):
        self.W1 = W1.copy()  # W1 -> (45, 7)
        self.W2 = W2.copy()  # W2 -> (7, 10)
        self.B1 = B1.copy()
        self.B2 = B2.copy()

    def sigmoid(self, X):
        return 1.0 / (1.0 + np.exp(-X))

    def softmax(self, X):
        exps = np.exp(X)
        return exps / np.sum(exps, axis=1).reshape(-1, 1)

    def forward_pass(self, X):
        self.A1 = np.matmul(X, self.W1) + self.B1  # (10, 45) * (45, 7) -> (10, 7)
        self.H1 = self.sigmoid(self.A1)            # (10, 7)
        self.A2 = np.matmul(self.H1, self.W2)      # (10, 7) * (7, 10) -> (10, 10)
        self.H2 = self.softmax(self.A2)            # (10, 10)
        return self.H2

    def grad_sigmoid(self, X):
        return X * (1 - X)

    def grad(self, X, Y):
        self.forward_pass(X)
        m = X.shape[0]
        self.dA2 = self.H2 - Y                                        # (10, 10) - (10, 10) -> (10, 10)
        self.dW2 = np.matmul(self.H1.T, self.dA2)                     # (7, 10) * (10, 10) -> (7, 10)
        self.dB2 = np.sum(self.dA2, axis=0).reshape(1, -1)            # (10, 10) -> (1, 10)
        self.dH1 = np.matmul(self.dA2, self.W2.T)                     # (10, 10) * (10, 7) -> (10, 7)
        self.dA1 = np.multiply(self.dH1, self.grad_sigmoid(self.H1))  # (10, 7) .* (10, 7) -> (10, 7)
        self.dW1 = np.matmul(X.T, self.dA1)                           # (45, 10) * (10, 7) -> (45, 7)
        self.dB1 = np.sum(self.dA1, axis=0).reshape(1, -1)            # (10, 7) -> (1, 7)

    def fit(self, X, Y, epochs=1, learning_rate=1, display_loss=True):
        # if display_loss:
        #     loss = {}
        for i in range(epochs):
            self.grad(X, Y)  # X -> (10, 45), Y -> (10, 10)
            m = X.shape[0]
            self.W2 = self.W2 - learning_rate * self.dW2
            self.B2 = self.B2 - learning_rate * self.dB2
            self.W1 = self.W1 - learning_rate * self.dW1
            self.B1 = self.B1 - learning_rate * self.dB1
            if display_loss:
                Y_pred = self.predict(X)
                loss.append(log_loss(np.argmax(Y, axis=1), Y_pred))
        # if display_loss:
        #     plt.plot(loss.values())
        #     plt.xlabel('Epochs')
        #     plt.ylabel('Log Loss')
        #     plt.show()

    def predict(self, X):
        Y_pred = self.forward_pass(X)
        return np.array(Y_pred).squeeze()
p1 = FeedForward(W1, W2, B1, B2)

iter = 20
while iter > 0:
    print("\nStarting epochs")
    p1.fit(X_train, Y_train, epochs=20, learning_rate=0.05)
    print("\nSaving Weights and Biases")
    iter = iter - 1
    # save the updated weights held by the model, not the original arrays loaded above
    np.savetxt("W1.csv", p1.W1, delimiter=",")
    np.savetxt("W2.csv", p1.W2, delimiter=",")
    np.savetxt("B1.csv", p1.B1, delimiter=",")
    np.savetxt("B2.csv", p1.B2, delimiter=",")

plt.plot(loss)
plt.xlabel('Epochs')
plt.ylabel('Log Loss')
plt.show()
