I have a model that takes one image as input and it works fine. now I want to give one more transformed image with the same dimensions as the first one as input to the model. The model should learn from both images. The below code shows an error: "init() takes 2 positional arguments but 3 were given". I want to know why I have to give two inputs in the init function as well.
class MyNet(nn.Module):
def __init__(self, input_dim):
super(MyNet, self).__init__()
self.conv1 = nn.Conv2d(input_dim, nChannel, kernel_size=3, stride=1, padding=1)
self.bn1 = nn.BatchNorm2d(nChannel)
self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv2 = []
self.bn2 = []
self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
for i in range(nConv - 1):
self.conv2.append(nn.Conv2d(nChannel, nChannel, kernel_size=3, stride=1, padding=1))
self.conv3 = nn.Conv2d(nChannel, nChannel, kernel_size=1, stride=1, padding=0)
self.bn3 = nn.BatchNorm2d(nChannel)
self.UB1 = nn.UpsamplingBilinear2d(scale_factor=2)
self.deconv = nn.ConvTranspose2d(nChannel, nChannel, kernel_size=3, stride=1, padding=1)
def forward(self, x1, x2):
x1 = self.conv1(x1)
x1 = F.relu(x1)
x1 = self.bn1(x1)
x1 = self.pool1(x1)
for i in range(nConv - 1):
x1 = self.conv2[i](x1)
x1 = F.relu(x1)
x1 = self.bn2[i](x1)
if i == 0:
x1 = self.pool2(x1)
x1 = self.conv3(x1)
# x = F.relu(x)
x1 = self.bn3(x1)
x1 = self.UB1(x1)
x1 = self.deconv(x1)
x1 = F.relu(x1)
x1 = self.bn3(x1)
x1 = self.UB1(x1)
x1 = self.deconv(x1)
x1 = F.relu(x1)
x1 = self.bn3(x1)
x2 = self.conv1(x2)
x2 = F.relu(x2)
x2 = self.bn1(x2)
x2 = self.pool1(x2)
for i in range(nConv - 1):
x2 = self.conv2[i](x2)
x2 = F.relu(x2)
x2 = self.bn2[i](x2)
if i == 0:
x2 = self.pool2(x2)
x2 = self.conv3(x2)
# x = F.relu(x)
x2 = self.bn3(x2)
x2 = self.UB1(x2)
x2 = self.deconv(x2)
x2 = F.relu(x2)
x2 = self.bn3(x2)
x2 = self.UB1(x2)
x2 = self.deconv(x2)
x2 = F.relu(x2)
x2 = self.bn3(x2)
x =,x2)
return x

You've mentioned that 2 input images have the same size, so no need to pass 2 values on model instantiating (model = MyNet(input_dim)).
But forward then requires 2 inputs in your case it looks ok
# init passing only single value `input_dim`
model = MyNet(input_dim)
# pseudo code for training stage
for batch in data_loader:
img1, img2 = batch
output = model(img1, img2)


Pytorch, slicing tensor causes RuntimeError:: one of the variables needed for gradient computation has been modified by an inplace operation:

I wrote a RNN with LSTM cell with Pycharm. The peculiarity of this network is that the output of the RNN is fed into a integration opeartion, computed with Runge-kutta.
The integration takes some input and propagate that in time one step ahead. In order to do so I need to slice the feature tensor X along the batch dimension, and pass this to the Runge-kutta.
class MyLSTM(torch.nn.Module):
def __init__(self, ni, no, sampling_interval, nh=10, nlayers=1):
super(MyLSTM, self).__init__()
self.device = torch.device("cpu")
self.dtype = torch.float = ni = no
self.nh = nh
self.nlayers = nlayers
self.lstms = torch.nn.ModuleList(
[torch.nn.LSTMCell(, self.nh)] + [torch.nn.LSTMCell(self.nh, self.nh) for i in range(nlayers - 1)])
self.out = torch.nn.Linear(self.nh, = torch.nn.Dropout(p=0.2)
self.actfn = torch.nn.Sigmoid()
self.sampling_interval = sampling_interval
self.scaler_states = None
# Options
# description of the whole block
def forward(self, x, h0, train=False, integrate_ode=True):
x0 = x.clone().requires_grad_(True)
hs = x # initiate hidden state
if h0 is None:
h = torch.zeros(hs.shape[0], self.nh, device=self.device)
c = torch.zeros(hs.shape[0], self.nh, device=self.device)
(h, c) = h0
# LSTM cells
for i in range(self.nlayers):
h, c = self.lstms[i](hs, (h, c))
if train:
hs =
hs = h
# Output layer
# y = self.actfn(self.out(hs))
y = self.out(hs)
if integrate_ode:
p = y
y = self.integrate(x0, p)
return y, (h, c)
def integrate(self, x0, p):
# RK4 steps per interval
M = 4
DT = self.sampling_interval / M
X = x0
# X = self.scaler_features.inverse_transform(x0)
for b in range(X.shape[0]):
xx = X[b, :]
for j in range(M):
k1 = self.ode(xx, p[b, :])
k2 = self.ode(xx + DT / 2 * k1, p[b, :])
k3 = self.ode(xx + DT / 2 * k2, p[b, :])
k4 = self.ode(xx + DT * k3, p[b, :])
xx = xx + DT / 6 * (k1 + 2 * k2 + 2 * k3 + k4)
X_all[b, :] = xx
return X_all
def ode(self, x0, y):
# Here I a dynamic model
I get this error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor []], which is output 0 of SelectBackward, is at version 64; expected version 63 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
the problem is in the operations xx = X[b, :] and p[b,:]. I know that because I choose batch dimension of 1, then I can replace the previous two equations with xx=X and p, and this works. How can split the tensor without loosing the gradient?
I had the same question, and after a lot of searching, I added .detach() function after "h" and "c" in the RNN cell.

Wasserstein GAN problem with last discriminator layer and clipping

When I use linear or No activation in the last Discriminator layer using weight clipping Discriminator accuracy goes to 1 and Generator goes to 0. In case when I remove weight clipping, Generator accuracy goes to 1 and discriminator goes to 0 around 300 iterations. But when I use sigmoid activation as the last layer in the discriminator with clipping Generator accuracy goes to 1 and without clipping the generator loss get stuck while accuracies going as they should around 0.5.
NOTE - in all cases, results are produced and all of the show WARNING:tensorflow:Discrepancy between trainable weights and collected trainable weights, did you set model.trainable without calling model.compile after ?
Code is given here, please do not mind the indentation on copying and pasting it's everywhere -
class WGAN():
def __init__(self,
self.input_dim = input_dim
self.disc_filter = disc_filter
self.disc_kernel = disc_kernel
self.disc_strides = disc_strides
self.disc_dropout = disc_dropout
self.disc_lr = disc_lr
self.gen_filter = gen_filter
self.gen_kernel = gen_kernel
self.gen_strides = gen_strides
self.gen_upsample = gen_upsample
self.gen_lr = gen_lr
self.z_dim = z_dim
self.batch_size = batch_size
self.weight_init = RandomNormal(mean=0., stddev=0.02)
self.d_losses = []
self.g_losses = []
self.epoch = 0
def wasserstein(self, y_true, y_pred):
return -K.mean(y_true * y_pred)
def Discriminator(self):
disc_input = Input(shape=self.input_dim, name='discriminator_input')
x = disc_input
for i in range(len(self.disc_filter)):
x = Conv2D(filters=self.disc_filter[i], kernel_size=self.disc_kernel[i], strides=self.disc_strides[i], padding='same', name='disc_'+str(i))(x)
x = LeakyReLU()(x)
x = Dropout(self.disc_dropout)(x)
x = BatchNormalization()(x)
x = Flatten()(x)
disc_output = Dense(1, activation='sigmoid', kernel_initializer = self.weight_init)(x)
self.discriminator = Model(disc_input, disc_output)
def Generator(self):
gen_input = Input(shape=(self.z_dim,), name='generator_input')
x = gen_input
x = Dense(7*7*self.batch_size, kernel_initializer = self.weight_init)(x)
x = LeakyReLU()(x)
x = BatchNormalization()(x)
x = Reshape(target_shape=(7,7,self.batch_size))(x)
for i in range(len(self.gen_filter)):
if self.gen_upsample[i]==2:
x = UpSampling2D(size=self.gen_upsample[i], name='upsample_'+str(i/2))(x)
x = Conv2D(filters=self.gen_filter[i], kernel_size=self.gen_kernel[i], strides=self.gen_strides[i], padding='same', name='gen_'+str(i))(x)
x = Conv2DTranspose(filters=self.gen_filter[i], kernel_size=self.gen_kernel[i], strides=self.gen_strides[i], padding='same', name='gen_'+str(i))(x)
if i<len(self.gen_filter)-1:
x = BatchNormalization()(x)
x = LeakyReLU()(x)
x = Activation("tanh")(x)
gen_output = x
self.generator = Model(gen_input, gen_output)
def set_trainable(self, model, val):
for l in model.layers:
def full_model(self):
self.discriminator.compile(optimizer= Adam(self.disc_lr), loss = self.wasserstein, metrics=['accuracy'])
self.set_trainable(self.discriminator, False)
self.discriminator.compile(optimizer= Adam(self.disc_lr), loss = self.wasserstein, metrics=['accuracy'])
model_input = Input(shape=(self.z_dim,), name='model_input')
model_output = self.discriminator(self.generator(model_input))
self.model = Model(model_input, model_output)
self.model.compile(optimizer= Adam(self.disc_lr), loss = self.wasserstein, metrics=['accuracy'])
self.set_trainable(self.discriminator, True)
def train_generator(self, batch_size):
valid = np.ones((batch_size,1))
noise = np.random.normal(0, 1, (batch_size, self.z_dim))
return self.model.train_on_batch(noise, valid)
def train_discriminator(self, x_train, batch_size, using_generator):
valid = np.ones((batch_size,1))
fake = np.zeros((batch_size,1))
if using_generator:
true_imgs = next(x_train)[0]
if true_imgs.shape[0] != batch_size:
true_imgs = next(x_train)[0]
idx = np.random.randint(0, x_train.shape[0], batch_size)
true_imgs = x_train[idx]
noise = np.random.normal(0, 1, (batch_size, self.z_dim))
gen_imgs = self.generator.predict(noise)
d_loss_real, d_acc_real = self.discriminator.train_on_batch(true_imgs, valid)
d_loss_fake, d_acc_fake = self.discriminator.train_on_batch(gen_imgs, fake)
d_loss = 0.5 * (d_loss_real + d_loss_fake)
d_acc = 0.5 * (d_acc_real + d_acc_fake)
for l in self.discriminator.layers:
weights = l.get_weights()
weights = [np.clip(w, -0.01, 0.01) for w in weights]
return [d_loss, d_loss_real, d_loss_fake, d_acc, d_acc_real, d_acc_fake]
def train(self, x_train, batch_size, epochs, print_every_n_batches = 50, using_generator = False):
for epoch in range(self.epoch, self.epoch + epochs):
d = self.train_discriminator(x_train, batch_size, using_generator)
g = self.train_generator(batch_size)
if self.epoch % print_every_n_batches == 0:
print ("%d [D loss: (%.3f)(R %.3f, F %.3f)] [D acc: (%.3f)(%.3f, %.3f)] [G loss: %.3f] [G acc: %.3f]" % (epoch, d[0], d[1], d[2], d[3], d[4], d[5], g[0], g[1]))

Gradient Descent cost function explosion

I am writing this code for linear regression and trying Gradient Descent to minimize the RSS. The cost function seems to explode to infinity within 12 iterations. I know this is not supposed to happen. Maybe, I have used the wrong gradient function for RSS (can be seen in the function "grad()")?
X = np.random.uniform(minVal,maxVal,(NumberObservations,1))
e = np.random.normal(0, 1, (NumberObservations,1))
Y= 10 + 5*X + e
B = np.array([[0], [0]])
sum_y = sum(Y)
sum_x = sum(X)
sum_xy = sum(np.multiply(X, Y))
sum_x2 = sum(X*X)
alpha = 0.00001
iterations = 15
def cost_fun(X, Y, B):
b0 = B[0]
b1 = B[1]
s = (Y - (b0 + (b1*X)))**2
rss = sum(s)
return rss
def grad(X, Y, B):
print("B = " + str(B))
b0 = B[0]
b1 = B[1]
g0 = -2*(Y - b0 - (b1*X))
g1 = -2*((X*Y) - (b0*X) - (b1*X**2))
grad = np.concatenate((g0, g1), axis = 1)
return grad
def gradient_descent(X, Y, B, alpha, iterations):
cost_history = [0] * iterations
m = len(Y)
x0 = np.array(np.ones(m))
x0 = x0.reshape((100, 1))
X1 = np.concatenate((x0, X), axis = 1)
for iteration in range(iterations):
h =, B)
h = h.reshape((100, 1))
loss = h - Y
g = grad(X, Y, B)
gradient = (, loss) / m)
B = B - alpha * gradient
cost = cost_fun(X, Y, B)
cost_history[iteration] = cost
print("Iteration %d | Cost: %f" % (iteration, cost))
return B, cost_history
newB, cost_history = gradient_descent(X, Y, B, alpha, iterations)
# New Values of B
Please help.

No gradients provided for any variable

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
Nclass = 500
D = 2
M = 3
K = 3
X1 = np.random.randn(Nclass, D) + np.array([0, -2])
X2 = np.random.randn(Nclass, D) + np.array([2, 2])
X3 = np.random.randn(Nclass, D) + np.array([-2, 2])
X = np.vstack ([X1, X2, X3]).astype(np.float32)
Y = np.array([0]*Nclass + [1]*Nclass + [2]*Nclass)
plt.scatter(X[:,0], X[:,1], c=Y, s=100, alpha=0.5)
N = len(Y)
T = np.zeros((N, K))
for i in range(N):
T[i, Y[i]] = 1
def init_weights(shape):
return tf.Variable(tf.random_normal(shape, stddev=0.01))
def forward(X, W1, b1, W2, b2):
Z = tf.nn.sigmoid(tf.matmul(X, W1) + b1)
return tf.matmul(Z, W2) + b2
tfX = tf.placeholder(tf.float32, [None, D])
tfY = tf.placeholder(tf.float32, [None, K])
W1 = init_weights([D, M])
b1 = init_weights([M])
W2 = init_weights([M, K])
b2 = init_weights([K])
py_x = forward(tfX, W1, b1, W2, b2)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=py_x, logits=T))
train_op = tf.train.GradientDescentOptimizer(0.05).minimize(cost)
predict_op = tf.argmax(py_x, 1)
sess = tf.Session()
inti = tf.initizalize_all_variables()
for i in range(1000):, feed_dict={tfX: X, tfY: T})
pred =, feed_dict={tfX: X, tfY: T})
if i % 10 == 0:
print(np.mean(Y == pred))
I have a little issue :
Traceback (most recent call last):
File "", line 45, in <module>
train_op = tf.train.GradientDescentOptimizer(0.05).minimize(cost)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/", line 322, in minimize
([str(v) for _, v in grads_and_vars], loss))
ValueError: No gradients provided for any variable, check your graph for ops that do not support gradients, between variables ["<tf.Variable 'Variable:0' shape=(2, 3) dtype=float32_ref>", "<tf.Variable 'Variable_1:0' shape=(3,) dtype=float32_ref>", "<tf.Variable 'Variable_2:0' shape=(3, 3) dtype=float32_ref>", "<tf.Variable 'Variable_3:0' shape=(3,) dtype=float32_ref>"] and loss Tensor("Mean:0", shape=(), dtype=float64).
It is unclear what I have to do here. Could anyone be able to help me at this point?
If T are the true labels and py_x the network outputs, you will have to switch the arguments in the cross entropy function:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=T, logits=py_x))
The logits must be the network outputs and the labels must be the true labels. If you confuse the arguments, the optimizer will fail to backpropagate, since there will be no gradient.
You also have to initialize your variables before training; your code lacks a statement (you also had a typo in your initialize_all_variables().
I also shuffled your data; maybe it will lead to faster convergence towards the labels.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
Nclass = 500
D = 2
M = 3
K = 3
X1 = np.random.randn(Nclass, D) + np.array([0, -2])
X2 = np.random.randn(Nclass, D) + np.array([2, 2])
X3 = np.random.randn(Nclass, D) + np.array([-2, 2])
X = np.vstack ([X1, X2, X3]).astype(np.float32)
Y = np.array([0]*Nclass + [1]*Nclass + [2]*Nclass)
perm = np.random.permutation(len(X))
X = X[perm]
Y = Y[perm]
# plt.scatter(X[:,0], X[:,1], c=Y, s=100, alpha=0.5)
N = len(Y)
T = np.zeros((N, K))
for i in range(N):
T[i, Y[i]] = 1
def init_weights(shape):
return tf.Variable(tf.random_normal(shape, stddev=0.01))
def forward(X, W1, b1, W2, b2):
Z = tf.nn.sigmoid(tf.matmul(X, W1) + b1)
return tf.matmul(Z, W2) + b2
tfX = tf.placeholder(tf.float32, [None, D])
tfY = tf.placeholder(tf.float32, [None, K])
W1 = init_weights([D, M])
b1 = init_weights([M])
W2 = init_weights([M, K])
b2 = init_weights([K])
py_x = forward(tfX, W1, b1, W2, b2)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=T, logits=py_x))
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(cost)
predict_op = tf.argmax(py_x, 1)
sess = tf.Session()
init = tf.initialize_all_variables()
for i in range(1000):, feed_dict={tfX: X, tfY: T})
pred =, feed_dict={tfX: X, tfY: T})
if i % 10 == 0:
print(np.mean(Y == pred))
It figured out that you should run inti i.e.
inti = tf.initialize_all_variables()
before running the GradientDescentOptimizer

Language Modelling with RNN and LSTM Cell in Tensorflow

My RNN for language modelling is predicting only "the" "and" and "unknown" what's wrong with my code?
Here I define the hyper parameters:
num_epochs = 300
total_series_length = len(uniqueSentence) - 4
truncated_backprop_length = 30
state_size = 100
num_classes = NUM_MEANINGFUL + 1
echo_step = 1
batch_size = 32
vocab_length = len(decoder)
num_batches = total_series_length//batch_size//truncated_backprop_length
learning_rate = 0.01
old_perplexity = 0
Here I generate the data (my input is given by word embeddings long 100 calculated with Word2Vec):
def generateData():
uniqueSent = uniqueSentence[0 : len(uniqueSentence) - 4]
x_tr = np.array([model_ted[word] for words in uniqueSent])
#Roll array elements along a given axis.
#Elements that roll beyond the last position are re-introduced at the first.
x_tr = x_tr.reshape((100, batch_size, -1)) # The first index changing slowest, subseries as rows
x = x_tr.transpose((1, 2, 0))
new_y = indexList[1: len(indexList)- 4]
y = np.array(new_y)
y = y.reshape((batch_size, -1))
return (x, y)
Define the placeholders:
batchX_placeholder = tf.placeholder(tf.float32, [batch_size, truncated_backprop_length, 100])
batchY_placeholder = tf.placeholder(tf.int32, [batch_size, truncated_backprop_length])
W = tf.Variable(np.random.rand(state_size, num_classes),dtype=tf.float32)
b = tf.Variable(np.zeros((batch_size, num_classes)), dtype=tf.float32)
W2 = tf.Variable(np.random.rand(state_size, num_classes),dtype=tf.float32)
b2 = tf.Variable(np.zeros((batch_size, num_classes)), dtype=tf.float32)
Inputs and desired outputs:
labels_series = tf.transpose(batchY_placeholder)
labels_series = tf.unstack(batchY_placeholder, axis=1)
inputs_series = batchX_placeholder
Forward pass:
from tensorflow.contrib.rnn.python.ops import core_rnn_cell_impl
#cell = tf.contrib.rnn.BasicRNNCell(state_size)
cell = tf.contrib.rnn.BasicLSTMCell(state_size, state_is_tuple = False)
init_state = tf.zeros([batch_size, cell.state_size])
outputs, current_state = tf.nn.dynamic_rnn(cell, inputs_series, initial_state = init_state)
iterable_outputs = tf.unstack(outputs, axis = 1)
logits_series = [tf.matmul(state, W2) + b2 for state in iterable_outputs] #Broadcasted addition
predictions_series = [tf.nn.softmax(logits) for logits in logits_series]
losses = [tf.losses.sparse_softmax_cross_entropy(labels, logits)
for logits, labels in zip(logits_series, labels_series)]
total_loss = tf.add_n(losses)
train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)
x,y = generateData()
with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
loss_list = []
_current_state = np.zeros((batch_size, 2*state_size))
#avevo genrateData fuori e -currentstate dentro
for epoch_idx in range(num_epochs):
print("New data, epoch", epoch_idx)
for batch_idx in range(num_batches):
start_idx = batch_idx * truncated_backprop_length
end_idx = start_idx + truncated_backprop_length
batchX = x[:,start_idx:end_idx,:]
batchY = y[:,start_idx:end_idx]
_total_loss, _train_step, _current_state, _predictions_series =
[total_loss, train_step, current_state, predictions_series],
perplexity = 2 ** (_total_loss/truncated_backprop_length )
_predictions_series = np.array(_predictions_series)
pr = _predictions_series.transpose([1, 0, 2])
pr_ind = []
for line in pr[0]:
for index in pr_ind:
print(decoder[index], end = " " )
print("\n learning rate: ", end = " ")
