Issue training RNN model with pytorch with trivial goal - machine-learning

I'm trying to train a simple RNN model with a trivial goal where the output matches a fixed vector regardless of the input
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
class RNN(nn.Module):
    """Minimal hand-rolled recurrent cell with a log-softmax read-out.

    At every step the current input is concatenated with the previous hidden
    state, ``i2h`` maps that to the next hidden state, and ``i2o`` maps the
    new hidden state to ``output_size`` log-probabilities.

    Args:
        input_size: width of one input row.
        hidden_size: width of the hidden state.
        output_size: number of values produced per step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        # %-formatting keeps these debug prints valid, with identical output,
        # under both Python 2 and Python 3.
        print("i2h WEIGHT size  %s" % (list(self.i2h.weight.size()),))
        print("i2h bias size  %s" % (list(self.i2h.bias.size()),))
        self.i2o = nn.Linear(hidden_size, output_size)
        print("i2o WEIGHT size  %s" % (list(self.i2o.weight.size()),))
        print("i2o bias size  %s" % (list(self.i2o.bias.size()),))
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Advance one step and return ``(output, new_hidden)``.

        ``input`` and ``hidden`` are joined along dim 1, so they must agree
        on dim 0 and their widths must add up to what ``i2h`` expects.
        """
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(hidden)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape ``(1, hidden_size)``."""
        return Variable(torch.zeros(1, self.hidden_size))
n_hidden = 20
rnn = RNN(10, n_hidden, 3)
learning_rate = 1e-3
loss_fn = torch.nn.MSELoss(size_average=False)
out_target = Variable( torch.FloatTensor([[0.0 , 1.0, 0.0]] ) , requires_grad=False)
print "target output::: ", out_target
# Run one pass over a sequence: start from a fresh hidden state, feed
# line_tensor[i] for every index i along dimension 0 through `rnn` while
# carrying the hidden state forward, and compare `output` with the
# module-level `out_target` using `loss_fn`.
# `category_tensor` is not referenced anywhere in the visible body.
# NOTE(review): indentation was lost and several statements look truncated in
# this excerpt: no backward pass is visible, the loop over `rnn.parameters()`
# has no body, and the return value ends in a bare `[0]`. Presumably the
# original called `loss.backward()`, updated each parameter using
# `learning_rate`, and returned the scalar loss; confirm against the original
# post before relying on this block.
def train(category_tensor, line_tensor):
hidden = rnn.initHidden()
for i in range(line_tensor.size()[0]):
#print "train iteration ", i, ": input data: ", line_tensor[i]
output, hidden = rnn(line_tensor[i], hidden)
loss = loss_fn(output, out_target)
# Add parameters' gradients to their values, multiplied by learning rate
for p in rnn.parameters():
#print "parameter: ", p, " gradient: ",,
return output,[0]
current_loss = 0
n_iters = 500
for iter in range(1, n_iters + 1):
inp = Variable(torch.randn(100,1,10) + 5)
output, loss = train(out_target, inp)
current_loss += loss
if iter % 1 == 0:
print "weights: ",rnn.i2h.weight
print "LOSS: ", loss
print output
As the output shows, the loss stays above 6 and never goes down. Note also that I shift all the normally distributed random inputs by 5, so they are mostly positive numbers; a set of weights that approaches the target output should therefore exist.
What am I doing wrong in this example that prevents the output from approaching the goal?

Your fixed output is:
torch.FloatTensor([[0.0, 1.0, 0.0]])
But you are using the following as the final layer in your RNN:
self.softmax = nn.LogSoftmax(dim=1)
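Does LogSoftmax return values in [0, 1]? It does not: it returns log-probabilities, which lie in (-inf, 0], so the output can never match a target that contains 1.0. You could use Softmax instead, although I would recommend using the sign function and transforming -1 to 0.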


t() expects a tensor with <= 2 dimensions, but self is 3D

I'm new to PyTorch and wrote the following simple code to classify some inputs. The model input has shape 8*2 with a batch size of 2, and the input layer of the model has 2 nodes. I don't know what is wrong!
trainset=, Y)
trainloader =, batch_size=BATCH_SIZE,
shuffle=True, num_workers=1)
from torch.nn.modules import flatten
learning_rate = 0.01
num_epochs = 20
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = MyModel()
model =
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
## compute accuracy
def get_accuracy(logit, target, batch_size):
    """Return the accuracy of one batch as a percentage.

    Args:
        logit: 2-D tensor of scores; the predicted class of each row is the
            index of its largest value along dim 1.
        target: tensor of true class indices, one per row of ``logit``.
        batch_size: divisor used to turn the hit count into a percentage.

    Returns:
        ``100 * hits / batch_size`` as a Python float.
    """
    # torch.max(..., 1) returns (values, indices); the indices are the
    # predicted classes. Reshape them like `target` so the comparison is
    # element-wise and not broadcast.
    predicted = torch.max(logit, 1)[1].view(target.size())
    corrects = (predicted == target).sum()
    accuracy = 100.0 * corrects / batch_size
    return accuracy.item()
model = MyModel()
# Commented out IPython magic to ensure Python compatibility.
for epoch in range(num_epochs):
train_running_loss = 0.0
train_acc = 0.0
## training step
for inputs, labels in trainloader:
#inputs =
#labels =
## forward + backprop + loss
outputs = model.forward(inputs)
loss = criterion(outputs, labels)
## update model params
train_running_loss += loss.detach().item()
train_acc += get_accuracy(outputs, labels, BATCH_SIZE)
print('Epoch: %d | Loss: %.4f | Train Accuracy: %.2f'%(epoch, train_running_loss / i, train_acc/i))
And my model is as below:
# Small feed-forward classifier: Linear(2, 3) -> tanh -> Linear(3, 1) -> sigmoid.
# NOTE(review): `F` is not imported in the visible excerpt; presumably it is
# `torch.nn.functional`. Confirm.
class MyModel(nn.Module):
def __init__(self):
super(MyModel, self).__init__()
self.d1 = nn.Linear(2,3)
self.d2 = nn.Linear(3,1)
# NOTE(review): the body of init_weights is missing from this excerpt, so what
# it assigned to the layers cannot be determined from here.
def init_weights(self):
# Forward pass: d1, tanh, d2, sigmoid, applied in that order.
def forward(self, x):
x = self.d1(x)
x = F.tanh(x)
x = self.d2(x)
out = F.sigmoid(x)
return out
Then I got an error:
RuntimeError Traceback (most recent call last)
<ipython-input-27-196d819d3ccd> in <module>
101 print(inputs)
--> 103 outputs = model.forward(inputs)
104 loss = criterion(outputs, labels)
2 frames
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/ in forward(self, input)
113 def forward(self, input: Tensor) -> Tensor:
--> 114 return F.linear(input, self.weight, self.bias)
116 def extra_repr(self) -> str:
RuntimeError: t() expects a tensor with <= 2 dimensions, but self is 3D
I flatten the input but nothing changed. What should I do to fix it?
First of all, you don't need to invoke your model's forward pass by model.forward(x); using model(x) is good.
Second of all, what exactly are you trying to achieve via the init_weights method? You're unsqueezing k1 and k2 twice, giving them the shape of (1, 1, x) which is 3D which is what the error is telling you. torch.nn.Linear performs a matrix multiplication with a 2D matrix, so you can't use a 3D one. torch.nn.Linear already initializes the weights via Kaiming initialization [1] so I'm not sure what you're trying to achieve here.
Changing the init_weights method to:
def init_weights(self):
    """Overwrite the weights of ``d1`` and ``d2`` with fixed values.

    Each flat list of constants is reshaped to the exact shape of the
    layer's existing weight, so the weights stay 2-D as ``nn.Linear``
    requires. Biases are left untouched.
    """
    k1 = torch.tensor([0.1, -0.72, 0.94, -0.29, 0.12, 0.44])
    self.d1.weight.data = k1.reshape(self.d1.weight.shape)
    k2 = torch.tensor([1, -1.16, -0.26])
    self.d2.weight.data = k2.reshape(self.d2.weight.shape)
and changing the type of inputs from Long to Float (i.e., model(inputs.float())) should solve your problem.

LSTM giving same prediction for every sample

I am trying to train an LSTM network on a music dataset to create a model that can identify pitches. There are 45 different labels (pitches) that I'm trying to classify a given sample with. However, every label has the same prediction value no matter the sample (i.e. all labels #1 have the same value, all labels #2, ...)
Our X_train_RNN is 16170x20x2688 (each of our 16170 samples has a sequence length of 20), and our y_train_RNN is a 16170x45 (each of our 16170 samples have a 45-dimensional vector containing real numbers). We then say that a negative real number corresponds to that pitch not being there, and a positive real number corresponds to that pitch being there.
Here's the LSTM in PyTorch:
class LSTM1(nn.Module):
    """LSTM encoder followed by a two-layer head producing 45 logits.

    The LSTM is built with ``batch_first=True`` and ``input_size = 84 * 32``,
    so ``forward`` expects ``x`` shaped ``(batch, seq, 84 * 32)``. Only the
    final hidden state of the last LSTM layer is passed to the head, and the
    result has shape ``(batch, num_classes)``. No activation is applied to
    the output.
    """

    def __init__(self):
        super(LSTM1, self).__init__()
        self.num_classes = 45  # number of classes
        self.num_layers = 1  # number of layers
        self.input_size = 84 * 32  # input size
        self.hidden_size = 100  # hidden state
        self.seq_length = 20  # sequence length

        self.lstm = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size,
                            num_layers=self.num_layers, batch_first=True)  # lstm
        self.fc_1 = nn.Linear(self.hidden_size, 128)  # fully connected 1
        self.fc = nn.Linear(128, self.num_classes)  # fully connected last layer
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return logits of shape ``(x.size(0), num_classes)``."""
        # Allocate the zero initial states from `x` so they follow its device
        # and dtype; plain torch.zeros() would always be CPU/float32 and
        # break on a GPU or double-precision input.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h_0 = x.new_zeros(state_shape)  # hidden state
        c_0 = x.new_zeros(state_shape)  # internal state

        # Propagate input through LSTM; only the final hidden state is used.
        _, (hn, _) = self.lstm(x, (h_0, c_0))

        # hn is (num_layers, batch, hidden). Take the last layer explicitly:
        # for one layer this equals hn.view(-1, hidden), but it does not fold
        # the layers into the batch axis when num_layers > 1.
        out = self.relu(hn[-1])
        out = self.fc_1(out)  # first Dense
        out = self.relu(out)  # relu
        out = self.fc(out)  # final output
        return out
And here's how we use it:
from torch.nn.modules.loss import BCEWithLogitsLoss
epochs = 5
model = LSTM1()
criterion = BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-7)
train_loss = []
# training iteration
for epoch in range(epochs):
running_loss = 0
for itr, (image, label) in enumerate(trainloader):
#print("image: " + str(image.shape))
# zero gradient
# forward path
y_predicted = model(image)
#print("label: " + str(label.shape))
#print("y_predicted: " + str(y_predicted.shape))
loss = criterion(y_predicted, label)
running_loss += loss.item()
# backpropagating
# optimizes the weights
with torch.no_grad():
totalPos = 0
for i in range(45):
totalPos += np.count_nonzero(y_predicted.numpy()[:, i] > 0)
print("num of positive: " + str(totalPos))
print(f'epoch: {epoch+1}, loss: {running_loss:.4f}')
But afterwards when we get our model and get our predictions...
with torch.no_grad():
X = torch.tensor(X_test_RNN).float()
testPreds = model2(X)
testPreds has the same exact prediction for every value in a given column; no matter the sample. This even happens by the first or second epoch it seems. What could be going on?

PyTorch gives incorrect results due to broadcasting

I want to run some neural net experiments with PyTorch, but a minimal test case is giving wrong answers. The test case sets up a simple neural network with two input variables and an output variable that is just the sum of the inputs, and tries learning it as a regression problem; I expect it to converge on zero mean squared error, but it actually converges on 0.165. It's probably because of the issue alluded to in the warning message; how can I fix it?
import torch
import torch.nn as nn
# data
Xs = []
ys = []
n = 10
for i in range(n):
i1 = i / n
for j in range(n):
j1 = j / n
Xs.append([i1, j1])
ys.append(i1 + j1)
# torch tensors
X_tensor = torch.tensor(Xs)
y_tensor = torch.tensor(ys)
# hyperparameters
in_features = len(Xs[0])
hidden_size = 100
out_features = 1
epochs = 500
# model
class Net(nn.Module):
    """One-hidden-layer regressor: Linear -> ReLU -> Linear(hidden_size, 1).

    The input width is read from the module-level ``in_features`` when the
    network is constructed.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.L0 = nn.Linear(in_features, hidden_size)
        self.N0 = nn.ReLU()
        self.L1 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Apply the three stages in order and return the last layer's output."""
        hidden = self.N0(self.L0(x))
        return self.L1(hidden)
model = Net(hidden_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
# train
for epoch in range(1, epochs + 1):
# forward
output = model(X_tensor)
cost = criterion(output, y_tensor)
# backward
# print progress
if epoch % (epochs // 10) == 0:
print(f"{epoch:6d} {cost.item():10f}")
output = model(X_tensor)
cost = criterion(output, y_tensor)
print("mean squared error:", cost.item())
C:\Users\russe\Anaconda3\envs\torch2\lib\site-packages\torch\nn\modules\ UserWarning: Using a target size (torch.Size([100])) that is different to the input size (torch.Size([100, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.mse_loss(input, target, reduction=self.reduction)
50 0.167574
100 0.165108
150 0.165070
200 0.165052
250 0.165039
300 0.165028
350 0.165020
400 0.165013
450 0.165009
500 0.165006
mean squared error: 0.1650056540966034
And the message:
UserWarning: Using a target size (torch.Size([100])) that is different to the input size (torch.Size([100, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
You'll need to be a bit more specific about which tensor (X or Y) you mean, but we can reshape tensors with the Tensor.view method.
For example:
Y_tensor = torch.tensor(Ys)
>> torch.Size([5])
new_shape = (len(Ys), 1)
Y_tensor = Y_tensor.view(new_shape)
>> torch.Size([5, 1])
However, I'm skeptical that this broadcasting behavior is why you're having accuracy issues.

Use neural network to learn a square wave function

Out of curiosity, I am trying to build a simple fully connected NN using tensorflow to learn a square wave function such as the following one:
Therefore the input is a 1D array of x value (as the horizontal axis), and the output is a binary scalar value. I used tf.nn.sparse_softmax_cross_entropy_with_logits as loss function, and tf.nn.relu as activation. There are 3 hidden layers (100*100*100) and a single input node and output node. The input data are generated to match the above wave shape and therefore the data size is not a problem.
However, the trained model seems to fail completely, always predicting the negative class.
So I am trying to figure out why this happened. Whether the NN configuration is suboptimal, or it is due to some mathematical flaw in NN beneath the surface (though I think NN should be able to imitate any function).
As per suggestions in the comment section, here is the full code. One thing I got wrong earlier: there are actually 2 output nodes (because there are 2 output classes):
See if neural net can find piecewise linear correlation in the data
import time
import os
import tensorflow as tf
import numpy as np
def generate_placeholder(batch_size):
    """Create the float32 graph inputs for one batch.

    Returns a pair: the feature placeholder with shape ``(batch_size, 1)``
    and the label placeholder with shape ``(batch_size)``.
    """
    features = tf.placeholder(tf.float32, shape=(batch_size, 1))
    labels = tf.placeholder(tf.float32, shape=(batch_size))
    return features, labels
# Build the feed dict for training step `loop`: pick `batch_size` consecutive
# entries from x[:, 0] and y and map them onto the two placeholders.
def feed_placeholder(x, y, x_placeholder, y_placeholder, batch_size, loop):
# `[[None]] * batch_size` repeats a reference to ONE inner list, so every
# x_selected[i] is the same object. Each write to x_selected[i][0] below
# therefore overwrites the value seen through all rows, and after the loop
# every row holds the last x value written.
x_selected = [[None]] * batch_size
y_selected = [None] * batch_size
for i in range(batch_size):
# For a non-negative `loop`, min(a, a % n) is simply a % n. The `+ i` offset
# is added after the modulo and is not wrapped itself.
x_selected[i][0] = x[min(loop*batch_size, loop*batch_size % len(x)) + i, 0]
y_selected[i] = y[min(loop*batch_size, loop*batch_size % len(y)) + i]
feed_dict = {x_placeholder: x_selected,
y_placeholder: y_selected}
return feed_dict
def inference(input_x, H1_units, H2_units, H3_units):
    """Build three ReLU hidden layers and a linear 2-way output; return the logits.

    Every layer lives in its own name scope ('H1', 'H2', 'H3',
    'softmax_linear') with variables named 'weights' and 'biases'. Weights
    are drawn from a truncated normal and biases start at zero.
    """

    def dense(scope, inputs, fan_in, fan_out, stddev, activate):
        # One affine layer inside `scope`, optionally followed by ReLU.
        with tf.name_scope(scope):
            weights = tf.Variable(tf.truncated_normal([fan_in, fan_out], stddev=stddev), name='weights')
            biases = tf.Variable(tf.zeros([fan_out]), name='biases')
            if activate:
                return tf.nn.relu(tf.matmul(inputs, weights) + biases)
            return tf.matmul(inputs, weights) + biases

    a1 = dense('H1', input_x, 1, H1_units, 1.0/2, True)
    a2 = dense('H2', a1, H1_units, H2_units, 1.0/H1_units, True)
    a3 = dense('H3', a2, H2_units, H3_units, 1.0/H2_units, True)
    return dense('softmax_linear', a3, H3_units, 2, 1.0/np.sqrt(H3_units), False)
def loss(logits, labels):
    """Return the mean sparse softmax cross-entropy of `logits` against `labels`.

    `labels` is cast to int32 first, because it is treated as class indices.
    """
    class_ids = tf.to_int32(labels)
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=class_ids, logits=logits, name='xentropy')
    return tf.reduce_mean(per_example, name='xentropy_mean')
def inspect_y(labels):
    """Return the sum of `labels` after casting them to int32."""
    as_int = tf.cast(labels, tf.int32)
    return tf.reduce_sum(as_int)
def training(loss, learning_rate):
    """Return the op that runs one gradient-descent step on `loss`.

    Also registers a scalar summary for the loss and creates a non-trainable
    `global_step` counter that the optimizer increments on every step.
    """
    tf.summary.scalar('lost', loss)
    sgd = tf.train.GradientDescentOptimizer(learning_rate)
    step_counter = tf.Variable(0, name='global_step', trainable=False)
    return sgd.minimize(loss, global_step=step_counter)
def evaluation(logits, labels):
    """Return how many rows of `logits` rank the true label first."""
    class_ids = tf.to_int32(labels)
    hits = tf.nn.in_top_k(logits, class_ids, 1)
    hit_count = tf.reduce_sum(tf.cast(hits, tf.int32))
    return hit_count
# Build the graph (placeholders, a 100-100-100 network, loss, train op) in a
# fresh tf.Graph, run 10000 training steps with learning rate 0.01, then
# evaluate on 1000 evenly spaced points in [0, 1) and print the min/max of
# each of the two output columns.
# NOTE(review): indentation was lost and two statements are truncated in this
# excerpt (marked below). No call that runs `init` is visible either;
# presumably `sess.run(init)` was dropped. Confirm against the original post.
def run_training(x, y, batch_size):
with tf.Graph().as_default():
x_placeholder, y_placeholder = generate_placeholder(batch_size)
logits = inference(x_placeholder, 100, 100, 100)
Loss = loss(logits, y_placeholder)
# y_sum is built but not used in the visible code.
y_sum = inspect_y(y_placeholder)
train_op = training(Loss, 0.01)
init = tf.global_variables_initializer()
sess = tf.Session()
max_steps = 10000
for step in range(max_steps):
start_time = time.time()
feed_dict = feed_placeholder(x, y, x_placeholder, y_placeholder, batch_size, step)
# NOTE(review): truncated; presumably `sess.run([train_op, Loss], feed_dict=...)`.
_, loss_val =[train_op, Loss], feed_dict = feed_dict)
duration = time.time() - start_time
if step % 100 == 0:
print('Step {}: loss = {:.2f} {:.3f}sec'.format(step, loss_val, duration))
x_test = np.array(range(1000)) * 0.001
x_test = np.reshape(x_test, (1000, 1))
# NOTE(review): truncated; presumably `sess.run(logits, feed_dict=...)`.
_ =, feed_dict={x_placeholder: x_test})
print(min(_[:, 0]), max(_[:, 0]), min(_[:, 1]), max(_[:, 1]))
if __name__ == '__main__':
population = 10000
input_x = np.random.rand(population)
input_y = np.copy(input_x)
for bin in range(10):
print(bin, bin/10, 0.5 - 0.5*(-1)**bin)
input_y[input_x >= bin/10] = 0.5 - 0.5*(-1)**bin
batch_size = 1000
input_x = np.reshape(input_x, (population, 1))
run_training(input_x, input_y, batch_size)
Sample output shows that the model always prefer the first class over the second, as shown by min(_[:, 0]) > max(_[:, 1]), i.e. the minimum logit output for the first class is higher than the maximum logit output for the second class, for a sample size of population.
My mistake. The problem occurred in the line:
for i in range(batch_size):
x_selected[i][0] = x[min(loop*batch_size, loop*batch_size % len(x)) + i, 0]
y_selected[i] = y[min(loop*batch_size, loop*batch_size % len(y)) + i]
Python is mutating the whole list of x_selected to the same value. Now this code issue is resolved. The fix is:
x_selected = np.zeros((batch_size, 1))
y_selected = np.zeros((batch_size,))
for i in range(batch_size):
x_selected[i, 0] = x[(loop*batch_size + i) % x.shape[0], 0]
y_selected[i] = y[(loop*batch_size + i) % y.shape[0]]
After this fix, the model is showing more variation. It currently outputs class 0 for x <= 0.5 and class 1 for x > 0.5. But this is still far from ideal.
So after changing the network configuration to 100 nodes * 4 layers, after 1 million training steps (batch size = 100, sample size = 10 million), the model is performing very well showing only errors at the edges when y flips.
Therefore this question is closed.
You are essentially trying to learn a periodic function, and the function is highly non-linear and non-smooth. So it is NOT as simple as it looks. In short, a better representation of the input feature helps.
Suppose you have a period T = 2, i.e. f(x) = f(x+2).
For a reduced problem when input/output are integers, your function is then f(x) = 1 if x is odd else -1. In this case, your problem would be reduced to this discussion in which we train a Neural Network to distinguish between odd and even numbers.
I guess the second bullet in that post should help (even for the general case when inputs are float numbers).
Try representing the numbers in binary using a fixed length precision.
In our reduced problem above, it's easy to see that the output is determined iff the least-significant bit is known.
decimal binary -> output
1: 0 0 1 -> 1
2: 0 1 0 -> -1
3: 0 1 1 -> 1
I created the model and the structure for the problem of recognizing odd/even numbers in here.
If you abstract the fact that:
decimal binary -> output
1: 0 0 1 -> 1
2: 0 1 0 -> -1
3: 0 1 1 -> 1
Is almost equivalent to:
decimal binary -> output
1: 0 0 1 -> 1
2: 0 1 0 -> 0
3: 0 1 1 -> 1
You may update the code to fit your need.

LSTM history length vs prediction error

I use LSTM to predict next step voltage value in voltage time series signal. I have a question:
Why using longer sequences (5 or 10 steps back in time) to train LSTM does not improve prediction and reduce prediction error ? (it actually degrades it - see the figures e.g. results for sequence_length=5 is better than sequence_length=10)
testplot('epochs: 10', 'ratio: 1', 'sequence_length: 10', 'mean error: ', '0.00116802704509')
testplot('epochs: 10', 'ratio: 1', 'sequence_length: 5', 'mean error: ', '0.000495359163296'
(predicted signal in green, real in red)
import os
import matplotlib.pyplot as plt
import numpy as np
import time
import csv
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
# Load a one-column series from a CSV file, cut it into overlapping windows of
# `sequence_length` values, mean-centre them, and split them 90/10 into
# train/test inputs and targets.
# NOTE(review): the signature and the CSV-reading loop are truncated in this
# excerpt: `sequence_length` and `ratio` are used but not declared, `power`
# is never appended to, and `except ValueError:` has no visible `try`.
# Confirm against the original post.
def data_power_consumption(path_to_dataset,
max_values = ratio * 2049280
with open(path_to_dataset) as f:
data = csv.reader(f, delimiter=",")
power = []
nb_of_values = 0
for line in data:
nb_of_values += 1
except ValueError:
# 2049280.0 is the total number of valid values, i.e. ratio = 1.0
if nb_of_values >= max_values:
print "max value", nb_of_values
print "Data loaded from csv. Formatting..."
# Sliding windows: one row per start index, each `sequence_length` long.
result = []
for index in range(len(power) - sequence_length):
result.append(power[index: index + sequence_length])
result = np.array(result) # shape (len(power) - sequence_length, sequence_length)
# Centre the data by subtracting the mean over all window values.
result_mean = result.mean()
result -= result_mean
print "Shift : ", result_mean
print "Data : ", result.shape
# First 90% of the windows are for training, the rest for testing.
row = round(0.9 * result.shape[0])
train = result[:row, :]
# The last value of each window is the target; the values before it are the input.
X_train = train[:, :-1]
y_train = train[:, -1]
X_test = result[row:, :-1]
y_test = result[row:, -1]
# Add a trailing axis of size 1 so the inputs are 3-D.
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
return [X_train, y_train, X_test, y_test]
# Create a Sequential model, compile it with MSE loss and the Adam optimizer,
# print how long that took, and return the model.
# NOTE(review): no layers are added to `model` in the visible excerpt and
# `layers` is never used; presumably the `model.add(...)` lines were lost.
# Confirm against the original post.
def build_model():
model = Sequential()
layers = [1, 50, 100, 1]
start = time.time()
model.compile(loss="mse", optimizer="adam") # consider adam
print "Compilation Time : ", time.time() - start
return model
# Load the data (unless it is passed in), build the model (unless it is passed
# in), train and predict, then plot a slice of real against predicted values
# and save the figure. Returns (model, y_test, predicted), or
# (model, y_test, 0) on the KeyboardInterrupt path.
# NOTE(review): indentation was lost and several statements are truncated in
# this excerpt: the `try:` lines, the `else:` before unpacking `data`, and the
# start of the fit call are missing. Confirm against the original post.
def run_network(model=None, data=None):
global_start_time = time.time()
epochs = 10
ratio = 1
sequence_length = 3
path_to_dataset = 'TIMBER_DATA_1.csv'
if data is None:
print 'Loading data... '
X_train, y_train, X_test, y_test = data_power_consumption(
path_to_dataset, sequence_length, ratio)
X_train, y_train, X_test, y_test = data
print '\nData Loaded. Compiling...\n'
if model is None:
model = build_model()
# NOTE(review): the next two lines look like the arguments of a lost
# `model.fit(` call.
X_train, y_train,
batch_size=512, nb_epoch=epochs, validation_split=0.05)
predicted = model.predict(X_test)
# Flatten the predictions to 1-D.
predicted = np.reshape(predicted, (predicted.size,))
print "done"
except KeyboardInterrupt:
print 'Training duration (s) : ', time.time() - global_start_time
return model, y_test, 0
fig, ax = plt.subplots()
# `txt` is a tuple of three strings; `text_mean` (a 2-tuple) is appended below.
txt = "epochs: " + str(epochs), "ratio: " + str(ratio), "sequence_length: " + str(sequence_length)
# calculate error (shift predicted by "sequence_length - 1 and apply mean with abs)
# Both series are mean-centred; the real series drops its last
# (sequence_length - 1) values and the predicted series drops its first
# (sequence_length - 1), so the two are compared at that offset.
y_test_mean = y_test - np.mean(y_test)
y_test_mean_shifted = y_test_mean[:-1*(sequence_length - 1)]
predicted_mean = predicted - np.mean(predicted)
predicted_mean_shifted = predicted_mean[(sequence_length - 1):]
prediction_error = np.mean(abs(y_test_mean_shifted - predicted_mean_shifted))
text_mean = "mean error: ", str(prediction_error)
txt = txt + text_mean
# Now add the legend with some customizations.
# The legend is created here, before the two labelled lines are plotted.
legend = ax.legend(loc='upper center', shadow=True)
ax.plot(y_test_mean_shifted[900:1000], 'r--', label='Real data')
ax.plot(predicted_mean_shifted[900:1000], 'g:', label='Predicted')
fig.text(0.4, 0.2, txt, horizontalalignment='center', verticalalignment='center', transform = ax.transAxes)
plt.savefig(os.path.join('cern_figures', 'testplot' + str(txt) + '.png'))
except Exception as e:
print str(e)
print 'Training duration (s) : ', time.time() - global_start_time
return model, y_test, predicted
# main
if __name__ == "__main__":
_, y_test_out, predicted_out = run_network()
#y_test_out_mean = y_test_out - np.mean(y_test_out)
#predicted_out_mean = predicted_out - np.mean(predicted_out)
Maybe because your time series at time t does not depend on its value at time t-10. If you have a time series (x1,...,xn) and there is no link between xn and xn-p, there is no reason to use a step back of p.
For example if you want to predict the weather one hour ahead, you will not use a step back of 2 weeks. Why ? Because the weather of 2 weeks in the past has no influence on the weather right now. You will use instead the weather of the last hour (or last day).
P.S.: I use this weather forecasting example because, to me, there is no link between the weather two weeks ago and the weather now. But maybe a weather forecasting expert would prove me wrong!
Cheers !
