I want to fix a problem in PyTorch.
I wrote the following code, which learns a sine function, as a tutorial exercise.
import torch
from torch import nn
from torch import optim
from torch.autograd import Variable as V
from import TensorDataset, DataLoader
import numpy as np
# y=sin(x1)
numTrain = 512
numTest = 128
noiseScale = 0.01
PI2 = 3.1415 * 2
X_train = np.random.rand(numTrain,1) * PI2
y_train = np.sin(X_train) + np.random.randn(numTrain,1) * noiseScale + 1.5
X_test = np.random.rand(numTest,1) * PI2
y_test = np.sin(X_test) + np.random.randn(numTest,1) * noiseScale
# Construct DataSet
X_trainT = torch.Tensor(X_train)
y_trainT = torch.Tensor(y_train)
X_testT = torch.Tensor(X_test)
y_testT = torch.Tensor(y_test)
ds_train = TensorDataset(X_trainT, y_trainT)
ds_test = TensorDataset(X_testT, y_testT)
# Construct DataLoader
loader_train = DataLoader(ds_train, batch_size=64, shuffle=True)
loader_test = DataLoader(ds_test, batch_size=64, shuffle=False)
# Construct network
net = nn.Sequential(
optimizer = optim.Adam(net.parameters())
loss_fn = nn.SmoothL1Loss()
# Training
losses = []
for epoc in range(100):
for data, target in loader_train:
y_pred = net(data)
loss = loss_fn(target,y_pred)
# evaluation
%matplotlib inline
from matplotlib import pyplot as plt
plt.scatter(X_train, y_train)
sinsX = []
sinsY = []
for t in range(128):
x = t/128 * PI2
output = net(V(torch.Tensor([x])))
Training finishes without error, but the next line raises an error: "expected 2D or 3D input (got 1D input)"
output = net(V(torch.Tensor([x])))
This error doesn't occur if the network does not contain BatchNorm1d().
I find this strange, because the input is 1D.
How can I fix it?
Update: how I fixed it
arr = np.array([x])
output = net(V(torch.Tensor(arr[None,...])))

When working with 1D signals, PyTorch actually expects 2D tensors: the first dimension is the "mini-batch" dimension. Therefore, you should evaluate your net on a batch containing one 1D signal:
# The nested list gives the tensor shape (1, 1): a mini-batch holding one 1-D sample.
output = net(V(torch.Tensor([[x]])))
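Make sure you set your net to "eval" mode (by calling `net.eval()`) before evaluating it, so that batch norm uses its moving mean/variance instead of the statistics of the one-sample mini-batch.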


Why is my pytorch classification model not learning?

I have created a simple PyTorch classification model with sample datasets generated using sklearn's make_classification. Even after training for thousands of epochs, the accuracy of the model hovers between 30 and 40 percent. During training the loss value fluctuates widely. I am wondering why this model is not learning, and whether it is due to some logical error in the code.
import torch
from import Dataset, DataLoader
import torch.nn as nn
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
X,y = make_classification(n_features=15,n_classes=5,n_informative=4)
DEVICE = torch.device('cuda')
epochs = 5000
class CustomDataset(Dataset):
    """Dataset that wraps a feature array and a label array as torch tensors.

    Args:
        X: NumPy array of samples; converted with ``torch.from_numpy``.
        y: NumPy array of labels, indexed in step with ``X``.
    """

    def __init__(self, X, y):
        # from_numpy shares memory with the arrays and keeps their dtype.
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)

    def __len__(self):
        """Return the number of samples (length of the first axis of ``X``)."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        return (self.X[index], self.y[index])
class Model(nn.Module):
    """Two-layer perceptron mapping 15 input features to 5 class scores."""

    def __init__(self):
        # nn.Module has to be initialised before any sub-module is assigned;
        # otherwise the attribute assignments below raise an AttributeError.
        super().__init__()
        self.l1 = nn.Linear(15, 10)
        self.l2 = nn.Linear(10, 5)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run ``x`` through both linear layers, applying ReLU after each.

        Args:
            x: tensor whose last dimension has size 15.

        Returns:
            Tensor whose last dimension has size 5. Because ReLU is also
            applied to the final layer, every returned score is >= 0.
        """
        x = self.relu(self.l1(x))
        x = self.relu(self.l2(x))
        return x
model = Model().double().to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_function = nn.CrossEntropyLoss()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
train_data = CustomDataset(X_train,y_train)
test_data = CustomDataset(X_test,y_test)
trainloader = DataLoader(train_data, batch_size=32, shuffle=True)
testloader = DataLoader(test_data, batch_size=32, shuffle=True)
for i in range(epochs):
for (x,y) in trainloader:
x =
y =
output = model(x)
loss = loss_function(output,y)
if i%200==0:
print("epoch: ",i," Loss: ",loss.item())
correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
for x, y in testloader:
# calculate outputs by running x through the network
outputs = model(
# the class with the highest energy is what we choose as prediction
_, predicted = torch.max(, 1)
total += y.size(0)
correct += (predicted ==
print(f'Accuracy of the network on the test data: {100 * correct // total} %')
I tried to over-fit my model with only 10 samples (batch_size=5), using X,y = make_classification(n_samples=10,n_features=15,n_classes=5,n_informative=4), but now the accuracy decreased to 15-20%. I then normalized the input data to values between 0 and 1, which pushed the accuracy a bit higher, but not over 50 percent. Any idea why this might be happening?
You should not be using ReLU activation on your output layer. Usually a softmax activation is used on the final layer for multi-class classification, or the logits are fed to the loss function directly without explicitly adding a softmax activation layer.
Try removing the ReLU activation from the final layer.

How to fix this loss is NaN problem in PyTorch of this RNN with GRU?

I'm completely new to PyTorch and tried out some models. I wanted to make an easy prediction rnn of stock market prices and found the following code:
I load the data set with pandas, then split it into training and test data and load it into a PyTorch DataLoader for later use in the training process. The model is defined in the GRU class. But the actual problem seems to be the optimisation. I think the problem could be gradient explosion. I thought about adding gradient clipping, but the GRU design should actually prevent gradient explosion, or am I wrong? What could cause the loss to be NaN instantly (already in the first epoch)?
from sklearn.preprocessing import MinMaxScaler
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from import TensorDataset, DataLoader
batch_size = 200
input_dim = 1
hidden_dim = 32
num_layers = 2
output_dim = 1
num_epochs = 10
nvda = pd.read_csv('dataset/stocks/NVDA.csv')
price = nvda[['Close']]
scaler = MinMaxScaler(feature_range=(-1, 1))
price['Close'] = scaler.fit_transform(price['Close'].values.reshape(-1, 1))
def split_data(stock, lookback):
    """Cut a series into sliding windows and split them 80/20 into train/test.

    ``len(data) - lookback`` windows of ``lookback`` consecutive rows are
    built. Within each window, all rows but the last form the model input and
    the last row is the target. The split is chronological: the final 20 % of
    the windows (rounded) become the test set.

    Args:
        stock: object exposing ``to_numpy()`` that yields a 2-D array of
            shape (rows, features).
        lookback: number of consecutive rows per window.

    Returns:
        ``[x_train, y_train, x_test, y_test]`` as NumPy arrays; the ``x``
        arrays have shape (n, lookback - 1, features) and the ``y`` arrays
        have shape (n, features).
    """
    data_raw = stock.to_numpy()  # convert to numpy array

    # Create all possible sequences of length ``lookback``.
    windows = np.array(
        [data_raw[start:start + lookback]
         for start in range(len(data_raw) - lookback)]
    )

    test_set_size = int(np.round(0.2 * windows.shape[0]))
    train_set_size = windows.shape[0] - test_set_size

    x_train = windows[:train_set_size, :-1, :]
    y_train = windows[:train_set_size, -1, :]
    x_test = windows[train_set_size:, :-1, :]
    y_test = windows[train_set_size:, -1, :]
    return [x_train, y_train, x_test, y_test]
lookback = 20 # choose sequence length
x_train, y_train, x_test, y_test = split_data(price, lookback)
train_data = TensorDataset(torch.from_numpy(x_train).float(), torch.from_numpy(y_train).float())
train_data = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
test_data = TensorDataset(torch.from_numpy(x_test).float(), torch.from_numpy(y_test).float())
test_data = DataLoader(test_data, shuffle=True, batch_size=batch_size, drop_last=True)
class GRU(nn.Module):
    """Stacked GRU followed by a linear read-out of the last time step.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the GRU hidden state.
        num_layers: number of stacked GRU layers.
        output_dim: size of the prediction produced for each sequence.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(GRU, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # batch_first=True: inputs are laid out as (batch, seq, feature).
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x, h):
        """Return ``(prediction, new_hidden)`` for input ``x`` and hidden state ``h``.

        Only the GRU output at the last time step is passed through ReLU and
        the linear layer.
        """
        out, h = self.gru(x, h)
        out = self.fc(self.relu(out[:, -1]))
        return out, h

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state of shape (num_layers, batch_size, hidden_dim).

        The tensor is created from an existing parameter so that it shares
        the model's dtype and device.
        """
        weight = next(self.parameters()).data
        hidden = weight.new_zeros(self.num_layers, batch_size, self.hidden_dim)
        return hidden
model = GRU(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0000000001)
start_time = time.time()
h = model.init_hidden(batch_size)
for epoch in range(1, num_epochs+1):
for x, y in train_data:
h =
y_train_pred, h = model(x, h)
loss = criterion(y_train_pred, y)
print("Epoch ", epoch, "MSE: ", loss.item())
training_time = time.time() - start_time
print("Training time: {}".format(training_time))
This is the dataset which I used.
Not sure if it is the case, but did you preprocess and clean the data? I do not know the dataset, but maybe some values are missing or there is something strange about it. I checked it here, and it seems that every couple of rows there is some inconsistency. Like I said, I do not know if that is the cause, but it may be.

Test set accuracy is very high after very few epochs on mnist dataset

With very few epochs this model learns to classify between 1 and 0 extremely quickly, which leads me to suspect that something is wrong.
The code below downloads the MNIST dataset and extracts the MNIST images that contain 1 or 0 only. A random sample of size 200 is selected from this subset of MNIST images. This random sample is the dataset the model is trained on. With just 2 epochs the model achieves 90%+ test set accuracy; is this expected behaviour? I expected many more epochs would be required in order to train the model to achieve this level of test set accuracy.
Model code :
%reset -f
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
root = './data'
if not os.path.exists(root):
train_set = dset.MNIST(root=root, train=True, transform=trans, download=True)
test_set = dset.MNIST(root=root, train=False, transform=trans, download=True)
batch_size = 64
train_loader =
test_loader =
class NeuralNet(nn.Module):
    """Fully connected 784 -> 500 -> 256 -> 2 classifier for 28x28 images."""

    def __init__(self):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 500)
        self.fc2 = nn.Linear(500, 256)
        self.fc3 = nn.Linear(256, 2)

    def forward(self, x):
        """Flatten ``x`` to rows of 784 values and return two scores per row.

        No activation is applied to the last layer, so the scores are raw
        logits.
        """
        x = x.view(-1, 28 * 28)
        # torch.relu is used because this snippet never imports an ``F`` alias.
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x
num_epochs = 2
random_sample_size = 200
values_0_or_1 = [t for t in train_set if (int(t[1]) == 0 or int(t[1]) == 1)]
values_0_or_1_testset = [t for t in test_set if (int(t[1]) == 0 or int(t[1]) == 1)]
train_loader_subset =
test_loader_subset =
train_loader = train_loader_subset
# Hyper-parameters
input_size = 100
hidden_size = 100
num_classes = 2
# learning_rate = 0.00001
learning_rate = .0001
# Device configuration
device = 'cpu'
print_progress_every_n_epochs = 1
model = NeuralNet().to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
N = len(train_loader)
# Train the model
total_step = len(train_loader)
most_recent_prediction = []
test_actual_predicted_dict = {}
rm = random.sample(list(values_0_or_1), random_sample_size)
train_loader_subset = data_utils.DataLoader(rm, batch_size=4)
for epoch in range(num_epochs):
for i, (images, labels) in enumerate(train_loader_subset):
# Move tensors to the configured device
images = images.reshape(-1, 2).to(device)
labels =
# Forward pass
outputs = model(images)
loss = criterion(outputs, labels)
# Backward and optimize
if (epoch) % print_progress_every_n_epochs == 0:
print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, i+1, total_step, loss.item()))
predicted_test = []
model.eval() # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance)
probs_l = []
predicted_values = []
actual_values = []
labels_l = []
with torch.no_grad():
for images, labels in test_loader_subset:
images =
labels =
outputs = model(images)
_, predicted = torch.max(, 1)
sm = torch.nn.Softmax()
probabilities = sm(outputs)
if (epoch) % 1 == 0:
print('test accuracy : ', 100 * len((np.where(np.array(predicted_values[0])==(np.array(actual_values[0])))[0])) / len(actual_values[0]))
Output of model (12665 & 2115 represents the training and test set sizes) :
Epoch [1/2], Step [50/198], Loss: 0.1256
Epoch [2/2], Step [50/198], Loss: 0.0151
test accuracy : 99.76359338061465
/anaconda3/envs/pytorch/lib/python3.7/site-packages/ UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
Here's my 2 cents on your binary experiment.
It would seem that you have severely reduced the complexity of your dataset, and with the high number of neurons in your intermediate layers, your model is expected to converge very quickly.
Note that the MNIST dataset has a single channel, which makes the task very simple.
You may try to play with CIFAR10 and see if you are still getting high accuracy in just 2 epochs.
That's not a particularly well-posed question, because what is expected is entirely subjective. That being said, I am not surprised because 0 and 1 are very different digits. For instance, 0 has background surrounded by foreground, whereas 1 does not - that's an almost infallible test to distinguish the two. As a sanity check, I would swap out 0 for 7, which is similar to 1. I would expect to see significantly lower success rate. That being said, that's a sanity check - even if it passes, there may still be bugs or errors in your method.

Problems with PyTorch MLP when training the MNIST dataset retrieved from Keras

I have finished a PyTorch MLP model for the MNIST dataset, but got two different results: 0.90+ accuracy when using MNIST dataset from PyTorch, but ~0.10 accuracy when using MNIST dataset from Keras.
Below is my code with dependency: PyTorch 0.3.0.post4, keras 2.1.3, tensorflow backend 1.4.1 gpu version.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import torch as pt
import torchvision as ptv
from keras.datasets import mnist
from torch.nn import functional as F
from import Dataset, DataLoader
# training data from PyTorch
train_set = ptv.datasets.MNIST("./data/mnist/train", train=True, transform=ptv.transforms.ToTensor(), download=True)
test_set = ptv.datasets.MNIST("./data/mnist/test", train=False, transform=ptv.transforms.ToTensor(), download=True)
train_dataset = DataLoader(train_set, batch_size=100, shuffle=True)
test_dataset = DataLoader(test_set, batch_size=10000, shuffle=True)
class MLP(pt.nn.Module):
    """The multi-layer perceptron (784 -> 512 -> 128 -> 10)."""

    def __init__(self):
        super(MLP, self).__init__()
        self.fc1 = pt.nn.Linear(784, 512)
        self.fc2 = pt.nn.Linear(512, 128)
        self.fc3 = pt.nn.Linear(128, 10)
        self.use_gpu = True

    def forward(self, din):
        """Flatten ``din`` to rows of 784 values and return ten scores per row."""
        din = din.view(-1, 28 * 28)
        dout = F.relu(self.fc1(din))
        dout = F.relu(self.fc2(dout))
        # No softmax here: the raw scores of the last layer are returned.
        return self.fc3(dout)
model = MLP().cuda()
# loss func and optim
optimizer = pt.optim.SGD(model.parameters(), lr=1)
criterion = pt.nn.CrossEntropyLoss().cuda()
def evaluate_acc(pred, label):
    """Return the fraction of rows of ``pred`` whose arg-max equals ``label``.

    Args:
        pred: 2-D tensor of per-class scores, one row per sample.
        label: 1-D tensor of class indices, one per sample.

    Returns:
        Mean of the per-sample hits (1.0 for a match, 0.0 otherwise) as a
        NumPy float32 scalar.
    """
    # Move both tensors to host memory and view them as NumPy arrays.
    pred = pred.cpu().data.numpy()
    label = label.cpu().data.numpy()
    hits = np.argmax(pred, 1) == label
    return np.mean(np.float32(hits))
def evaluate_loader(loader):
    """Print the mean per-batch accuracy of the module-level ``model`` on ``loader``.

    Each batch is moved to the GPU, scored with ``model`` and rated with
    ``evaluate_acc``; the unweighted average over batches is printed.

    Args:
        loader: iterable yielding ``(inputs, labels)`` batches.
    """
    print("evaluating ...")
    accuracy_list = []
    for inputs, labels in loader:
        inputs = pt.autograd.Variable(inputs).cuda()
        labels = pt.autograd.Variable(labels).cuda()
        outputs = model(inputs)
        accuracy_list.append(evaluate_acc(outputs, labels))
    if not accuracy_list:
        # An empty loader would otherwise cause a division by zero below.
        print("no batches to evaluate")
        return
    print(sum(accuracy_list) / len(accuracy_list))
def training(d, epochs):
    """Train the module-level ``model`` on loader ``d`` for ``epochs`` passes.

    Uses the module-level ``model``, ``optimizer`` and ``criterion``. On every
    200th batch the accuracy on that batch is printed.

    Args:
        d: iterable yielding ``(inputs, labels)`` batches.
        epochs: number of full passes over ``d``.
    """
    for _ in range(epochs):
        for i, (inputs, labels) in enumerate(d):
            inputs = pt.autograd.Variable(inputs).cuda()
            labels = pt.autograd.Variable(labels).cuda()
            # Gradients accumulate across backward() calls, so clear them first.
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            # Without the backward pass and optimizer step the weights never change.
            loss.backward()
            optimizer.step()
            if i % 200 == 0:
                print(i, ":", evaluate_acc(outputs, labels))
# Training MLP for 4 epochs with MNIST dataset from PyTorch
training(train_dataset, 4)
# The accuracy is ~0.96.
def load_mnist():
    """Load MNIST through Keras and reshape it for the PyTorch model.

    Images are reshaped to (-1, 1, 28, 28) and cast to float32; labels are
    cast to int64. Only the dtype of the pixel values is changed; they are
    not rescaled.

    Returns:
        ``(x, y, x_test, y_test)`` as NumPy arrays.
    """
    (x, y), (x_test, y_test) = mnist.load_data()
    x = x.reshape((-1, 1, 28, 28)).astype(np.float32)
    x_test = x_test.reshape((-1, 1, 28, 28)).astype(np.float32)
    y = y.astype(np.int64)
    y_test = y_test.astype(np.int64)
    print("x.shape", x.shape, "y.shape", y.shape,
          "\nx_test.shape", x_test.shape, "y_test.shape", y_test.shape)
    return x, y, x_test, y_test
class TMPDataset(Dataset):
    """Dataset for loading the Keras MNIST arrays.

    Args:
        a: indexable collection of samples.
        b: indexable collection of labels; its length defines the dataset size.
    """

    def __init__(self, a, b):
        self.x = a
        self.y = b

    def __getitem__(self, item):
        """Return the ``(sample, label)`` pair at position ``item``."""
        return self.x[item], self.y[item]

    def __len__(self):
        """Return the number of labels."""
        return len(self.y)
x_train, y_train, x_test, y_test = load_mnist()
# Create dataloader for MNIST dataset from Keras.
test_loader = DataLoader(TMPDataset(x_test, y_test), num_workers=1, batch_size=10000)
train_loader = DataLoader(TMPDataset(x_train, y_train), shuffle=True, batch_size=100)
# Evaluate the performance of the MLP trained on the PyTorch dataset; the accuracy is ~0.96.
model = MLP().cuda()
optimizer = pt.optim.SGD(model.parameters(), lr=1)
criterion = pt.nn.CrossEntropyLoss().cuda()
# Train now on MNIST dataset from Keras.
training(train_loader, 4)
# Evaluate the trained model on the MNIST dataset from Keras; this results in a performance of ~0.10...
I had checked some samples from Keras MNIST dataset and found no error.
I am wondering what is wrong with the datasets?
The code can run without error, run it to see the results.
The MNIST data coming from Keras are not normalized; following the Keras MNIST MLP example, you should do it manually, i.e. you should include the following in your load_mnist() function:
x /= 255
x_test /= 255
Not sure about PyTorch, but it would seem that the MNIST data from their own utility functions come already normalized (as is the case with Tensorflow - see the third point in my answer here).
A 10% accuracy (i.e. equivalent to random guessing) in case of not-normalized input data is perfectly consistent.

Keras save model issue

This is a variational autoencoder network. I have to define a sampling method to generate the latent z, and I think something might be wrong with it. This py file does the training and another py file does the online prediction, so I need to save the Keras model. Saving the model works fine, but when I load the model from the 'h5' file, it shows an error:
NameError: name 'latent_dim' is not defined
The following is code:
df_test = df[df['label']==cluster_num].iloc[:,:data_num.shape[1]]
data_scale_ = preprocessing.StandardScaler().fit(df_test.values)
data_num_ = data_scale.transform(df_test.values)
batch_size = data_num_.shape[0]//10
original_dim = data_num_.shape[1]
latent_dim = data_num_.shape[1]*2
intermediate_dim = data_num_.shape[1]*10
nb_epoch = 1
epsilon_std = 0.001
x = Input(shape=(original_dim,))
init_drop = Dropout(0.2, input_shape=(original_dim,))(x)
h = Dense(intermediate_dim, activation='relu')(init_drop)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)
def sampling(args):
z_mean, z_log_var = args
epsilon = K.random_normal(shape=(latent_dim,), mean=0.,
return z_mean + K.exp(z_log_var / 2) * epsilon
# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])
# we instantiate these layers separately so as to reuse them later
decoder_h = Dense(intermediate_dim, activation='relu')
decoder_mean = Dense(original_dim, activation='linear')
h_decoded = decoder_h(z)
x_decoded_mean = decoder_mean(h_decoded)
def vae_loss(x, x_decoded_mean):
    """Return the VAE loss: scaled reconstruction error plus a KL term.

    The reconstruction term is ``objectives.mae`` multiplied by the
    module-level ``original_dim``. The KL term is computed from the
    module-level ``z_mean`` and ``z_log_var`` tensors, summed over the last
    axis.

    Args:
        x: the original input batch.
        x_decoded_mean: the decoder's reconstruction of ``x``.
    """
    xent_loss = original_dim * objectives.mae(x, x_decoded_mean)
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    return xent_loss + kl_loss
vae = Model(x, x_decoded_mean)
vae.compile(optimizer=Adam(lr=0.01), loss=vae_loss)
train_ratio = 0.95
train_num = int(data_num_.shape[0]*train_ratio)
x_train = data_num_[:train_num,:]
x_test = data_num_[train_num:,:], x_train,
validation_data=(x_test, x_test))'./models/deep_learning_'+str(cluster_num)+'.h5')
del vae
from keras.models import load_model
vae = load_model('./models/deep_learning_'+str(cluster_num)+'.h5')
It shows error:
NameError: name 'latent_dim' is not defined
For the variational loss you are using many variables not known to the Keras module. You need to pass them through the custom_objects parameter of the load_model function.
In your case:'./vae_'+str(cluster_num)+'.h5')
del vae
from keras.models import load_model
vae = load_model('./vae_'+str(cluster_num)+'.h5', custom_objects={'latent_dim': latent_dim, 'epsilon_std': epsilon_std, 'vae_loss': vae_loss})
If you load the model (.h5) file in your new py file, you can use load_model('/.h5', compile = False),
because you do not need any custom objects (i.e. the loss function, latent_dim, etc.) in the prediction step.
