Accuracy value goes up and down on the training process - machine-learning

After training the network I noticed that accuracy goes up and down. Initially I thought it is caused by the learning rate, but it is set to quite small value. Please check the screenshot attached.
Plot Accuracy Screenshot
My network (in Pytorch) looks as follow:
class Network(nn.Module):
def __init__(self):
super(Network,self).__init__()
self.layer1 = nn.Sequential(
nn.Conv2d(3,16,kernel_size=3),
nn.ReLU(),
nn.MaxPool2d(2)
)
self.layer2 = nn.Sequential(
nn.Conv2d(16,32, kernel_size=3),
nn.ReLU(),
nn.MaxPool2d(2)
)
self.layer3 = nn.Sequential(
nn.Conv2d(32,64, kernel_size=3),
nn.ReLU(),
nn.MaxPool2d(2)
)
self.fc1 = nn.Linear(17*17*64,512)
self.fc2 = nn.Linear(512,1)
self.relu = nn.ReLU()
self.sigmoid = nn.Sigmoid()
def forward(self,x):
out = self.layer1(x)
out = self.layer2(out)
out = self.layer3(out)
out = out.view(out.size(0),-1)
out = self.relu(self.fc1(out))
out = self.fc2(out)
out = torch.sigmoid(out)
return out
I am using RMSprop as optimizer and BCELoss as criterion. The learning rate is set to 0.001
Here is the training process:
epochs = 15
itr = 1
p_itr = 100
model.train()
total_loss = 0
loss_list = []
acc_list = []
for epoch in range(epochs):
for samples, labels in train_loader:
samples, labels = samples.to(device), labels.to(device)
optimizer.zero_grad()
output = model(samples)
labels = labels.unsqueeze(-1)
labels = labels.float()
loss = criterion(output, labels)
loss.backward()
optimizer.step()
total_loss += loss.item()
scheduler.step()
if itr%p_itr == 0:
pred = torch.argmax(output, dim=1)
correct = pred.eq(labels)
acc = torch.mean(correct.float())
print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, acc))
loss_list.append(total_loss/p_itr)
acc_list.append(acc)
total_loss = 0
itr += 1
My dataset is quite small - 2000 train and 1000 validation (binary classification 0/1). I wanted to do the 80/20 split but I was asked to keep it like that. I was thinking that the architecture might be too complex for such a small dataset.
Any hits what may cause such jumps in the training process?

Your code here is wrong: pred = torch.argmax(output, dim=1)
This line using for multiclass classification with Cross-Entropy Loss.
Your task is binary classification so the pred values are wrong. Change to:
if itr%p_itr == 0:
pred = torch.round(output)
....
You can change your optimizer to Adam, SGD, or RMSprop to find the suitable optimizer that helps your model coverage faster.
Also change the forward() function:
def forward(self,x):
out = self.layer1(x)
out = self.layer2(out)
out = self.layer3(out)
out = out.view(out.size(0),-1)
out = self.relu(self.fc1(out))
out = self.fc2(out)
return self.sigmoid(out) #use your forward is ok, but this cleaner

Related

validation loss is not changing at all (pytorch)

I'm using PyTorch for the first time to train my sentiment analysts model using Bert's pre-trained model.
this is my classifier
class SentimentClassifier2(nn.Module):
def __init__(self, n_classes):
super(SentimentClassifier2, self).__init__()
D_in, H, D_out = 768, 200, 3
self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
self.drop = nn.Dropout(p=0.4)
self.classifier = nn.Sequential(
nn.Linear(D_in, H),
nn.ReLU(),
nn.Linear(H, D_out)
)
def forward(self, input_ids, attention_mask):
_, pooled_output = self.bert( input_ids=input_ids, attention_mask=attention_mask, return_dict = False)
output = self.drop(pooled_output)
logits = self.classifier(output)
return logits
this is my optimizer/loss function (I'm doing only 20 epochs cause it takes a while to train )
EPOCHS = 20
model2 = SentimentClassifier2(len(class_names))
model2= model2.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=True)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=0,
num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)
training & evaluation code
def train_epoch( model, data_loader, loss_fn,optimizer, device, scheduler, n_examples):
model = model.train()
losses = []
correct_predictions = 0
for d in data_loader:
input_ids = d["input_ids"].to(device)
attention_mask = d["attention_mask"].to(device)
targets = d["targets"].to(device)
outputs = model(
input_ids=input_ids,
attention_mask=attention_mask
)
_, preds = torch.max(outputs, dim=1)
loss = loss_fn(outputs, targets)
correct_predictions += torch.sum(preds == targets)
losses.append(loss.item())
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
return correct_predictions.double() / n_examples, np.mean(losses)
def eval_model(model, data_loader, loss_fn, device, n_examples):
model = model.eval()
losses = []
correct_predictions = 0
with torch.no_grad():
for d in data_loader:
input_ids = d["input_ids"].to(device)
attention_mask = d["attention_mask"].to(device)
targets = d["targets"].to(device)
outputs = model(
input_ids=input_ids,
attention_mask=attention_mask
)
_, preds = torch.max(outputs, dim=1)
loss = loss_fn(outputs, targets)
correct_predictions += torch.sum(preds == targets)
losses.append(loss.item())
return correct_predictions.double() / n_examples, np.mean(losses)
my problem: the loss for the validation samples is not changing at all !!!
epoch1:______________________
Train loss 1.0145157482929346 accuracy 0.4185746994848311
Val loss 1.002384223589083 accuracy 0.4151087371232354
epoch2:______________________
Train loss 1.015038197996413 accuracy 0.41871780194619346
Val loss 1.002384223589083 accuracy 0.4151087371232354
epoch3:______________________
Train loss 1.014710763787351 accuracy 0.4188609044075558
Val loss 1.002384223589083 accuracy 0.4151087371232354
epoch4:______________________
Train loss 1.0139196826735648 accuracy 0.41909940850982635
Val loss 1.002384223589083 accuracy 0.4151087371232354
I don't understand what the problem is ...!
I would be grateful if someone could help me ☹
Maybe you can try to it. following,
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.zero_grad()
optimizer.step()
scheduler.step()

Pytorch LSTM Prediction not learning

I'm using a LSTM model to predict BABA stock price using this dataset: "/kaggle/input/price-volume-data-for-all-us-stocks-etfs/Data/Stocks/baba.us.txt".
I'm not sure why my model is not learning and the y_test_prediction is so different from the actual y_test. I really appreciate your help as I'm beginning to learn machine learning. Thank you!
I have scaled the data with minMaxScaler before splitting it. This is how I split the data:
x_train, y_train, x_test, y_test = [], [], [], []
lags = 3
for t in range(len(train_data)-lags-1):
x_train.append(train_data[t:(t+lags),:])
y_train.append(train_data[(t+lags),:])
for t in range(len(test_data)-lags-1):
x_test.append(test_data[t:(t+lags),:])
y_test.append(test_data[(t+lags),:])
x_train = torch.FloatTensor(np.array(x_train))
y_train = torch.FloatTensor(np.array(y_train))
x_test = torch.FloatTensor(np.array(x_test))
y_test = torch.FloatTensor(np.array(y_test))
x_train = np.reshape(x_train,(x_train.shape[0],x_train.shape[1],1))
x_test = np.reshape(x_test,(x_test.shape[0],x_test.shape[1],1))
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
This is my LSTM model:
input_dim = 1
hidden_layer_dim = 32
num_layers = 1
output_dim = 1
class LSTM(nn.Module):
def __init__(self, input_dim,hidden_layer_dim, num_layers, output_dim ):
super(LSTM, self).__init__()
self.input_dim = input_dim
self.hidden_layer_dim = hidden_layer_dim
self.num_layers = num_layers
self.output_dim = output_dim
self.lstm = nn.LSTM(input_dim, hidden_layer_dim,num_layers,batch_first = True)
self.fc = nn.Linear(hidden_layer_dim, output_dim)
def forward(self, x):
# initial hidden state & cell state as zeros
h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_layer_dim))
c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_layer_dim))
# lstm output with hidden and cell state
output, (hn, cn) = self.lstm(x, (h0,c0))
# get hidden state to be passed to dense layer
hn = hn.view(-1, self.hidden_layer_dim)
output = self.fc(hn)
return output
This is my training:
num_epochs = 100
learning_rate = 0.01
model = LSTM(input_dim,hidden_layer_dim, num_layers, output_dim)
loss = torch.nn.MSELoss() # mean-squared error for regression
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
hist = np.zeros(num_epochs)
# train model
for epoch in range(num_epochs):
outputs = model(x_train)
optimizer.zero_grad()
#get loss function
loss_fn = loss(outputs, y_train.view(1,-1))
hist[epoch] = loss_fn.item()
loss_fn.backward()
optimizer.step()
if epoch %10==0:
print("Epoch: %d, loss: %1.5f" % (epoch, hist[epoch]))
This is the training loss and prediction vs actual
training loss
prediction vs actual
You are initialising hidden layers every time forward is being called, which might cause errors with backprop. You do not even have to initialise them. PyTorch takes care of that for you. You can check this implementation for the details. Also, as a side note, you might want to take a look at PyTorch dataloaders(just an easier way to make splits).

tgt and src have to have equal features for a Transformer Network in Pytorch

I am attempting to train EEG data through a transformer network. The input dimensions are 50x16684x60 (seq x batch x features) and the output is 16684x2. Right now I am simply trying to run a basic transformer, and I keep getting an error telling me
RuntimeError: the feature number of src and tgt must be equal to d_model
Why would the source and target feature number ever be equal? Is it possible to run such a dataset through a transformer?
Here is my basic model:
input_size = 60 # seq x batch x features
hidden_size = 32
num_classes = 2
learning_rate = 0.001
batch_size = 64
num_epochs = 2
sequence_length = 50
num_layers = 2
dropout = 0.5
class Transformer(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(Transformer, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.transformer = nn.Transformer(60, 2)
self.fc = nn.Linear(hidden_size * sequence_length, num_classes)
def forward(self, x, y):
# Forward Propogation
out, _ = self.transformer(x,y)
out = out.reshape(out.shape[0], -1)
out = self.fc(out)
return out
model = Transformer(input_size, hidden_size, num_layers, num_classes)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
for index in tqdm(range(16684)):
X, y = (X_train[index], Y_train[index])
print(X.shape, y.shape)
output = model(X, y)
loss = criterion(output, y)
model.zero_grad()
loss.backward()
optimizer.step()
if index % 500 == 0:
print(f"Epoch {epoch}, Batch: {index}, Loss: {loss}")
You train the model to find some features by feeding it the input sequence and desired sequence. The backprop trains the net by computing the loss as a "difference" between src and target features.
If the features sizes aren't the same - the backprop can't find the accordance to some desired feature and the model can't be trained.

Why does the same PyTorch code (different implementation) give different loss?

I was tackling the Fashion MNIST data-set problem on Udacity. However my implementation of code is giving drastically different loss as compared to the solution shared by the Udacity team. I believe the only difference in my answer is the definition of the Neural Network and apart from that everything is the same. I am not able to figure out the reason for such a drastic difference in Loss.
Code 1: My solution:
import torch.nn as nn
from torch import optim
images, labels = next(iter(trainloader))
model = nn.Sequential(nn.Linear(784,256),
nn.ReLU(),
nn.Linear(256,128),
nn.ReLU(),
nn.Linear(128,64),
nn.ReLU(),
nn.Linear(64,10),
nn.LogSoftmax(dim=1))
# Flatten images
optimizer = optim.Adam(model.parameters(),lr=0.003)
criterion = nn.NLLLoss()
for i in range(10):
running_loss = 0
for images,labels in trainloader:
images = images.view(images.shape[0], -1)
output = model.forward(images)
loss = criterion(output,labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
else:
print(f"Training loss: {running_loss}")
# Loss is coming around 4000
Code 2: Official Solution:
from torch import nn, optim
import torch.nn.functional as F
class Classifier(nn.Module):
def __init__(self):
super().__init__()
self.fc1 = nn.Linear(784, 256)
self.fc2 = nn.Linear(256, 128)
self.fc3 = nn.Linear(128, 64)
self.fc4 = nn.Linear(64, 10)
def forward(self, x):
# make sure input tensor is flattened
x = x.view(x.shape[0], -1)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = F.log_softmax(self.fc4(x), dim=1)
return x
model = Classifier()
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)
epochs = 5
for e in range(epochs):
running_loss = 0
for images, labels in trainloader:
log_ps = model(images)
loss = criterion(log_ps, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
running_loss += loss.item()
else:
print(f"Training loss: {running_loss}")
# Loss is coming around 200
Is there any explanation for the vast difference in loss ?
You forgot to zero out/clear the gradients in your implementation. That is you are missing :
optimizer.zero_grad()
In other words simply do:
for i in range(10):
running_loss = 0
for images,labels in trainloader:
images = images.view(images.shape[0], -1)
output = model.forward(images)
loss = criterion(output,labels)
# missed this!
optimizer.zero_grad()
loss.backward()
optimizer.step()
running_loss += loss.item()
else:
print(f"Training loss: {running_loss}")
and you are good to go!

Tensorflow RNN stuck at high cost

The following RNN model decreases the loss for the first one or two epochs and then fluctuates around the cost of 6. This seems like the model is so random and not learning at all. I varied the learning rate from 0.1 to 0.0001 and it didn't help. The data is fed with an input pipeline, which worked fine with other models, so the functions that extract the label and images are not presented here. I have looked at this for so many times but still couldn't find what's wrong with it. Here's the code:
n_steps = 224
n_inputs = 224
learning_rate = 0.00015
batch_size = 256 # n_neurons
epochs = 100
num_batch = int(len(trainnames)/batch_size)
keep_prob = tf.placeholder(tf.float32)
# TRAIN QUEUE
train_queue = tf.RandomShuffleQueue(len(trainnames)*1.5, 0, [tf.string, tf.float32], shapes=[[],[num_labels,]])
enqueue_train = train_queue.enqueue_many([trainnames, train_label])
train_image, train_image_label = train_queue.dequeue()
train_image = read_image_file(train_image)
train_batch, train_label_batch = tf.train.batch(
[train_image, train_image_label],
batch_size=batch_size,
num_threads=1,
capacity=10*batch_size,
enqueue_many=False,
shapes=[[224,224], [num_labels,]],
allow_smaller_final_batch=True
)
train_close = train_queue.close()
def RNN(inputs, reuse):
with tf.variable_scope('cells', reuse=reuse):
basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=batch_size, reuse=reuse)
with tf.variable_scope('rnn'):
outputs, states = tf.nn.dynamic_rnn(basic_cell, inputs, dtype=tf.float32)
fc_drop = tf.nn.dropout(states, keep_prob)
logits = tf.contrib.layers.fully_connected(fc_drop, num_labels, activation_fn=None)
return logits
#Training
with tf.name_scope("cost_function") as scope:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=train_label_batch, logits=RNN(train_batch, reuse=None)))
train_step = tf.train.MomentumOptimizer(learning_rate, 0.9).minimize(cost)
cost_summary = tf.summary.scalar("cost_function", cost)
file_writer = tf.summary.FileWriter(logdir)
#Session
with tf.Session() as sess:
sess.run(tf.local_variables_initializer())
sess.run(tf.global_variables_initializer())
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord, start=True)
step = 0
for epoch in range(epochs):
sess.run(enqueue_train)
for batch in range(num_batch):
if step % 100 == 0:
summary_str = cost_summary.eval(feed_dict={keep_prob: 1.0})
file_writer.add_summary(summary_str, step)
else:
sess.run(train_step, feed_dict={keep_prob: 0.5})
step += 1
sess.run(train_close)
coord.request_stop()
coord.join(threads)
file_writer.close()

Resources