How to deal with big dataset when using pyG? - pytorch-geometric

I am a beginer learning to using torch_geometric to build my GNN models. I refered the sample of the pyG example of node classification and build my own dataset, however, I tried to use my GPU to run the code and it tells me that it run out of memory, maybe my dataset is too large to allocate the GPU memory? I don't know. I shared an machine of 8 A100 with my classmates. Could you please give me some suggestions, thank you!
from torch_geometric.nn import GATConv,GCNConv
from torch_geometric.data import Dataset,DataLoader,HeteroData,Data
import torch.nn as nn
from torch_geometric.nn import DataParallel
from torch_geometric.loader import DataListLoader
import torch.nn.functional as F
import torch
import pandas as pd
from transformers import BertTokenizer,BertModel
import pickle
import time
from tqdm import tqdm
from numba import jit
import json
from torch.optim import lr_scheduler
import matplotlib.pyplot as plt
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
plt.grid(True)
plt.grid(color='gray',
linestyle='--',
linewidth=1,
alpha=0.3)
begin = time.time()
punctuation = "!#$%&'\(\)-*+,-./:;<=>?#\\\[\]^_`{|}~():;,。【】·、“”‘’《》\"%……——·"
def dataCleanifier(s):
for i in punctuation:
s.replace(i," ")
s = s.replace(" "," ")
s = s.replace("\n","")
return s
class BertClassifier(nn.Module):
def __init__(self,bertType:str,max_length,tag_size):
super(BertClassifier,self).__init__()
self.bertType = bertType
self.tokenizer = BertTokenizer.from_pretrained(self.bertType)
self.encoder = BertModel.from_pretrained(self.bertType)
self.outputDim = self.encoder.pooler.dense.out_features
self.max_length = max_length
self.tag_size = tag_size
self.dropout = nn.Dropout(0.1)
self.activation = nn.LeakyReLU(0.1)
self.convs = nn.ModuleList(
[nn.Conv2d(1, 512, (k, self.outputDim)) for k in (2,3,4)])
self.fc_cnn = nn.Linear(512 * len((2,3,4)), self.tag_size)
def conv_and_pool(self, x, conv):
x = F.relu(conv(x)).squeeze(3)
x = F.max_pool1d(x, x.size(2)).squeeze(2)
return x
def forward(self,x):
x = self.tokenizer.batch_encode_plus(x,return_tensors="pt",max_length=self.max_length,truncation=True,padding="max_length")
attention = x["attention_mask"]
x = x["input_ids"]
x = x.cuda(2)
x = self.encoder(x,attention_mask=attention.cuda(2))['last_hidden_state'][:]
x = x.unsqueeze(1)
encoded = torch.cat([self.conv_and_pool(x,conv) for conv in self.convs],1)
x = self.fc_cnn(encoded)
x = self.activation(x)
# x = F.softmax(x,dim=1)
return x,encoded
class ContrastiveLoss(nn.Module):
def __init__(self):
super(ContrastiveLoss, self).__init__()
def forward(self,representations,label,y_hat):
n = label.shape[0]
T = 0.5
similarity_matrix = F.cosine_similarity(representations.unsqueeze(1), representations.unsqueeze(0), dim=2)
mask = torch.ones_like(similarity_matrix) * (label.expand(n, n).eq(label.expand(n, n).t()))
mask_no_sim = torch.ones_like(mask) - mask
mask_dui_jiao_0 = torch.ones(n ,n) - torch.eye(n, n )
similarity_matrix = torch.exp(similarity_matrix/T)
similarity_matrix = similarity_matrix*mask_dui_jiao_0
sim = mask*similarity_matrix
no_sim = similarity_matrix - sim
no_sim_sum = torch.sum(no_sim , dim=1)
no_sim_sum_expend = no_sim_sum.repeat(n, 1).T
sim_sum = sim + no_sim_sum_expend
loss = torch.div(sim , sim_sum)
loss = mask_no_sim + loss + torch.eye(n, n )
#接下来就是算一个批次中的loss了
loss = -torch.log(loss) #求-log
loss = torch.sum(torch.sum(loss, dim=1) )/(2*n)+nn.CrossEntropyLoss()(y_hat,label)
return loss
class GAT(nn.Module):
def __init__(self, hidden_channels) -> None:
super().__init__()
self.conv1 = GATConv(data.num_features,hidden_channels)
self.conv2 = GATConv(hidden_channels,9)
self.activation = nn.ReLU()
def forward(self,x,edge_index):
x = self.conv1(x,edge_index)
x = self.activation(x)
# print(x)
# x = F.dropout(x,p=0.2)
x = self.conv2(x,edge_index)
return x
x=None
y=None
edge_index = None
train_mask = None
with open("X.pkl","rb") as f1:
x = pickle.load(f1)
with open("Y.pkl","rb") as f2:
y = pickle.load(f2)
y = y.long()
with open("edge_index.pkl","rb") as f3:
edge_index = pickle.load(f3)
# print(edge_index.shape)
with open("train_mask.pkl","rb") as f4:
train_mask = pickle.load(f4)
data = Data(x=x,y=y,edge_index=edge_index)
data.train_mask = train_mask
model = GAT(hidden_channels=32)
model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
scheduler = lr_scheduler.StepLR(optimizer, 100, 0.8)
criterion = ContrastiveLoss()
def train():
model.train()
optimizer.zero_grad() # Clear gradients.
out = model(data.x,data.edge_index) # Perform a single forward pass.
loss = criterion(data.x[data.train_mask], data.y[data.train_mask],out[data.train_mask]) # Compute the loss solely based on the training nodes.
loss.backward() # Derive gradients.
optimizer.step() # Update parameters based on gradients.
return loss
def test():
model.eval()
out = model(data.x, data.edge_index)
pred = out.argmax(dim=1) # Use the class with highest probability.
test_correct = pred[data.train_mask] == data.y[data.train_mask] # Check against ground-truth labels.
test_acc = int(test_correct.sum()) / int(data.train_mask.sum()) # Derive ratio of correct predictions.
return test_acc
accs = []
for epoch in range(1, 1025):
loss = train()
print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}',end=" ")
acc = test()
print("acc:",acc)
accs.append(acc)
scheduler.step()
plt.plot(range(len(accs)),accs)
print(time.time()-begin)
with open("./accs_gat_GCL.pkl","wb") as f1:
pickle.dump(accs,f1)
plt.savefig("./res_GAT_GCL.png",dpi=600)
I have tried to use DataPararel to use multiple GPU to load my model and dataset but failed.

Related

How to use variable learning rate that decreases with loss using pytorch-geometric?

I have the following code snippet from PyTorch geometric example. I want to use a learning rate that decreases as the loss value during training decreases. I tried using scheduler but that didn't work for me.
A clean code-snippet is below. Can anyone provide valuable suggestions or help on this matter?
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.nn import AGNNConv
dataset = 'Cora'
path = 'Cora'
dataset = Planetoid(path, dataset, transform=T.NormalizeFeatures())
data = dataset[0]
class Net(torch.nn.Module):
def __init__(self):
super(Net, self).__init__()
self.lin1 = torch.nn.Linear(dataset.num_features, 16)
self.prop1 = AGNNConv(requires_grad=False)
self.prop2 = AGNNConv(requires_grad=True)
self.lin2 = torch.nn.Linear(16, dataset.num_classes)
def forward(self):
x = F.dropout(data.x, training=self.training)
x = F.relu(self.lin1(x))
x = self.prop1(x, data.edge_index)
x = self.prop2(x, data.edge_index)
x = F.dropout(x, training=self.training)
x = self.lin2(x)
return F.log_softmax(x, dim=1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model, data = Net().to(device), data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
def train():
model.train()
optimizer.zero_grad()
F.nll_loss(model()[data.train_mask], data.y[data.train_mask]).backward()
optimizer.step()
def test():
model.eval()
logits, accs = model(), []
for _, mask in data('train_mask', 'val_mask', 'test_mask'):
pred = logits[mask].max(1)[1]
acc = pred.eq(data.y[mask]).sum().item() / mask.sum().item()
accs.append(acc)
return accs
best_val_acc = test_acc = 0
for epoch in range(1, 201):
train()
train_acc, val_acc, tmp_test_acc = test()
if val_acc > best_val_acc:
best_val_acc = val_acc
test_acc = tmp_test_acc
log = 'Epoch: {:03d}, Train: {:.4f}, Val: {:.4f}, Test: {:.4f}'
print(log.format(epoch, train_acc, best_val_acc, test_acc))

How can I avoid underfitting in Pytorch NeuralNetwork

How can I avoid underfitting in Pytorch NeuralNetwork?
I try to predict the power consumtion of a plant based on seven features. I have built two simple neural network models.
The first one is a Linear model, and the second is a RNN model. However, both models perform bad in the test set, their forecast result is a straight line.
Something about data
There are about 360 samples in the CSV file. I take the first 300 samples for trainning and the others for test. The first 7 columns of raw data are features of daily operation. The last column is the electricity consumption of every day.
Setting of training set
In the linear model, train data is the first 7 colums of a certain day, and corresponding target is the power consumption of that day.
In the RNN model, train data is all the 8 columns of three days(seven features and power consumption), and corresponding traget is the power consumption of next three days.
Code
Code of RNN model
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as f
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from matplotlib import pyplot as plt
'''
build simple RNN
'''
batchSize = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
netPath = ''
'''Data processing'''
# read raw data
filePath = 'F:/.csv'
initialData = pd.read_csv(filePath)
print(initialData.head(10))
print('hello world')
# Separate features and power consumption.
trainDatas = initialData.iloc[0:7, 1:301]
trainPowerConsum = pd.DataFrame(initialData.iloc[-1, 1:301]).T
trainDatas = pd.concat([trainDatas, trainPowerConsum], 0)
trainPowerConsum = initialData.iloc[-1, 2:302]
# Plot
powerConsumPlot = trainDatas.iloc[-1, :]
xData = np.linspace(1, powerConsumPlot.shape[0], 300)
plt.plot(xData, powerConsumPlot)
plt.show()
testDatas = initialData.iloc[0:7, 302:-1]
testPowerConsum = pd.DataFrame(initialData.iloc[-1, 302:-1]).T
testDatas = pd.concat([testDatas, testPowerConsum], 0)
testPowerConsum = initialData.iloc[-1, 303:]
# convert to dataframe
trainDatas = pd.DataFrame(trainDatas)
trainDatas = trainDatas.T
trainPowerConsum = pd.DataFrame(trainPowerConsum)
testDatas = pd.DataFrame(testDatas)
testDatas = testDatas.T
testPowerConsum = pd.DataFrame(testPowerConsum)
# change the unit of PowerConsumption
trainDatas.iloc[:, -1] = trainDatas.iloc[:, -1] * 1000
testDatas.iloc[:, -1] = testDatas.iloc[:, -1] * 1000
trainPowerConsum.iloc[:, 0] = trainPowerConsum.iloc[:, 0] * 1000
testPowerConsum.iloc[:, 0] = testPowerConsum.iloc[:, 0] * 1000
assert testPowerConsum.shape[0] == testDatas.shape[0]
assert trainDatas.shape[0] == trainPowerConsum.shape[0]
# convert dataframe to tensor
trainDatas = torch.tensor(trainDatas.values.astype(float), device=device)
trainPowerConsum = torch.tensor(trainPowerConsum.values.astype(float), device=device)
testDatas = torch.tensor(testDatas.values.astype(float), device=device)
testPowerConsum = torch.tensor(testPowerConsum.values.astype(float), device=device)
trainDatasList = list()
trainPowerConsumList = list()
for i in range(298):
trainDatasList.append(trainDatas[i:i + 3])
trainPowerConsumList.append(trainPowerConsum[i:i + 3])
from torch.nn.utils.rnn import pad_sequence
trainPowerConsum = pad_sequence(trainPowerConsumList, batch_first=True)
trainDatas = pad_sequence(trainDatasList, batch_first=True)
print(trainDatas.shape)
# ensure the batch_size of test data is 1
testDatas = torch.unsqueeze(testDatas, dim=0)
testPowerConsum = torch.unsqueeze(testPowerConsum, dim=0)
'''build dataloader'''
trainDataLoader = DataLoader(
TensorDataset(
trainDatas, trainPowerConsum
),
shuffle=True, batch_size=batchSize, drop_last=True)
print('Data is ready')
seqLen = 2
inputDim = 8
hiddenSize = 3
numLayer = 2
learningRate = 0.01
class RNNModel(torch.nn.Module):
def __init__(self, inputsize, hiddensize, batchsize, numLayer):
super(RNNModel, self).__init__()
self.batchsize = batchsize
self.inputsize = inputsize
self.hiddensize = hiddensize
self.numlayers = numLayer
self.rnn = torch.nn.RNN(input_size=self.inputsize, hidden_size=self.hiddensize, num_layers=self.numlayers,
batch_first=True)
self.l1 = torch.nn.Linear(hiddenSize, hiddensize)
self.l2 = torch.nn.Linear(hiddenSize, 1)
def forward(self, input, hidden):
out, hidden = self.rnn(input.float(), hidden.float())
batch_size, seq_len, input_dim = out.shape
out = out.reshape(-1, input_dim)
# out = f.sigmoid(self.l1(out))
out = f.relu(self.l1(out))
out = self.l2(out)
out = out.reshape(batch_size, seq_len, -1)
return out, hidden
def initHidden(self):
hidden = torch.zeros(self.numlayers, self.batchsize, self.hiddensize, device=device, dtype=torch.float64)
return hidden
net = RNNModel(inputDim, hiddenSize, batchSize, numLayer).to(device)
criterion = torch.nn.L1Loss()
optimizer = optim.Adam(net.parameters(), lr=learningRate,momentum=0.01)
def train(epoch):
runLoss = 0.
optimizer.zero_grad()
hidden = net.initHidden()
for batchIndex, data in enumerate(trainDataLoader, 0):
inputs, target = data
optimizer.zero_grad()
outputs, hidden = net(inputs, hidden)
hidden = hidden.detach()
loss = criterion(outputs.float(), target.float())
loss = loss.mean()
loss.backward()
optimizer.step()
print(f'{epoch + 1},\t Loss={loss.item()}')
# torch.save(net.state_dict(), netPath)
def test():
testDatasVice = torch.clone(testDatas)
input = testDatasVice[:, 0, :]
input = input.view(1, 1, -1)
assert input.shape[2] == 8
predictPowConsum = list()
# the first hidden tensor in test set is zero
hidden = torch.zeros(2, 1, 3, device=device, dtype=torch.float64)
with torch.no_grad():
for i in range(testDatas.shape[1]):
output, hidden = net(input, hidden)
if i < 51:
testDatasVice[:, i + 1, -1] = output[0]
input = torch.unsqueeze(testDatasVice[:, i + 1, :], dim=0)
predictPowConsum.append(output.data.cpu().numpy().ravel()[0])
elif i == 51:
predictPowConsum.append(output.data.cpu().numpy().ravel()[0])
else:
print('\tindexError') # Exclude potential Errors
return predictPowConsum
if __name__ == '__main__':
epochNum = 300
for epoch in range(epochNum):
train(epoch)
predictPowConsum = test()
# plotting
xData = np.arange(303, 303 + testPowerConsum.size(1))
plt.plot(xData, testPowerConsum.cpu().numpy()[0, :, 0])
plt.plot(xData, predictPowConsum)
plt.show()
Code of Linear model
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as f
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from matplotlib import pyplot as plt
filePath = 'F:.csv'
initialData = pd.read_csv(filePath)
print(initialData.head(10))
print('hello world')
trainDatas = initialData.iloc[0:7, 1:300]
trainPowerConsum = initialData.iloc[-1, 1:300]
testDatas = initialData.iloc[0:7, 300:-1]
testPowerConsum = initialData.iloc[-1, 300:-1]
trainDatas = pd.DataFrame(trainDatas)
trainDatas = trainDatas.T
trainPowerConsum = pd.DataFrame(trainPowerConsum)
testDatas = pd.DataFrame(testDatas)
testDatas = testDatas.T
testPowerConsum = pd.DataFrame(testPowerConsum)
trainPowerConsum.iloc[:, 0] = trainPowerConsum.iloc[:, 0] * 1000
testPowerConsum.iloc[:, 0] = testPowerConsum.iloc[:, 0] * 1000
# build dataloader
trainData = DataLoader(
TensorDataset(
torch.tensor(trainDatas.values).float(),
torch.tensor(trainPowerConsum.values.astype(float)).float()
),
shuffle=True, batch_size=15)
testData = DataLoader(
TensorDataset(
torch.tensor(testDatas.values.astype(float)).float(),
torch.tensor(testPowerConsum.values.astype(float)).float()
),
shuffle=False, batch_size=15)
print('data is ready')
class SimpleNet(torch.nn.Module):
def __init__(self):
super(SimpleNet, self).__init__()
self.l1 = torch.nn.Linear(7, 15)
self.l2 = torch.nn.Linear(15, 30)
self.l3 = torch.nn.Linear(30, 15)
self.l4 = torch.nn.Linear(15, 5)
self.l5 = torch.nn.Linear(5, 1)
def forward(self, x):
x = f.relu(self.l1(x))
x = f.relu(self.l2(x))
x = f.relu(self.l3(x))
x = f.relu(self.l4(x))
return self.l5(x)
model = SimpleNet()
criterion = torch.nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001)
def train(epoch):
runLoss = 0.
for batch_index, data in enumerate(trainData, 0):
inputs, target = data
optimizer.zero_grad()
outputs = model(inputs)
loss = criterion(outputs, target)
loss.backward()
optimizer.step()
runLoss += loss
print(f'{epoch + 1},{batch_index + 1},\tLoss={runLoss / 5}')
runLoss = 0
def test(epoch):
totalError = 0.
print('Start to test the model')
with torch.no_grad():
for data in testData:
# test ---------data for test
# testlab ---------corresponding power consumption
test, testlab = data
outputs = model(test)
predicted = outputs.data
testError = testlab - predicted
# plotting
if epoch % 50 == 2:
xData = np.linspace(1, 15, 15)
if predicted.size(0) != 15:
pass
else:
plt.plot(xData, predicted[:, 0].numpy(), label='predicted', color='red')
plt.plot(xData, testlab[:, 0].numpy(), label='origData', color='blue')
plt.show()
totalError += (torch.abs(testError).sum().item())
print(f'Average Error on test set is {totalError / 54}')
if __name__ == '__main__':
for epoch in range(1000):
train(epoch)
test(epoch)
Image of Output
output of RNN
The blue line is the actual data, and the orange line is the output of RNN model.
Solutions and its Effect
I have looked around and apparently I've got the choice between these solutions:
Add new domain-specific features
Decrease the amount of regularization used
Increase the duration of training
Increase the complexity or type of the model
Decrease the learning rate
Try other activate function
I have tried some solutions:
The data for trainning isn't regularized. I just change the unit of electricity from kWh to Wh
I take ReLu as activate function after using Sigmoid, but it doesn't work
I adjust the learning rate from 0.01 to 0.001, it doesn't improve
I try different optimizer such as SGD and Adam on both model, even use momentum, it doesn't get better
The sequence length of RNN model is 60 firstly, then is set to 3. The loss dropped more rapidly in the latter case, but the forecast result still is a straight line
In a word, all solutions I find doesn't work.
Besides, if shuffle is True when building DataLoader, the loss skips violently between epochs. But it drops slowly and close to an constant eventually when shuffle is False.
What could be the best way to avoid the problem?
Thanks in advance!

Multi GPU training in Pytorch causes my system to freeze

When I wrap my model in nn.DataParallel(model) and start training my screen freezes and I have to manually restart the computer every time.
I've tried a few variations, like not adding .to(device) to every x and y, but whenever nn.DataParallel is used I seem to cause the computer to freeze.
import random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import models, datasets, transforms
import torch.utils.data
DataLoader = torch.utils.data.DataLoader
random_split = torch.utils.data.random_split
global_rank = 0
MNIST = datasets.MNIST
class MLPClassifier(nn.Module):
def __init__(self):
super(MLPClassifier, self).__init__()
self.layer_1 = torch.nn.Linear(28 * 28, 128)
self.layer_2 = torch.nn.Linear(128, 444)
self.layer_3 = torch.nn.Linear(444, 333)
self.layer_4 = torch.nn.Linear(333, 10)
def forward(self, x):
x = x.view(x.size(0), -1)
x = self.layer_1(x)
x = F.relu(x)
x = self.layer_2(x)
x = F.relu(x)
x = self.layer_3(x)
x = F.relu(x)
x = self.layer_4(x)
return x
# Download data
if global_rank == 0:
mnist_train = MNIST(os.getcwd(), train=True, download=True)
mnist_test = MNIST(os.getcwd(), train=False, download=True)
# dist.barrier()
#transforms
transform=transforms.Compose([transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))])
mnist_train = MNIST(os.getcwd(), train=True, transform=transform)
# Split dataset
mnist_train, mnist_val = random_split(mnist_train, [55000, 5000])
mnist_test = MNIST(os.getcwd(), train=False, download=True)
# Build dataloaders
mnist_train = DataLoader(mnist_train, batch_size=256)
mnist_val = DataLoader(mnist_val, batch_size=256)
mnist_test = DataLoader(mnist_test, batch_size=256)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = MLPClassifier()
model = nn.DataParallel(model)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Train loop
model.train()
num_epochs = 1
for epoch in range(num_epochs):
for train_batch in mnist_train:
x, y = train_batch
logits = model(x.to(device))
loss = F.cross_entropy(logits, y.to(device))
print('rain loss: ', loss.item())
loss.backward()
optimizer.step()
optimizer.zero_grad()
# EVAL LOOP
model.eval()
with torch.no_grad():
val_loss_a = []
for val_batch in mnist_val:
x, y = val_batch
logits = model(x.to(device))
val_loss = F.cross_entropy(logits, y.to(device))
val_loss_a.append(val_loss)
avg_val_loss = torch.stack(val_loss_a).mean()
model.train()

How does one implement a meta-trainable step size in Pytorch?

I want to implement a (meta) trainable step size. I tried it with this post:
https://discuss.pytorch.org/t/how-does-one-have-the-parameters-of-a-model-not-be-leafs/70076/17
and with the higher library (https://github.com/facebookresearch/higher) with no luck...
I tried:
eta = torch.tensor([0.5], requires_grad=True).view(1)
inner_opt = torch.optim.Adam(child_model.parameters(), lr=eta)
#meta_params = itertools.chain(child_model.parameters(),eta.parameters())
meta_params = itertools.chain(child_model.parameters())
meta_opt = torch.optim.Adam(meta_params, lr=1e-3)
# do meta-training/outer training minimize outerloop: min_{theta} sum_t L^val( theta^{T} - eta* Grad L^train(theta^{T}) )
nb_outer_steps = 10 # note, in this case it's the same as number of meta-train steps (but it's could not be the same depending how you loop through the val set)
for outer_i, (outer_inputs, outer_targets) in enumerate(testloader, 0):
meta_opt.zero_grad()
if outer_i >= nb_outer_steps:
break
# do inner-training/MAML; minimize innerloop: theta^{T} - eta* Grad L^train(theta^{T}) ~ argmin L^train(theta)
nb_inner_steps = 3
with higher.innerloop_ctx(child_model, inner_opt) as (fmodel, diffopt):
with error:
Exception has occurred: RuntimeError
Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment
which wouldn't work anyway cuz eta might become negative suddenly so I really want to cap it with a sigmoid function but had to try something...
It thinks my step size NN is not in the graph but it is because of this line of code:
p_new = p + lr*g
group['params'][p_idx] = p_new
but somehow that is not enough to have gradients...
Full script self contained script:
import torch
import torch.nn as nn
from torch.optim.optimizer import Optimizer
import higher
from higher.optim import DifferentiableOptimizer
from higher.optim import DifferentiableSGD
import torchvision
import torchvision.transforms as transforms
from torchviz import make_dot
import copy
import itertools
from collections import OrderedDict
#mini class to add a flatten layer to the ordered dictionary
class Flatten(nn.Module):
def forward(self, input):
'''
Note that input.size(0) is usually the batch size.
So what it does is that given any input with input.size(0) # of batches,
will flatten to be 1 * nb_elements.
'''
batch_size = input.size(0)
out = input.view(batch_size,-1)
return out # (batch_size, *size)
def get_cifar10():
transform = transforms.Compose(
[transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
shuffle=True, num_workers=2)
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4,
shuffle=False, num_workers=2)
return trainloader, testloader
class MySGD(Optimizer):
def __init__(self, params, eta, prev_lr):
defaults = {'eta':eta, 'prev_lr':prev_lr}
super().__init__(params, defaults)
class TrainableSGD(DifferentiableOptimizer):
def _update(self, grouped_grads, **kwargs):
prev_lr = self.param_groups[0]['prev_lr']
eta = self.param_groups[0]['eta']
# start differentiable & trainable update
zipped = zip(self.param_groups, grouped_grads)
lr = 0.1*eta(prev_lr).view(1)
for group_idx, (group, grads) in enumerate(zipped):
for p_idx, (p, g) in enumerate(zip(group['params'], grads)):
if g is None:
continue
#group['params'][p_idx] = _add(p, -group['lr'], g)
p_new = p + lr*g
group['params'][p_idx] = p_new
# fake returns
self.param_groups[0]['prev_lr'] = lr
higher.register_optim(MySGD, TrainableSGD)
def main():
# get dataloaders
trainloader, testloader = get_cifar10()
criterion = nn.CrossEntropyLoss()
child_model = nn.Sequential(OrderedDict([
('conv1', nn.Conv2d(in_channels=3,out_channels=2,kernel_size=5)),
('relu1', nn.ReLU()),
('Flatten', Flatten()),
('fc', nn.Linear(in_features=28*28*2,out_features=10) )
]))
hidden = torch.randn(size=(1,1),requires_grad=True)
print(f'-> hidden = {hidden}')
eta = nn.Sequential(OrderedDict([
('fc', nn.Linear(1,1)),
('sigmoid', nn.Sigmoid())
]))
inner_opt = MySGD(child_model.parameters(), eta=eta, prev_lr=hidden)
meta_params = itertools.chain(child_model.parameters(),eta.parameters())
#meta_params = itertools.chain(eta.parameters(),[hidden])
meta_opt = torch.optim.Adam(meta_params, lr=1e-3)
# do meta-training/outer training minimize outerloop: min_{theta} sum_t L^val( theta^{T} - eta* Grad L^train(theta^{T}) )
print()
nb_outer_steps = 1 # note, in this case it's the same as number of meta-train steps (but it's could not be the same depending how you loop through the val set)
for outer_i, (outer_inputs, outer_targets) in enumerate(testloader, 0):
meta_opt.zero_grad()
if outer_i >= nb_outer_steps:
break
# do inner-training/MAML; minimize innerloop: theta^{T} - eta * Grad L^train(theta^{T}) ~ argmin L^train(theta)
nb_inner_steps = 3
#with higher.innerloop_ctx(child_model, inner_opt, copy_initial_weights=False) as (fmodel, diffopt):
with higher.innerloop_ctx(child_model, inner_opt) as (fmodel, diffopt):
for inner_i, (inner_inputs, inner_targets) in enumerate(trainloader, 0):
if inner_i >= nb_inner_steps:
break
logits = fmodel(inner_inputs)
inner_loss = criterion(logits, inner_targets)
print(f'--> inner_i = {inner_i}')
print(f'inner_loss^<{inner_i}>: {inner_loss}')
print(f'lr^<{inner_i-1}> = {diffopt.param_groups[0]["prev_lr"]}')
diffopt.step(inner_loss) # changes params P[t+1] using P[t] and loss[t] in a differentiable manner
print(f'lr^<{inner_i}> = {diffopt.param_groups[0]["prev_lr"]}')
print()
# compute the meta-loss L^val( theta^{T} - eta* Grad L^train(theta^{T}) )
outer_outputs = fmodel(outer_inputs)
meta_loss = criterion(outer_outputs, outer_targets) # L^val
make_dot(meta_loss).render('meta_loss',format='png')
meta_loss.backward()
#grad_of_grads = torch.autograd.grad(outputs=meta_loss, inputs=eta.parameters()) # dmeta_loss/dw0
print(f'----> outer_i = {outer_i}')
print(f'-> outer_loss/meta_loss^<{outer_i}>: {meta_loss}')
print(f'child_model.fc.weight.grad = {child_model.fc.weight.grad}')
print(f'hidden.grad = {hidden.grad}')
print(f'eta.fc.weight = {eta.fc.weight.grad}')
meta_opt.step() # meta-optimizer step: more or less theta^<t> := theta^<t> - meta_eta * Grad L^val( theta^{T} - eta* Grad L^train(theta^{T}) )
if __name__ == "__main__":
main()
print('---> Done\a')
notice the None's:
Files already downloaded and verifiedFiles already downloaded and verified
-> hidden = tensor([[0.8459]], requires_grad=True)
--> inner_i = 0
inner_loss^<0>: 2.2696359157562256
lr^<-1> = tensor([[0.8459]], requires_grad=True)
lr^<0> = tensor([0.0567], grad_fn=)
--> inner_i = 1
inner_loss^<1>: 2.0114920139312744
lr^<0> = tensor([0.0567], grad_fn=)
lr^<1> = tensor([0.0720], grad_fn=)
--> inner_i = 2
inner_loss^<2>: 2.3866422176361084
lr^<1> = tensor([0.0720], grad_fn=)
lr^<2> = tensor([0.0717], grad_fn=)
----> outer_i = 0
-> outer_loss/meta_loss^<0>: 4.021303176879883
child_model.fc.weight.grad = None
hidden.grad = None
eta.fc.weight = None
---> Done
related:
pytorch forum: https://discuss.pytorch.org/t/implement-a-meta-trainable-step-size/70396
gitissue: https://github.com/facebookresearch/higher/issues/32
related SO Q: How does one have parameters in a pytorch model not be leafs and be in the computation graph?

NLP model's accuracy stuck on 0.5098 while training

I built a two layered LSTM model(keras model) for a movie review dataset from kaggle : Dataset
While training the model, every epoch was giving the same accuracy of 0.5098.
Then I thought it might not be learning the long distance dependencies.Then instead of LSTM I used bidirectional LSTM. But, still model's accuracy while training was 0.5098 for every epoch. I trained the model for 8 hours/35 epochs on CPU. Then I stopped training.
Code:
import pandas as pd
from sentiment_utils import *
import keras
import keras.backend as k
import numpy as np
train_data = pd.read_table('train.tsv')
X_train = train_data.iloc[:,2]
Y_train = train_data.iloc[:,3]
from sklearn.preprocessing import OneHotEncoder
Y_train = Y_train.reshape(Y_train.shape[0],1)
ohe = OneHotEncoder(categorical_features=[0])
Y_train = ohe.fit_transform(Y_train).toarray()
maxLen = len(max(X_train, key=len).split())
words_to_index, index_to_words, word_to_vec_map = read_glove_vectors("glove/glove.6B.50d.txt")
m = X_train.shape[0]
def read_glove_vectors(path):
with open(path, encoding='utf8') as f:
words = set()
word_to_vec_map = {}
for line in f:
line = line.strip().split()
cur_word = line[0]
words.add(cur_word)
word_to_vec_map[cur_word] = np.array(line[1:], dtype=np.float64)
i = 1
words_to_index = {}
index_to_words = {}
for w in sorted(words):
words_to_index[w] = i
index_to_words[i] = w
i = i + 1
return words_to_index, index_to_words, word_to_vec_map
def sentance_to_indices(X_train, words_to_index, maxLen, dash_index_list, keys):
m = X_train.shape[0]
X_indices = np.zeros((m, maxLen))
for i in range(m):
if i in dash_index_list:
continue
sentance_words = X_train[i].lower().strip().split()
j = 0
for word in sentance_words:
if word in keys:
X_indices[i, j] = words_to_index[word]
j += 1
return X_indices
def pretrained_embedding_layer(word_to_vec_map, words_to_index):
emb_dim = word_to_vec_map['pen'].shape[0]
vocab_size = len(words_to_index) + 1
emb_matrix = np.zeros((vocab_size, emb_dim))
for word, index in words_to_index.items():
emb_matrix[index, :] = word_to_vec_map[word]
emb_layer= keras.layers.embeddings.Embedding(vocab_size, emb_dim, trainable= False)
emb_layer.build((None,))
emb_layer.set_weights([emb_matrix])
return emb_layer
def get_model(input_shape, word_to_vec_map, words_to_index):
sentance_indices = keras.layers.Input(shape = input_shape, dtype='int32')
embedding_layer = pretrained_embedding_layer(word_to_vec_map, words_to_index)
embeddings = embedding_layer(sentance_indices)
X = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True))(embeddings)
X = keras.layers.Dropout(0.5)(X)
X = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True))(X)
X = keras.layers.Dropout(0.5)(X)
X = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=False))(X)
X = keras.layers.Dropout(0.5)(X)
X = keras.layers.Dense(5)(X)
X = keras.layers.Activation('softmax')(X)
model = keras.models.Model(sentance_indices, X)
return model
model = get_model((maxLen,), word_to_vec_map,words_to_index)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
dash_index_list = []
for i in range(m):
if '-' in X_train[i]:
dash_index_list.append(i)
keys = []
for key in word_to_vec_map.keys():
keys.append(key)
X_train_indices = sentance_to_indices(X_train, words_to_index, maxLen, dash_index_list, keys)
model.fit(X_train_indices, Y_train, epochs = 50, batch_size = 32, shuffle=True)
I think the way you defined the model architecture doesn't make sense! Try looking at this example on IMDB movie reviews with LSTM on Keras github repo: Trains an LSTM model on the IMDB sentiment classification task.

Resources