SGD does not converge if #samples < #features - machine-learning

I'm trying to implement a stochastic gradient descent and it works, as long as the number of sampes are greater than the number of features, otherwise, the loss diverges as seen in the figures, in which i compare the loss to the scikit.learn SGDRegressor.
This is my code:
import math
from matplotlib import pyplot as plt
import numpy as np
from sklearn import linear_model
def loss_prime_simple(w,node,feature,data):
x = data[3]
y = data[2]
x_f = x[node][feature]
y_node = y[node]
return -(y_node - w[feature] * x_f) * x_f
def update_weights(w,data,predecs,children,node, learning_rate):
len_features = len(data[3][0])
w_new = np.zeros(len_features)
for feature_ in range(len_features):
w_new[feature_] = loss_prime_simple(w,node,feature_,data)
return w - learning_rate * w_new
def loss_simple2(w,data):
y_p = data[2]
x = data[3]
return ((y_p - np.dot(w,np.array(x).T)) ** 2).sum()
def learn(data,l,features, iterations):
a = []
X = np.array(data[3])
w = np.random.rand(features)
for epoch in range(iterations):
print epoch
for j in range(X.shape[0]):
w = update_weights(w,data,None,None,j, l)
# update learning rate
#learning_rate = learning_rate_0 * (1 + learning_rate_0 * gamma * ( epoch + len(X_standardized) * epoch)) ** (-1)
a.append( loss_simple2(w, data))
return a
fig, ax = plt.subplots(2,2)
X = np.random.randn(1000, 496)
y = np.random.randn(1000)
data = None, None, y, X
l = math.pow(10,-4)
a = learn(data,l,496,200)
ax[0,0].plot(a)
ax[0,0].set_title('my SGD features < samples, learning rate =%s'%l)
b = []
clf = linear_model.SGDRegressor(learning_rate='constant', eta0=l, shuffle=False)
for epoch in range(200):
for x,y_i in zip(X,y):
clf.partial_fit(x,[y_i])
b.append(((y - clf.predict(X))**2).sum())
ax[0,1].plot(b)
ax[0,1].set_title('Sklearn features < samples, learning rate =%s'%l)
X = np.random.randn(10, 496)
y = np.random.randn(10)
data = None, None, y, X
a = learn(data,l,496,1000)
ax[1,0].plot(a)
ax[1,0].set_title('my SGD features > samples, learning rate =%s'%l)
b = []
clf = linear_model.SGDRegressor(learning_rate='constant', eta0=l, shuffle=False)
for epoch in range(1000):
for x,y_i in zip(X,y):
clf.partial_fit(x,[y_i])
b.append(((y - clf.predict(X))**2).sum())
ax[1,1].plot(b)
ax[1,1].set_title('Sklearn features > samples, learning rate =%s'%l)
What am I doing wrong?

Related

How to deal with big dataset when using pyG?

I am a beginer learning to using torch_geometric to build my GNN models. I refered the sample of the pyG example of node classification and build my own dataset, however, I tried to use my GPU to run the code and it tells me that it run out of memory, maybe my dataset is too large to allocate the GPU memory? I don't know. I shared an machine of 8 A100 with my classmates. Could you please give me some suggestions, thank you!
from torch_geometric.nn import GATConv,GCNConv
from torch_geometric.data import Dataset,DataLoader,HeteroData,Data
import torch.nn as nn
from torch_geometric.nn import DataParallel
from torch_geometric.loader import DataListLoader
import torch.nn.functional as F
import torch
import pandas as pd
from transformers import BertTokenizer,BertModel
import pickle
import time
from tqdm import tqdm
from numba import jit
import json
from torch.optim import lr_scheduler
import matplotlib.pyplot as plt
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
plt.grid(True)
plt.grid(color='gray',
linestyle='--',
linewidth=1,
alpha=0.3)
begin = time.time()
punctuation = "!#$%&'\(\)-*+,-./:;<=>?#\\\[\]^_`{|}~():;,。【】·、“”‘’《》\"%……——·"
def dataCleanifier(s):
for i in punctuation:
s.replace(i," ")
s = s.replace(" "," ")
s = s.replace("\n","")
return s
class BertClassifier(nn.Module):
def __init__(self,bertType:str,max_length,tag_size):
super(BertClassifier,self).__init__()
self.bertType = bertType
self.tokenizer = BertTokenizer.from_pretrained(self.bertType)
self.encoder = BertModel.from_pretrained(self.bertType)
self.outputDim = self.encoder.pooler.dense.out_features
self.max_length = max_length
self.tag_size = tag_size
self.dropout = nn.Dropout(0.1)
self.activation = nn.LeakyReLU(0.1)
self.convs = nn.ModuleList(
[nn.Conv2d(1, 512, (k, self.outputDim)) for k in (2,3,4)])
self.fc_cnn = nn.Linear(512 * len((2,3,4)), self.tag_size)
def conv_and_pool(self, x, conv):
x = F.relu(conv(x)).squeeze(3)
x = F.max_pool1d(x, x.size(2)).squeeze(2)
return x
def forward(self,x):
x = self.tokenizer.batch_encode_plus(x,return_tensors="pt",max_length=self.max_length,truncation=True,padding="max_length")
attention = x["attention_mask"]
x = x["input_ids"]
x = x.cuda(2)
x = self.encoder(x,attention_mask=attention.cuda(2))['last_hidden_state'][:]
x = x.unsqueeze(1)
encoded = torch.cat([self.conv_and_pool(x,conv) for conv in self.convs],1)
x = self.fc_cnn(encoded)
x = self.activation(x)
# x = F.softmax(x,dim=1)
return x,encoded
class ContrastiveLoss(nn.Module):
def __init__(self):
super(ContrastiveLoss, self).__init__()
def forward(self,representations,label,y_hat):
n = label.shape[0]
T = 0.5
similarity_matrix = F.cosine_similarity(representations.unsqueeze(1), representations.unsqueeze(0), dim=2)
mask = torch.ones_like(similarity_matrix) * (label.expand(n, n).eq(label.expand(n, n).t()))
mask_no_sim = torch.ones_like(mask) - mask
mask_dui_jiao_0 = torch.ones(n ,n) - torch.eye(n, n )
similarity_matrix = torch.exp(similarity_matrix/T)
similarity_matrix = similarity_matrix*mask_dui_jiao_0
sim = mask*similarity_matrix
no_sim = similarity_matrix - sim
no_sim_sum = torch.sum(no_sim , dim=1)
no_sim_sum_expend = no_sim_sum.repeat(n, 1).T
sim_sum = sim + no_sim_sum_expend
loss = torch.div(sim , sim_sum)
loss = mask_no_sim + loss + torch.eye(n, n )
#接下来就是算一个批次中的loss了
loss = -torch.log(loss) #求-log
loss = torch.sum(torch.sum(loss, dim=1) )/(2*n)+nn.CrossEntropyLoss()(y_hat,label)
return loss
class GAT(nn.Module):
def __init__(self, hidden_channels) -> None:
super().__init__()
self.conv1 = GATConv(data.num_features,hidden_channels)
self.conv2 = GATConv(hidden_channels,9)
self.activation = nn.ReLU()
def forward(self,x,edge_index):
x = self.conv1(x,edge_index)
x = self.activation(x)
# print(x)
# x = F.dropout(x,p=0.2)
x = self.conv2(x,edge_index)
return x
x=None
y=None
edge_index = None
train_mask = None
with open("X.pkl","rb") as f1:
x = pickle.load(f1)
with open("Y.pkl","rb") as f2:
y = pickle.load(f2)
y = y.long()
with open("edge_index.pkl","rb") as f3:
edge_index = pickle.load(f3)
# print(edge_index.shape)
with open("train_mask.pkl","rb") as f4:
train_mask = pickle.load(f4)
data = Data(x=x,y=y,edge_index=edge_index)
data.train_mask = train_mask
model = GAT(hidden_channels=32)
model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
scheduler = lr_scheduler.StepLR(optimizer, 100, 0.8)
criterion = ContrastiveLoss()
def train():
model.train()
optimizer.zero_grad() # Clear gradients.
out = model(data.x,data.edge_index) # Perform a single forward pass.
loss = criterion(data.x[data.train_mask], data.y[data.train_mask],out[data.train_mask]) # Compute the loss solely based on the training nodes.
loss.backward() # Derive gradients.
optimizer.step() # Update parameters based on gradients.
return loss
def test():
model.eval()
out = model(data.x, data.edge_index)
pred = out.argmax(dim=1) # Use the class with highest probability.
test_correct = pred[data.train_mask] == data.y[data.train_mask] # Check against ground-truth labels.
test_acc = int(test_correct.sum()) / int(data.train_mask.sum()) # Derive ratio of correct predictions.
return test_acc
accs = []
for epoch in range(1, 1025):
loss = train()
print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}',end=" ")
acc = test()
print("acc:",acc)
accs.append(acc)
scheduler.step()
plt.plot(range(len(accs)),accs)
print(time.time()-begin)
with open("./accs_gat_GCL.pkl","wb") as f1:
pickle.dump(accs,f1)
plt.savefig("./res_GAT_GCL.png",dpi=600)
I have tried to use DataPararel to use multiple GPU to load my model and dataset but failed.

How to use variable learning rate that decreases with loss using pytorch-geometric?

I have the following code snippet from PyTorch geometric example. I want to use a learning rate that decreases as the loss value during training decreases. I tried using scheduler but that didn't work for me.
A clean code-snippet is below. Can anyone provide valuable suggestions or help on this matter?
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.nn import AGNNConv
dataset = 'Cora'
path = 'Cora'
dataset = Planetoid(path, dataset, transform=T.NormalizeFeatures())
data = dataset[0]
class Net(torch.nn.Module):
def __init__(self):
super(Net, self).__init__()
self.lin1 = torch.nn.Linear(dataset.num_features, 16)
self.prop1 = AGNNConv(requires_grad=False)
self.prop2 = AGNNConv(requires_grad=True)
self.lin2 = torch.nn.Linear(16, dataset.num_classes)
def forward(self):
x = F.dropout(data.x, training=self.training)
x = F.relu(self.lin1(x))
x = self.prop1(x, data.edge_index)
x = self.prop2(x, data.edge_index)
x = F.dropout(x, training=self.training)
x = self.lin2(x)
return F.log_softmax(x, dim=1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model, data = Net().to(device), data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
def train():
model.train()
optimizer.zero_grad()
F.nll_loss(model()[data.train_mask], data.y[data.train_mask]).backward()
optimizer.step()
def test():
model.eval()
logits, accs = model(), []
for _, mask in data('train_mask', 'val_mask', 'test_mask'):
pred = logits[mask].max(1)[1]
acc = pred.eq(data.y[mask]).sum().item() / mask.sum().item()
accs.append(acc)
return accs
best_val_acc = test_acc = 0
for epoch in range(1, 201):
train()
train_acc, val_acc, tmp_test_acc = test()
if val_acc > best_val_acc:
best_val_acc = val_acc
test_acc = tmp_test_acc
log = 'Epoch: {:03d}, Train: {:.4f}, Val: {:.4f}, Test: {:.4f}'
print(log.format(epoch, train_acc, best_val_acc, test_acc))

Pytorch linear regression loss increase

I tried to implement a simple demo that gets a polynomial regression, but the linear model's loss fails to decrease.
I am confused about where I went wrong.
If I trained the model one sample(batch size = 1) each time, it works fine. but when I feed the model with many samples a time, the loss increase and get inf.
import numpy as np
import torch
import math
from matplotlib import pyplot as plt
def rand_series(size):
x = np.linspace(-100, 100, size)
np.random.shuffle(x)
base_y = 20 * np.sin(2 * math.pi / 200 * x)
y = base_y + 10 * np.random.rand(size)
return x, y
def rescale_vec(vector):
vec_as_tensor = torch.tensor(vector, dtype=torch.float32)
max_in_vec = torch.max(vec_as_tensor)
min_in_vec = torch.min(vec_as_tensor)
if max_in_vec - min_in_vec == 0:
return torch.ones(vec_as_tensor.size(), dtype=torch.float32)
else:
return (vec_as_tensor - min_in_vec) / (max_in_vec - min_in_vec)
def rescale(vectors):
if len(vectors.shape) == 1:
return rescale_vec(vectors)
nor_vecs = torch.empty(vectors.shape)
for i in range(vectors.shape[0]):
nor_vecs[i] = rescale_vec(vectors[i])
return nor_vecs
class LinearRegression (torch.nn.Module):
def __init__ (self, power=4):
super().__init__()
self.layer = torch.nn.Linear(power, 1)
def forward(self, x):
return self.layer(x)
def regression(x_, y_, learning_rate):
x = torch.t(torch.tensor(x_, dtype=torch.float32))
y = torch.tensor(y_, dtype=torch.float32)
dim_size = x.size()[1]
print(dim_size, x.size())
model = LinearRegression(dim_size)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
loss_func = torch.nn.MSELoss(reduction='sum')
batch_size = 400
for round in range(50):
sample_indices = torch.randint(0, len(x), (batch_size, ))
x_samples = torch.index_select(x, 0, sample_indices)
y_samples = torch.index_select(y, 0, sample_indices)
optimizer.zero_grad()
y_hat = model(x_samples.view(-1, dim_size))
loss = loss_func(y_hat, y_samples)
print(loss.item())
loss.backward()
optimizer.step()
return model
x_one, y = rand_series(1000)
b = np.ones(len(x_one))
x = np.array([b, x_one, x_one ** 2, x_one ** 3, x_one ** 4, x_one ** 5])
model = regression(rescale(x), torch.tensor(y, dtype=torch.float32), 0.002)
nor_x = rescale(x)
y_hat = model(torch.t(torch.tensor(x, dtype=torch.float32)))
plt.scatter(x_one, y)
plt.scatter(x_one, y_hat.data, c='red')
plt.show()
the loss:
4.7375866968775066e+19
1.6979300048622735e+26
6.0214270068868396e+32
inf
inf
inf
You need to use loss_func = torch.nn.MSELoss(reduction='mean') to solve the NaN problem. A batch of one or two seems to work because the loss was small enough. By adding more epochs, you should see that your loss tend exponentially to infinity.

Multi GPU training in Pytorch causes my system to freeze

When I wrap my model in nn.DataParallel(model) and start training my screen freezes and I have to manually restart the computer every time.
I've tried a few variations, like not adding .to(device) to every x and y, but whenever nn.DataParallel is used I seem to cause the computer to freeze.
import random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import models, datasets, transforms
import torch.utils.data
DataLoader = torch.utils.data.DataLoader
random_split = torch.utils.data.random_split
global_rank = 0
MNIST = datasets.MNIST
class MLPClassifier(nn.Module):
def __init__(self):
super(MLPClassifier, self).__init__()
self.layer_1 = torch.nn.Linear(28 * 28, 128)
self.layer_2 = torch.nn.Linear(128, 444)
self.layer_3 = torch.nn.Linear(444, 333)
self.layer_4 = torch.nn.Linear(333, 10)
def forward(self, x):
x = x.view(x.size(0), -1)
x = self.layer_1(x)
x = F.relu(x)
x = self.layer_2(x)
x = F.relu(x)
x = self.layer_3(x)
x = F.relu(x)
x = self.layer_4(x)
return x
# Download data
if global_rank == 0:
mnist_train = MNIST(os.getcwd(), train=True, download=True)
mnist_test = MNIST(os.getcwd(), train=False, download=True)
# dist.barrier()
#transforms
transform=transforms.Compose([transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))])
mnist_train = MNIST(os.getcwd(), train=True, transform=transform)
# Split dataset
mnist_train, mnist_val = random_split(mnist_train, [55000, 5000])
mnist_test = MNIST(os.getcwd(), train=False, download=True)
# Build dataloaders
mnist_train = DataLoader(mnist_train, batch_size=256)
mnist_val = DataLoader(mnist_val, batch_size=256)
mnist_test = DataLoader(mnist_test, batch_size=256)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = MLPClassifier()
model = nn.DataParallel(model)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Train loop
model.train()
num_epochs = 1
for epoch in range(num_epochs):
for train_batch in mnist_train:
x, y = train_batch
logits = model(x.to(device))
loss = F.cross_entropy(logits, y.to(device))
print('rain loss: ', loss.item())
loss.backward()
optimizer.step()
optimizer.zero_grad()
# EVAL LOOP
model.eval()
with torch.no_grad():
val_loss_a = []
for val_batch in mnist_val:
x, y = val_batch
logits = model(x.to(device))
val_loss = F.cross_entropy(logits, y.to(device))
val_loss_a.append(val_loss)
avg_val_loss = torch.stack(val_loss_a).mean()
model.train()

NLP model's accuracy stuck on 0.5098 while training

I built a two layered LSTM model(keras model) for a movie review dataset from kaggle : Dataset
While training the model, every epoch was giving the same accuracy of 0.5098.
Then I thought it might not be learning the long distance dependencies.Then instead of LSTM I used bidirectional LSTM. But, still model's accuracy while training was 0.5098 for every epoch. I trained the model for 8 hours/35 epochs on CPU. Then I stopped training.
Code:
import pandas as pd
from sentiment_utils import *
import keras
import keras.backend as k
import numpy as np
train_data = pd.read_table('train.tsv')
X_train = train_data.iloc[:,2]
Y_train = train_data.iloc[:,3]
from sklearn.preprocessing import OneHotEncoder
Y_train = Y_train.reshape(Y_train.shape[0],1)
ohe = OneHotEncoder(categorical_features=[0])
Y_train = ohe.fit_transform(Y_train).toarray()
maxLen = len(max(X_train, key=len).split())
words_to_index, index_to_words, word_to_vec_map = read_glove_vectors("glove/glove.6B.50d.txt")
m = X_train.shape[0]
def read_glove_vectors(path):
with open(path, encoding='utf8') as f:
words = set()
word_to_vec_map = {}
for line in f:
line = line.strip().split()
cur_word = line[0]
words.add(cur_word)
word_to_vec_map[cur_word] = np.array(line[1:], dtype=np.float64)
i = 1
words_to_index = {}
index_to_words = {}
for w in sorted(words):
words_to_index[w] = i
index_to_words[i] = w
i = i + 1
return words_to_index, index_to_words, word_to_vec_map
def sentance_to_indices(X_train, words_to_index, maxLen, dash_index_list, keys):
m = X_train.shape[0]
X_indices = np.zeros((m, maxLen))
for i in range(m):
if i in dash_index_list:
continue
sentance_words = X_train[i].lower().strip().split()
j = 0
for word in sentance_words:
if word in keys:
X_indices[i, j] = words_to_index[word]
j += 1
return X_indices
def pretrained_embedding_layer(word_to_vec_map, words_to_index):
emb_dim = word_to_vec_map['pen'].shape[0]
vocab_size = len(words_to_index) + 1
emb_matrix = np.zeros((vocab_size, emb_dim))
for word, index in words_to_index.items():
emb_matrix[index, :] = word_to_vec_map[word]
emb_layer= keras.layers.embeddings.Embedding(vocab_size, emb_dim, trainable= False)
emb_layer.build((None,))
emb_layer.set_weights([emb_matrix])
return emb_layer
def get_model(input_shape, word_to_vec_map, words_to_index):
sentance_indices = keras.layers.Input(shape = input_shape, dtype='int32')
embedding_layer = pretrained_embedding_layer(word_to_vec_map, words_to_index)
embeddings = embedding_layer(sentance_indices)
X = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True))(embeddings)
X = keras.layers.Dropout(0.5)(X)
X = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True))(X)
X = keras.layers.Dropout(0.5)(X)
X = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=False))(X)
X = keras.layers.Dropout(0.5)(X)
X = keras.layers.Dense(5)(X)
X = keras.layers.Activation('softmax')(X)
model = keras.models.Model(sentance_indices, X)
return model
model = get_model((maxLen,), word_to_vec_map,words_to_index)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
dash_index_list = []
for i in range(m):
if '-' in X_train[i]:
dash_index_list.append(i)
keys = []
for key in word_to_vec_map.keys():
keys.append(key)
X_train_indices = sentance_to_indices(X_train, words_to_index, maxLen, dash_index_list, keys)
model.fit(X_train_indices, Y_train, epochs = 50, batch_size = 32, shuffle=True)
I think the way you defined the model architecture doesn't make sense! Try looking at this example on IMDB movie reviews with LSTM on Keras github repo: Trains an LSTM model on the IMDB sentiment classification task.

Resources