The gradients are all 0 after backward() and the parameters did not change at all - machine-learning

I implemented the policy gradient method to learn an unknown function (a 10-loop sum function here), but the model does not update. The training data consists of the input and the target. func_mulsum wraps the MLP model that predicts the target number. The code is the following:
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
import torch.optim as opt
import torch
from torch.autograd.variable import Variable

class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(1, 8)
        self.fc2 = nn.Linear(8, 8)
        self.fc3 = nn.Linear(8, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        x = F.sigmoid(x)
        return x

def assertEqual(label, pre_number):
    reward = -torch.sum((label - pre_number).pow(2))
    return reward

def func_mulsum(model, s, scalar, n_loop):
    c = s
    for i in range(n_loop):
        c = model(c)
    return c

def train_test_data(n_loop):
    nums = 100
    rate = 0.7
    train_data = np.zeros((int(nums * rate), 2))
    test_data = np.zeros((int(nums * (1 - rate)), 2))
    data = random.sample(range(nums), nums)
    train = data[:int(nums * rate)]
    test = data[int(nums * rate):]
    train_data[:, 0] = train
    test_data[:, 0] = test
    for i, ans in enumerate(train):
        for j in range(n_loop):
            ans += j
        train_data[i, 1] = ans
    for i, ans1 in enumerate(test):
        for j in range(n_loop):
            ans1 += j
        test_data[i, 1] = ans1
    return train_data, test_data

if __name__ == '__main__':
    n_loop = 10
    iterations = 10
    learn_data, test_case = train_test_data(n_loop)
    model = MLP()
    optim = opt.SGD(model.parameters(), lr=0.05)
    for i in range(iterations):
        reward_sum = 0
        learn_data = torch.FloatTensor(learn_data)
        for j, data in enumerate(learn_data[:, 0]):
            data = data.unsqueeze(0)
            label = learn_data[j, 1]
            optim.zero_grad()
            pre = func_mulsum(model, data, 255, n_loop)
            p_norm = torch.normal(pre, std=0.000000001)
            reward = assertEqual(p_norm, label)
            # print(p_norm, label)
            loss = reward * (torch.log(p_norm) * -1)
            loss.backward()
            reward_sum += loss
            optim.step()
        for para in model.parameters():
            pass
        print(para)
        print('reward_mean............................', reward_sum)
I could not find the reason why the gradients are all 0. This question has confused me for 2 days. Can anyone help me?
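For reference, a minimal way to confirm whether the gradients really are all zero (rather than the parameters simply not moving) is to print every parameter's .grad immediately after loss.backward() in the loop above, e.g.:

for name, p in model.named_parameters():
    # p.grad is None if backward never reached this parameter,
    # otherwise this prints the summed absolute gradient
    print(name, None if p.grad is None else p.grad.abs().sum().item())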

Related

How to deal with big dataset when using pyG?

I am a beginner learning to use torch_geometric to build my GNN models. I followed the pyG example of node classification and built my own dataset; however, when I try to run the code on my GPU it tells me that it ran out of memory. Maybe my dataset is too large to fit in GPU memory? I don't know. I share a machine with 8 A100s with my classmates. Could you please give me some suggestions? Thank you!
from torch_geometric.nn import GATConv,GCNConv
from torch_geometric.data import Dataset,DataLoader,HeteroData,Data
import torch.nn as nn
from torch_geometric.nn import DataParallel
from torch_geometric.loader import DataListLoader
import torch.nn.functional as F
import torch
import pandas as pd
from transformers import BertTokenizer,BertModel
import pickle
import time
from tqdm import tqdm
from numba import jit
import json
from torch.optim import lr_scheduler
import matplotlib.pyplot as plt
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
plt.grid(True)
plt.grid(color='gray',
         linestyle='--',
         linewidth=1,
         alpha=0.3)
begin = time.time()
punctuation = "!#$%&'\(\)-*+,-./:;<=>?#\\\[\]^_`{|}~():;,。【】·、“”‘’《》\"%……——·"

def dataCleanifier(s):
    for i in punctuation:
        s.replace(i," ")
    s = s.replace(" "," ")
    s = s.replace("\n","")
    return s
class BertClassifier(nn.Module):
    def __init__(self, bertType: str, max_length, tag_size):
        super(BertClassifier, self).__init__()
        self.bertType = bertType
        self.tokenizer = BertTokenizer.from_pretrained(self.bertType)
        self.encoder = BertModel.from_pretrained(self.bertType)
        self.outputDim = self.encoder.pooler.dense.out_features
        self.max_length = max_length
        self.tag_size = tag_size
        self.dropout = nn.Dropout(0.1)
        self.activation = nn.LeakyReLU(0.1)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, 512, (k, self.outputDim)) for k in (2, 3, 4)])
        self.fc_cnn = nn.Linear(512 * len((2, 3, 4)), self.tag_size)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        x = self.tokenizer.batch_encode_plus(x, return_tensors="pt", max_length=self.max_length, truncation=True, padding="max_length")
        attention = x["attention_mask"]
        x = x["input_ids"]
        x = x.cuda(2)
        x = self.encoder(x, attention_mask=attention.cuda(2))['last_hidden_state'][:]
        x = x.unsqueeze(1)
        encoded = torch.cat([self.conv_and_pool(x, conv) for conv in self.convs], 1)
        x = self.fc_cnn(encoded)
        x = self.activation(x)
        # x = F.softmax(x, dim=1)
        return x, encoded
class ContrastiveLoss(nn.Module):
    def __init__(self):
        super(ContrastiveLoss, self).__init__()

    def forward(self, representations, label, y_hat):
        n = label.shape[0]
        T = 0.5
        similarity_matrix = F.cosine_similarity(representations.unsqueeze(1), representations.unsqueeze(0), dim=2)
        mask = torch.ones_like(similarity_matrix) * (label.expand(n, n).eq(label.expand(n, n).t()))
        mask_no_sim = torch.ones_like(mask) - mask
        mask_dui_jiao_0 = torch.ones(n, n) - torch.eye(n, n)
        similarity_matrix = torch.exp(similarity_matrix / T)
        similarity_matrix = similarity_matrix * mask_dui_jiao_0
        sim = mask * similarity_matrix
        no_sim = similarity_matrix - sim
        no_sim_sum = torch.sum(no_sim, dim=1)
        no_sim_sum_expend = no_sim_sum.repeat(n, 1).T
        sim_sum = sim + no_sim_sum_expend
        loss = torch.div(sim, sim_sum)
        loss = mask_no_sim + loss + torch.eye(n, n)
        # next, compute the loss for the whole batch
        loss = -torch.log(loss)  # take -log
        loss = torch.sum(torch.sum(loss, dim=1)) / (2 * n) + nn.CrossEntropyLoss()(y_hat, label)
        return loss
class GAT(nn.Module):
    def __init__(self, hidden_channels) -> None:
        super().__init__()
        self.conv1 = GATConv(data.num_features, hidden_channels)
        self.conv2 = GATConv(hidden_channels, 9)
        self.activation = nn.ReLU()

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = self.activation(x)
        # print(x)
        # x = F.dropout(x, p=0.2)
        x = self.conv2(x, edge_index)
        return x
x=None
y=None
edge_index = None
train_mask = None
with open("X.pkl","rb") as f1:
x = pickle.load(f1)
with open("Y.pkl","rb") as f2:
y = pickle.load(f2)
y = y.long()
with open("edge_index.pkl","rb") as f3:
edge_index = pickle.load(f3)
# print(edge_index.shape)
with open("train_mask.pkl","rb") as f4:
train_mask = pickle.load(f4)
data = Data(x=x,y=y,edge_index=edge_index)
data.train_mask = train_mask
model = GAT(hidden_channels=32)
model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
scheduler = lr_scheduler.StepLR(optimizer, 100, 0.8)
criterion = ContrastiveLoss()
def train():
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    out = model(data.x, data.edge_index)  # Perform a single forward pass.
    loss = criterion(data.x[data.train_mask], data.y[data.train_mask], out[data.train_mask])  # Compute the loss solely based on the training nodes.
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss

def test():
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)  # Use the class with highest probability.
    test_correct = pred[data.train_mask] == data.y[data.train_mask]  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(data.train_mask.sum())  # Derive ratio of correct predictions.
    return test_acc

accs = []
for epoch in range(1, 1025):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}', end=" ")
    acc = test()
    print("acc:", acc)
    accs.append(acc)
    scheduler.step()
plt.plot(range(len(accs)), accs)
print(time.time() - begin)
with open("./accs_gat_GCL.pkl", "wb") as f1:
    pickle.dump(accs, f1)
plt.savefig("./res_GAT_GCL.png",dpi=600)
I have tried to use DataParallel to spread my model and dataset across multiple GPUs, but failed.
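For context, the DataParallel pattern I was trying to follow looks roughly like this (a sketch based on the PyG multi-GPU example, not my working code; dataset_list is a placeholder for a list of Data objects, and the wrapped model's forward must accept a single Data object):

from torch_geometric.nn import DataParallel
from torch_geometric.loader import DataListLoader

# DataListLoader yields plain Python lists of Data objects (instead of one merged Batch),
# which DataParallel then scatters across the visible GPUs.
loader = DataListLoader(dataset_list, batch_size=32, shuffle=True)  # dataset_list: placeholder
parallel_model = DataParallel(model).cuda()

for data_list in loader:
    optimizer.zero_grad()
    out = parallel_model(data_list)                        # forward runs on all GPUs
    y = torch.cat([d.y for d in data_list]).to(out.device)
    loss = F.cross_entropy(out, y)                         # placeholder loss for the sketch
    loss.backward()
    optimizer.step()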

How to build a model to diagnose chest conditions from Chexpert dataset

I am using the Chexpert dataset (found here on kaggle) to build a CNN model that can predict disease conditions (e.g. cardiomegaly, pleural effusion, atelectasis, etc.) from chest x-ray images. I am using PyTorch Lightning and my code is attached to this question. I have tried several architectures and I don't seem to get the models to perform well. I performed the overfit test (in which I try to overfit a model on one batch of data) and the models were able to overfit the single batch, showing that they are capable of fitting the data. However, regardless of the architecture I use, there is quite a gap between the training loss (which can get as low as 0.2) and the validation loss (which only gets as low as 0.49). On sensitivity and precision (the metrics I am interested in), the models perform terribly during validation. After training the models for more epochs, I also observed that the loss values start to increase. I would appreciate any help or suggestions to solve this problem. Thank you.
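For clarity, the overfit test I mention can be run with Lightning's overfit_batches flag, which keeps reusing the same small set of batches; roughly like this (a sketch, and the flag also appears, commented out, in the Trainer call in my code below):

# sketch: try to overfit a single batch to confirm the model can fit the data at all
trainer = Trainer(accelerator='gpu', devices=1, max_epochs=100, overfit_batches=1)
trainer.fit(model)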
This is my code:
import torch
import torch.nn as nn
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn.functional as F
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.metrics import roc_auc_score
from pytorch_lightning.trainer.states import RunningStage, TrainerFn, TrainerState, TrainerStatus
import numpy as np
import time
import pandas as pd
import gc
import random
from chexpertloader import ChestXrayDataset
# from ipykernel import kernelapp as app
from torch.utils.tensorboard import SummaryWriter
import torchmetrics
from torchmetrics import AUROC
from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import confusion_matrix
seed = 123
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
backbones = {
    'efficientnetb0': models.efficientnet_b0(weights='IMAGENET1K_V1'),
    'efficientnetb1': models.efficientnet_b1(weights='IMAGENET1K_V1'),
    'efficientnetb2': models.efficientnet_b2(weights='IMAGENET1K_V1'),
    'efficientnetb3': models.efficientnet_b3(weights='IMAGENET1K_V1'),
    'efficientnetb4': models.efficientnet_b4(weights='IMAGENET1K_V1'),
    'efficientnetb5': models.efficientnet_b5(weights='IMAGENET1K_V1'),
    'efficientnetb6': models.efficientnet_b6(weights='IMAGENET1K_V1'),
    'efficientnetb7': models.efficientnet_b7(weights='IMAGENET1K_V1'),
    'densenet121': models.densenet121(weights='IMAGENET1K_V1'),
    'densenet161': models.densenet161(weights='IMAGENET1K_V1'),
    'densenet169': models.densenet169(weights='IMAGENET1K_V1'),
    'densenet201': models.densenet201(weights='IMAGENET1K_V1'),
    'resnet50': models.resnet50(weights='IMAGENET1K_V1'),
    'efficientnetV2_m': models.efficientnet_v2_m(weights='IMAGENET1K_V1')
}
class LitEfficientNet(pl.LightningModule):
    def __init__(self, arch, num_classes, lr):
        super(LitEfficientNet, self).__init__()
        self.arch = arch
        self.lr = lr
        self.sizes = {
            'efficientnetb0': (256, 224), 'efficientnetb1': (256, 240), 'efficientnetb2': (288, 288), 'efficientnetb3': (320, 300),
            'efficientnetb4': (384, 380), 'efficientnetb5': (489, 456), 'efficientnetb6': (561, 528), 'efficientnetb7': (633, 600),
            'densenet121': (256, 256), 'densenet161': (256, 256), 'densenet169': (256, 256), 'densenet201': (256, 256),
            'resnet50': (224, 224), 'efficientnetV2_m': (384, 384)
        }
        self.batch_sizes = {
            'efficientnetb0': 64, 'efficientnetb1': 64, 'efficientnetb2': 64, 'efficientnetb3': 32,
            'efficientnetb4': 20, 'efficientnetb5': 7, 'efficientnetb6': 5, 'efficientnetb7': 2,
            'densenet121': 64, 'densenet161': 32, 'densenet169': 32, 'densenet201': 32, 'resnet50': 32,
            'efficientnetV2_m': 16
        }
        self.model = backbones[arch]
        if 'densenet' in arch:
            self.model.classifier = nn.Sequential(
                nn.Linear(self.model.classifier.in_features, 2048),
                nn.ReLU(),
                nn.Dropout(p=0.6),
                nn.Linear(2048, 512),
                nn.ReLU(),
                nn.Dropout(p=0.2),
                nn.Linear(512, num_classes),
            )
        elif 'resnet' in arch:
            self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)
        elif 'efficientnet' in arch:
            self.model.classifier = nn.Sequential(
                nn.Dropout(p=self.model.classifier[0].p, inplace=True),
                nn.Linear(self.model.classifier[1].in_features, num_classes),
            )

    def forward(self, x):
        y_pred = self.model(x)
        return y_pred

    def training_step(self, batch, batch_idx):
        images, labels = batch
        # Forward pass
        m = nn.Sigmoid()
        outputs = self.model(images)
        classes = {0: 'Cardiomegaly', 1: 'Edema', 2: 'Atelectasis',
                   3: 'Pleural Effuion', 4: 'Lung Opacity'
                   }
        Loss = nn.BCEWithLogitsLoss()
        loss = Loss(outputs, labels)
        self.log('train_loss', loss, sync_dist=True)
        return loss

    def train_dataloader(self):
        train_csv = pd.read_csv('CheXpert-v1.0-small/train.csv')
        train_csv.fillna(0, inplace=True)
        train_dataset = ChestXrayDataset("CheXpert-v1.0-small/train", train_csv, self.sizes[self.arch], True)
        # Data loader
        train_loader = torch.utils.data.DataLoader(
            dataset=train_dataset, batch_size=self.batch_sizes[self.arch], num_workers=8, shuffle=False
        )
        return train_loader

    def validation_step(self, batch, batch_idx):
        images, labels = batch
        images = images
        m = nn.Sigmoid()
        outputs = self.model(images)
        classes = {0: 'Cardiomegaly', 1: 'Edema', 2: 'Atelectasis',
                   3: 'Pleural Effuion', 4: 'Lung Opacity'
                   }
        Loss = nn.BCEWithLogitsLoss()
        loss = Loss(outputs, labels)
        self.log('val_loss', loss, sync_dist=True)
        tensorboard_logs = {'val_loss': loss}
        return loss

    def val_dataloader(self):
        validation_csv = pd.read_csv('CheXpert-v1.0-small/valid.csv')
        validation_csv.fillna(0, inplace=True)
        validation_csv = validation_csv.sample(frac=1)
        validation_dataset = ChestXrayDataset("CheXpert-v1.0-small/valid", validation_csv, self.sizes[self.arch], True)
        # Data loader
        validation_loader = torch.utils.data.DataLoader(
            dataset=validation_dataset, batch_size=self.batch_sizes[self.arch], num_workers=8, shuffle=False
        )
        return validation_loader
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer
if __name__ == '__main__':
    archs = ['efficientnetV2_m']
    learning_rates = [0.001]
    num_classes = 5
    for i in range(len(learning_rates)):
        arch = archs[0]
        learning_rate = learning_rates[i]
        logger = TensorBoardLogger(f"tb_logs_binary/{arch}", name=f"{arch}_{learning_rate}_ppv_npv_sensitive")
        model = LitEfficientNet(arch, num_classes, learning_rate)
        trainer = Trainer(
            log_every_n_steps=1411,
            logger=logger,
            accelerator='gpu',
            devices=-1,
            # devices=1,
            # overfit_batches=10,
            max_epochs=50,
            val_check_interval=0.1,
            deterministic=True,
            fast_dev_run=False)
        trainer.fit(model)
        del model, trainer
        gc.collect()
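Since sensitivity and precision are the metrics I actually care about, here is a rough sketch of how they could be logged from validation_step with torchmetrics (this assumes the multilabel task API of recent torchmetrics releases; the 0.5 threshold is an assumption):

from torchmetrics.classification import MultilabelPrecision, MultilabelRecall

# in __init__ (sketch): one metric object per quantity, 5 labels as above
self.val_precision = MultilabelPrecision(num_labels=5, threshold=0.5)
self.val_recall = MultilabelRecall(num_labels=5, threshold=0.5)   # recall == sensitivity

# in validation_step (sketch): the metrics expect probabilities, so apply the sigmoid here
probs = torch.sigmoid(outputs)
self.log('val_precision', self.val_precision(probs, labels.int()), sync_dist=True)
self.log('val_sensitivity', self.val_recall(probs, labels.int()), sync_dist=True)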

How to properly build a PyTorch Geometric GNN model for my custom toy dataset?

I created my own custom toy dataset of graphs in order to learn graph neural networks in PyTorch Geometric (PyG).
The data looks like the following:
Data(x=[20, 1], edge_index=[2, 20], y=[1])
I also created a dataloader as follows:
from torch_geometric.loader import DataLoader
train_dataloader = DataLoader(dataset[0:8000], batch_size=32, shuffle=True)
test_dataloader = DataLoader(dataset[8000:10000], batch_size=32, shuffle=True)
Therefore, a batch will look like:
DataBatch(x=[640, 1], edge_index=[2, 640], y=[32], batch=[640], ptr=[33])
My attempt to make a Graph-CNN:
import torch
from torch import nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(dataset[0].num_node_features, 16)
        self.conv2 = GCNConv(16, 16)
        self.out = nn.Linear(16, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        out = self.out(x)
        return out

model = GCN()
When I do something like:
criterion = torch.nn.CrossEntropyLoss()
target = batch.y.to(torch.float32)
loss = criterion(out, target)
loss
I get the error:
ValueError: Expected input batch_size (640) to match target batch_size (32).
Full code is in my github repo here:
https://github.com/amine179/myGNN-learning/blob/main/My%20first%20GCNN.ipynb
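For reference on the shapes: the model above returns one row per node (640 = 32 graphs x 20 nodes), while y holds one label per graph (32). A graph-level readout such as global_mean_pool collapses the node embeddings of each graph into a single vector before the final layer. A rough sketch of the forward with such a readout (an illustration only, not necessarily the intended design):

from torch_geometric.nn import global_mean_pool

def forward(self, data):
    x, edge_index, batch = data.x, data.edge_index, data.batch
    x = self.conv1(x, edge_index)
    x = F.relu(x)
    x = F.dropout(x, training=self.training)
    x = self.conv2(x, edge_index)
    x = global_mean_pool(x, batch)   # [640, 16] -> [32, 16]: one embedding per graph
    out = self.out(x)                # [32, 1]: one prediction per graph
    return out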

How to use variable learning rate that decreases with loss using pytorch-geometric?

I have the following code snippet from a PyTorch Geometric example. I want to use a learning rate that decreases as the training loss decreases. I tried using a scheduler but that didn't work for me.
A clean code snippet is below. Can anyone provide suggestions or help on this matter?
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.nn import AGNNConv
dataset = 'Cora'
path = 'Cora'
dataset = Planetoid(path, dataset, transform=T.NormalizeFeatures())
data = dataset[0]
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.lin1 = torch.nn.Linear(dataset.num_features, 16)
        self.prop1 = AGNNConv(requires_grad=False)
        self.prop2 = AGNNConv(requires_grad=True)
        self.lin2 = torch.nn.Linear(16, dataset.num_classes)

    def forward(self):
        x = F.dropout(data.x, training=self.training)
        x = F.relu(self.lin1(x))
        x = self.prop1(x, data.edge_index)
        x = self.prop2(x, data.edge_index)
        x = F.dropout(x, training=self.training)
        x = self.lin2(x)
        return F.log_softmax(x, dim=1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model, data = Net().to(device), data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
def train():
    model.train()
    optimizer.zero_grad()
    F.nll_loss(model()[data.train_mask], data.y[data.train_mask]).backward()
    optimizer.step()

def test():
    model.eval()
    logits, accs = model(), []
    for _, mask in data('train_mask', 'val_mask', 'test_mask'):
        pred = logits[mask].max(1)[1]
        acc = pred.eq(data.y[mask]).sum().item() / mask.sum().item()
        accs.append(acc)
    return accs

best_val_acc = test_acc = 0
for epoch in range(1, 201):
    train()
    train_acc, val_acc, tmp_test_acc = test()
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        test_acc = tmp_test_acc
    log = 'Epoch: {:03d}, Train: {:.4f}, Val: {:.4f}, Test: {:.4f}'
    print(log.format(epoch, train_acc, best_val_acc, test_acc))
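To make "decreases with loss" concrete, the behaviour I am after is roughly what torch.optim.lr_scheduler.ReduceLROnPlateau provides: it lowers the learning rate when the monitored loss stops improving. A sketch of how it could be wired into the loop above (the factor and patience values are placeholders, and train() would have to return the loss for this to work):

from torch.optim.lr_scheduler import ReduceLROnPlateau

scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10)

for epoch in range(1, 201):
    loss = train()          # assumes train() is changed to return the epoch loss
    scheduler.step(loss)    # the LR is reduced once the loss plateaus
    train_acc, val_acc, tmp_test_acc = test()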

Custom weight initialisation causing error - pytorch

%reset -f
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import numpy as np
import matplotlib.pyplot as plt
import torch.utils.data as data_utils
import torch.nn as nn
import torch.nn.functional as F

num_epochs = 20
x1 = np.array([0,0])
x2 = np.array([0,1])
x3 = np.array([1,0])
x4 = np.array([1,1])
num_epochs = 200
x = torch.tensor([x1,x2,x3,x4]).float()
y = torch.tensor([0,1,1,0]).long()
train = data_utils.TensorDataset(x,y)
train_loader = data_utils.DataLoader(train, batch_size=2, shuffle=True)
device = 'cpu'
input_size = 2
hidden_size = 100
num_classes = 2
learning_rate = .0001
torch.manual_seed(24)

def weights_init(m):
    m.weight.data.normal_(0.0, 1)

class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

model = NeuralNet(input_size, hidden_size, num_classes).to(device)
model.apply(weights_init)
criterionCE = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for i in range(0, 1):
    total_step = len(train_loader)
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            loss = criterionCE(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    outputs = model(x)
    print(outputs.data.max(1)[1])
This is what I'm using to initialize the weights:
def weights_init(m):
    m.weight.data.normal_(0.0, 1)
But the following error is thrown:
~/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py in __getattr__(self, name)
533 return modules[name]
534 raise AttributeError("'{}' object has no attribute '{}'".format(
--> 535 type(self).__name__, name))
536
537 def __setattr__(self, name, value):
AttributeError: 'ReLU' object has no attribute 'weight'
Is this the correct method to initialize the weights?
Also, shouldn't the object be of type nn.Module, not ReLU?
In addition to what Fabio mentioned about checking the layer type (ReLU is an activation, not a trainable layer), since this is about initialization you can also do the weight initialization in the __init__ method itself, as is done here:
https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py
def __init__(self, features, num_classes=1000, ...):
    # ----snip---
    self._initialize_weights()

def _initialize_weights(self):
    for m in self.modules():
        if isinstance(m, nn.Linear):
            m.weight.data.normal_(0.0, 1)
You are trying to set the weights of a weight-free layer (ReLU).
Inside weights_init, you should check the type of layers before initializing weights. For instance:
def weights_init(m):
    if type(m) == nn.Linear:
        m.weight.data.normal_(0.0, 1)
See How to initialize weights in PyTorch?.
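As a quick sanity check of that version (a sketch using the NeuralNet from the question): model.apply visits fc1, relu and fc2, but only the Linear layers match the type check, so the ReLU is skipped and no error is raised:

model = NeuralNet(input_size, hidden_size, num_classes)
model.apply(weights_init)        # ReLU modules are ignored by the type check
print(model.fc1.weight.std())    # should be roughly 1 after normal_(0.0, 1)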
