I am running quite a large parameter search using TuneGridSearchCV on an XGBoost model on my university's HPC cluster. The results are being saved to ~/ray_results, but per the HPC policy I don't have enough space to keep all of those files in my home directory. How can I move ray_results to a different folder that has more space? I've looked into the documentation but I am confused about how to do it.
My code is as follows:
import numpy as np
import pandas as pd
from pandas import MultiIndex, Int16Dtype
from sklearnex import patch_sklearn
patch_sklearn()
import xgboost as xgb
from tune_sklearn import TuneGridSearchCV
from datetime import datetime
import sys
if __name__ == "__main__":
    df_train = pd.read_excel('my_dataset.xlsx')
    train_cols = df_train.columns[df_train.columns != 'Response']
    X_train = pd.DataFrame(df_train, columns=train_cols)
    y_train = pd.DataFrame(df_train, columns=['Response'])
    params = {
        "n_estimators": list(range(100, 1400, 100)),
        "max_depth": list(range(2, 20, 2)),
        "min_child_weight": list(range(2, 20, 2)),
        "gamma": np.arange(0, 1.05, 0.1),
        "colsample_bytree": np.arange(0.5, 1.05, 0.1),
        "colsample_bylevel": np.arange(0.5, 1.05, 0.1),
        'reg_lambda': [0.1, 1.0, 5.0, 10.0, 25.0, 50.0]
    }
    xgb_model = xgb.XGBClassifier(seed=0, use_label_encoder=False, tree_method='hist')
    print(params)
    grid_cv = TuneGridSearchCV(xgb_model, param_grid=params, cv=5, n_jobs=-1, scoring='roc_auc')
    current_time = datetime.now().strftime("%H:%M:%S")
    print("Start Time =", current_time)
    print('\n')
    grid_cv.fit(X_train, y_train.values.ravel())
    current_time = datetime.now().strftime("%H:%M:%S")
    print('End Time: ', current_time)
    print('\n\n')
    print('Grid best score (roc_auc): ')
    print(grid_cv.best_score_)
    print('\n\n')
    print('Grid best hyperparameters: ')
    print(grid_cv.best_params_)
    print('\n\n')
Alternatively, instead of creating a folder for every single parameter combination (which is what it is currently doing), is there a way to change the output format to something more space-efficient?
You should be able to set this with TuneGridSearchCV(local_dir="YOUR_PATH").
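For example (the scratch path below is a placeholder; substitute whatever large-quota filesystem your cluster provides):

grid_cv = TuneGridSearchCV(
    xgb_model,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    scoring='roc_auc',
    local_dir='/scratch/your_username/ray_results',  # placeholder path with more quota
)

Tune will then write one trial directory per parameter combination under that path instead of under ~/ray_results.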
I am using the CheXpert dataset (found here on Kaggle) to build a CNN model that can predict disease conditions (e.g. cardiomegaly, pleural effusion, atelectasis, etc.) from chest x-ray images. I am using PyTorch Lightning and my code is attached to this question. I have tried several architectures and I can't seem to get the models to perform well. I performed the overfit test (in which I try to overfit a model on one batch of data) and the models were able to overfit the single batch, showing that they are capable of fitting the data. However, regardless of the architecture I use, there is quite a difference between the training loss (which can get as low as 0.2) and the validation loss (which can get as low as 0.49). On sensitivity and precision (the metrics I am interested in), the models perform terribly during validation. After training for more epochs, I also observed that the loss values start to increase. I will appreciate any help or suggestions to solve this problem. Thank you.
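For reference, the overfit test I mention was done with Lightning's overfit_batches Trainer flag (it appears, commented out, in the Trainer call in my code). A minimal sketch, with illustrative values for the other arguments:

trainer = Trainer(overfit_batches=1, max_epochs=50, accelerator='gpu', devices=1)  # train repeatedly on a single batch
trainer.fit(model)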
This is my code:
import torch
import torch.nn as nn
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn.functional as F
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.metrics import roc_auc_score
from pytorch_lightning.trainer.states import RunningStage, TrainerFn, TrainerState, TrainerStatus
import numpy as np
import time
import pandas as pd
import gc
import random
from chexpertloader import ChestXrayDataset
# from ipykernel import kernelapp as app
from torch.utils.tensorboard import SummaryWriter
import torchmetrics
from torchmetrics import AUROC
from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import confusion_matrix
seed = 123
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
backbones = {
    'efficientnetb0': models.efficientnet_b0(weights='IMAGENET1K_V1'),
    'efficientnetb1': models.efficientnet_b1(weights='IMAGENET1K_V1'),
    'efficientnetb2': models.efficientnet_b2(weights='IMAGENET1K_V1'),
    'efficientnetb3': models.efficientnet_b3(weights='IMAGENET1K_V1'),
    'efficientnetb4': models.efficientnet_b4(weights='IMAGENET1K_V1'),
    'efficientnetb5': models.efficientnet_b5(weights='IMAGENET1K_V1'),
    'efficientnetb6': models.efficientnet_b6(weights='IMAGENET1K_V1'),
    'efficientnetb7': models.efficientnet_b7(weights='IMAGENET1K_V1'),
    'densenet121': models.densenet121(weights='IMAGENET1K_V1'),
    'densenet161': models.densenet161(weights='IMAGENET1K_V1'),
    'densenet169': models.densenet169(weights='IMAGENET1K_V1'),
    'densenet201': models.densenet201(weights='IMAGENET1K_V1'),
    'resnet50': models.resnet50(weights='IMAGENET1K_V1'),
    'efficientnetV2_m': models.efficientnet_v2_m(weights='IMAGENET1K_V1')
}
class LitEfficientNet(pl.LightningModule):
    def __init__(self, arch, num_classes, lr):
        super(LitEfficientNet, self).__init__()
        self.arch = arch
        self.lr = lr
        self.sizes = {
            'efficientnetb0': (256, 224), 'efficientnetb1': (256, 240), 'efficientnetb2': (288, 288), 'efficientnetb3': (320, 300),
            'efficientnetb4': (384, 380), 'efficientnetb5': (489, 456), 'efficientnetb6': (561, 528), 'efficientnetb7': (633, 600),
            'densenet121': (256, 256), 'densenet161': (256, 256), 'densenet169': (256, 256), 'densenet201': (256, 256),
            'resnet50': (224, 224), 'efficientnetV2_m': (384, 384)
        }
        self.batch_sizes = {
            'efficientnetb0': 64, 'efficientnetb1': 64, 'efficientnetb2': 64, 'efficientnetb3': 32,
            'efficientnetb4': 20, 'efficientnetb5': 7, 'efficientnetb6': 5, 'efficientnetb7': 2,
            'densenet121': 64, 'densenet161': 32, 'densenet169': 32, 'densenet201': 32, 'resnet50': 32,
            'efficientnetV2_m': 16
        }
        self.model = backbones[arch]
        # Swap the ImageNet head for a task-specific classifier.
        if 'densenet' in arch:
            self.model.classifier = nn.Sequential(
                nn.Linear(self.model.classifier.in_features, 2048),
                nn.ReLU(),
                nn.Dropout(p=0.6),
                nn.Linear(2048, 512),
                nn.ReLU(),
                nn.Dropout(p=0.2),
                nn.Linear(512, num_classes),
            )
        elif 'resnet' in arch:
            self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)
        elif 'efficientnet' in arch:
            self.model.classifier = nn.Sequential(
                nn.Dropout(p=self.model.classifier[0].p, inplace=True),
                nn.Linear(self.model.classifier[1].in_features, num_classes),
            )

    def forward(self, x):
        y_pred = self.model(x)
        return y_pred
    def training_step(self, batch, batch_idx):
        images, labels = batch
        # Forward pass
        m = nn.Sigmoid()
        outputs = self.model(images)
        classes = {0: 'Cardiomegaly', 1: 'Edema', 2: 'Atelectasis',
                   3: 'Pleural Effusion', 4: 'Lung Opacity'}
        Loss = nn.BCEWithLogitsLoss()
        loss = Loss(outputs, labels)
        self.log('train_loss', loss, sync_dist=True)
        return loss

    def train_dataloader(self):
        train_csv = pd.read_csv('CheXpert-v1.0-small/train.csv')
        train_csv.fillna(0, inplace=True)
        train_dataset = ChestXrayDataset("CheXpert-v1.0-small/train", train_csv, self.sizes[self.arch], True)
        # Data loader
        train_loader = torch.utils.data.DataLoader(
            dataset=train_dataset, batch_size=self.batch_sizes[self.arch], num_workers=8, shuffle=False
        )
        return train_loader
    def validation_step(self, batch, batch_idx):
        images, labels = batch
        m = nn.Sigmoid()
        outputs = self.model(images)
        classes = {0: 'Cardiomegaly', 1: 'Edema', 2: 'Atelectasis',
                   3: 'Pleural Effusion', 4: 'Lung Opacity'}
        Loss = nn.BCEWithLogitsLoss()
        loss = Loss(outputs, labels)
        self.log('val_loss', loss, sync_dist=True)
        tensorboard_logs = {'val_loss': loss}
        return loss

    def val_dataloader(self):
        validation_csv = pd.read_csv('CheXpert-v1.0-small/valid.csv')
        validation_csv.fillna(0, inplace=True)
        validation_csv = validation_csv.sample(frac=1)
        validation_dataset = ChestXrayDataset("CheXpert-v1.0-small/valid", validation_csv, self.sizes[self.arch], True)
        # Data loader
        validation_loader = torch.utils.data.DataLoader(
            dataset=validation_dataset, batch_size=self.batch_sizes[self.arch], num_workers=8, shuffle=False
        )
        return validation_loader
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer
if __name__ == '__main__':
    archs = ['efficientnetV2_m']
    learning_rates = [0.001]
    num_classes = 5
    for i in range(len(learning_rates)):
        arch = archs[0]
        learning_rate = learning_rates[i]
        logger = TensorBoardLogger(f"tb_logs_binary/{arch}", name=f"{arch}_{learning_rate}_ppv_npv_sensitive")
        model = LitEfficientNet(arch, num_classes, learning_rate)
        trainer = Trainer(
            log_every_n_steps=1411,
            logger=logger,
            accelerator='gpu',
            devices=-1,
            # devices=1,
            # overfit_batches=10,
            max_epochs=50,
            val_check_interval=0.1,
            deterministic=True,
            fast_dev_run=False)
        trainer.fit(model)
        del model, trainer
        gc.collect()
Could anyone suggest an elegant way to eliminate the redundant reruns in a pipeline? The example below shows that MyLogistic.fit() is called 37 times (36 for the CV + 1 for the final refit on the whole data set). For instance, if we take the hyperparameter combinations:
(0.2, 1, 0.01)
(0.2, 1, 0.5)
then it's not necessary to rerun the first step, MyLogistic, with C = 0.2, because its result will be the same (let's ignore the random-seed issue). The same reasoning applies to any combination of the first k hyperparameters: once we have the result of the first k steps, we can recycle it.
Thanks to Ben's suggestion, I used the "memory" argument in Pipeline and the count of calls was reduced to 12 (13). Apparently that's because there are 4 combinations of the select_k_best parameters, times 3 folds. But for C = 0.02 there is no need to call MyLogistic() twice, once for max_features = 1 and once for max_features = 2. Is there a more or less elegant way to avoid those redundant calls as well?
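For reference, the memory change amounts to handing the Pipeline a cache directory (a sketch; the full original code follows below):

from tempfile import mkdtemp

cachedir = mkdtemp()  # fitted transformers are cached here between grid-search candidates
select_k_best_pipeline = Pipeline(steps=[
    ('first_scaler', StandardScaler(with_mean=False)),
    ('select_k_best', select_best),
    ('final_logit', final_logistic)
], memory=cachedir)

Note that the cache key includes all of SelectFromModel's own parameters, max_features among them, which is presumably why MyLogistic is still refit once per max_features value even though its fit result is identical.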
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (GridSearchCV, StratifiedKFold)
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
import numpy as np
class MyLogistic(LogisticRegression):
    __call_counter = 0

    def fit(self, X, y, sample_weight=None):
        print("MyLogistic fit is called.")
        MyLogistic._MyLogistic__call_counter += 1
        # fit() returns self.
        return super().fit(X, y, sample_weight)

# If I use this "extension", everything works fine.
# class MyLogistic(LogisticRegression):
#     pass
initial_logistic = MyLogistic(solver="liblinear", random_state = np.random.RandomState(18))
final_logistic = LogisticRegression(solver="liblinear", random_state = np.random.RandomState(20))
# prefit = False by default
select_best = SelectFromModel(estimator = initial_logistic, threshold = -np.inf)
select_k_best_pipeline = Pipeline(steps=[
    ('first_scaler', StandardScaler(with_mean=False)),
    # initial_logistic will be called from select_best, prefit = False by default.
    ('select_k_best', select_best),
    ('final_logit', final_logistic)
])
select_best_grid = {'select_k_best__estimator__C': [0.02, 0.03],
                    'select_k_best__max_features': [1, 2],
                    'final_logit__C': [0.01, 0.5, 1.0]}
skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 17)
# Use n_jobs = 1 to get the correct count of calls
logit_best_searcher = GridSearchCV(estimator=select_k_best_pipeline, param_grid=select_best_grid, cv=skf,
                                   scoring="roc_auc", n_jobs=1, verbose=4)
X, y = load_iris(return_X_y=True)
logit_best_searcher.fit(X, y > 0)
print("Best hyperparams: ", logit_best_searcher.best_params_)
print("The count of calls is: ", MyLogistic._MyLogistic__call_counter)
My problem is that I always get the following error when I run the code below. The strange thing is that when I set the number of training epochs to 0, the error doesn't show up and the upload works with no problems. Thanks for the help!
I have already tried enabling third-party cookies, which did not help.
Sometimes the error is google.colab._files is undefined.
I have already tried using both Chrome and Firefox.
import tensorflow as tf
import numpy as np
mnist = tf.keras.datasets.fashion_mnist
(training_images, training_labels), (test_images, test_labels) = mnist.load_data()
training_images = training_images.reshape(60000, 28, 28, 1)
training_images = training_images / 255.0
test_images = test_images.reshape(10000, 28, 28, 1)
test_images = test_images / 255.0
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(training_images,training_labels, epochs=1)
classes = model.predict(test_images)
predicted_classes = np.argmax(classes, axis=1)
print(classes[0])
print(test_labels[0])
mnist = tf.keras.datasets.fashion_mnist
(training_images, training_labels), (test_images, test_labels) = mnist.load_data()
import matplotlib.pyplot as plt
plt.imshow(test_images[0], cmap='Greys_r')
import numpy as np
from google.colab import files
from keras.preprocessing import image
import cv2
import matplotlib.pyplot as plt
uploaded = files.upload()
for fn in uploaded.keys():
    path = '/content/' + fn
    img = cv2.imread(path)
    img = cv2.resize(img, (28, 28))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    x = image.img_to_array(img, dtype=np.float32)
    print("top left pixel value:", x[0, 0])
    if x[0, 0] > 250:
        # white background
        print("needs to be inverted!")
        x -= 255
        x *= -1
    x = x / 255.0
    x = x.reshape(1, 28, 28, 1)
    plt.imshow(img, cmap='Greys_r')
    plt.show()
    classes = model.predict(x)
    plt.bar(range(10), classes[0])
    plt.show()
    print("prediction: class", np.argmax(classes[0]))
TypeError: Cannot read property '_uploadFiles' of undefined
So I found out that it works if you use two cells: one for the neural network and one for the upload feature.
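In other words, keep everything up to and including model.fit(...) in the first cell, and put the upload-and-predict part in a second cell, roughly like this (a sketch; it assumes model is already defined by the first cell):

# --- Second cell: run only after the training cell has finished ---
from google.colab import files
import cv2
import numpy as np
import matplotlib.pyplot as plt

uploaded = files.upload()  # opens the browser upload widget
for fn in uploaded.keys():
    img = cv2.cvtColor(cv2.resize(cv2.imread('/content/' + fn), (28, 28)),
                       cv2.COLOR_BGR2GRAY)
    x = img.astype(np.float32)
    if x[0, 0] > 250:  # white background: invert to match MNIST polarity
        x = 255.0 - x
    x = (x / 255.0).reshape(1, 28, 28, 1)
    classes = model.predict(x)  # model comes from the first (training) cell
    plt.bar(range(10), classes[0])
    plt.show()
    print("prediction: class", np.argmax(classes[0]))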
I'm new to TensorFlow & ML and am following this example:
https://www.tensorflow.org/get_started/tflearn
It works very well until I change the hidden_units parameter here:
classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                            hidden_units=[10, 20, 10],
                                            n_classes=3,
                                            model_dir="/tmp/iris_model")
When I try anything else, for example hidden_units=[20, 40, 20] or hidden_units=[20], it throws an error.
I tried to find out on my own but have been unsuccessful so far, and thought someone here could help.
The question is: how do I choose the number of hidden layers for DNNClassifier, and why do my two examples above not work?
Here is a full code:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import urllib
import tensorflow as tf
import numpy as np
IRIS_TRAINING = "iris_training.csv"
IRIS_TRAINING_URL = "http://download.tensorflow.org/data/iris_training.csv"
IRIS_TEST = "iris_test.csv"
IRIS_TEST_URL = "http://download.tensorflow.org/data/iris_test.csv"
if not os.path.exists(IRIS_TRAINING):
    raw = urllib.request.urlopen(IRIS_TRAINING_URL).read()
    with open(IRIS_TRAINING, 'wb') as f:
        f.write(raw)
if not os.path.exists(IRIS_TEST):
    raw = urllib.request.urlopen(IRIS_TEST_URL).read()
    with open(IRIS_TEST, 'wb') as f:
        f.write(raw)
# Load datasets.
training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
    filename=IRIS_TRAINING,
    target_dtype=np.int,
    features_dtype=np.float32)
test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
    filename=IRIS_TEST,
    target_dtype=np.int,
    features_dtype=np.float32)
# Specify that all features have real-value data
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
# Build 3 layer DNN with 10, 20, 10 units respectively.
classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                            hidden_units=[10, 20, 10],
                                            n_classes=3,
                                            model_dir="/tmp/iris_model")
# Define the training inputs
def get_train_inputs():
    x = tf.constant(training_set.data)
    y = tf.constant(training_set.target)
    return x, y
# Fit model.
classifier.fit(input_fn=get_train_inputs, steps=2000)
# Define the test inputs
def get_test_inputs():
    x = tf.constant(test_set.data)
    y = tf.constant(test_set.target)
    return x, y
# Evaluate accuracy.
accuracy_score = classifier.evaluate(input_fn=get_test_inputs,
                                     steps=1)["accuracy"]
print("\nTest Accuracy: {0:f}\n".format(accuracy_score))
Found it: if model_dir is not specified, the model works just fine with new hidden_units. Apparently the estimator tries to restore the checkpoint already saved in model_dir, which was created with the old architecture, so changing hidden_units clashes with it.
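A minimal sketch of the fix, using the same tf.contrib.learn API as above: either omit model_dir, or give each architecture its own fresh directory so the new network never tries to restore a checkpoint saved with different hidden_units (the directory name below is illustrative):

# Option 1: no model_dir, so a temporary directory is used and there is no stale checkpoint.
classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                            hidden_units=[20, 40, 20],
                                            n_classes=3)

# Option 2: a fresh, architecture-specific directory.
classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                            hidden_units=[20, 40, 20],
                                            n_classes=3,
                                            model_dir="/tmp/iris_model_20_40_20")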
I am attempting to use the anneal.arff dataset with Python scikit-learn's semi-supervised algorithm LabelPropagation. The anneal dataset is categorical data, so I preprocessed it so that the output class for each instance looks like [0. 0. 1. 0. 0.]. This is a numeric list that encodes the output class as 5 possible values, with 0's everywhere and 1. in the position of the corresponding class. This is what I would expect.
For semi-supervised learning, most of the training data must be unlabeled, so I modified the training set so that the unlabeled data has output [-1, -1, -1, -1, -1]. I previously tried just using -1, but the code emits the same error as shown below.
I train the classifier as follows; Y_train includes both labeled and "unlabeled" data:
lp_model = LabelSpreading(gamma=0.25, max_iter=5)
lp_model.fit(X, Y_train)
I receive the error shown below after calling the fit method:
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\semi_supervised\label_propagation.py", line 221, in fit
X, y = check_X_y(X, y)
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 526, in check_X_y
y = column_or_1d(y, warn=True)
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 562, in column_or_1d
raise ValueError("bad input shape {0}".format(shape))
ValueError: bad input shape (538, 5)
This suggests that something is wrong with the shape of my Y_train list, but it has exactly the shape I constructed. What am I doing wrong? Can LabelPropagation take training data in this form, or does it only accept unlabeled data marked as a scalar -1?
--- edit ---
Here is the code that generates the error. I'm sorry about the confusion over algorithms: I want to use both LabelSpreading and LabelPropagation, and choosing one or the other doesn't fix this error.
from scipy.io import arff
import pandas as pd
import numpy as np
import math
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from copy import deepcopy
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
f = "../../Documents/UCI/anneal.arff"
dataAsRecArray, meta = arff.loadarff(f)
dataset_raw = pd.DataFrame.from_records(dataAsRecArray)
dataset = pd.get_dummies(dataset_raw)
class_names = [col for col in dataset.columns if 'class_' in col]
print (dataset.shape)
number_of_output_columns = len(class_names)
print (number_of_output_columns)
def run(name, model, dataset, percent):
    # Split-out validation dataset
    array = dataset.values
    X = array[:, 0:-number_of_output_columns]
    Y = array[:, -number_of_output_columns:]
    validation_size = 0.40
    seed = 7
    X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
    num_samples = len(Y_train)
    num_labeled_points = math.floor(percent * num_samples)
    indices = np.arange(num_samples)
    unlabeled_set = indices[num_labeled_points:]
    Y_train[unlabeled_set] = [-1, -1, -1, -1, -1]
    lp_model = LabelSpreading(gamma=0.25, max_iter=5)
    lp_model.fit(X_train, Y_train)
    """
    predicted_labels = lp_model.transduction_[unlabeled_set]
    print(predicted_labels[:10])
    """
if __name__ == "__main__":
    # percentages = [0.1, 0.2, 0.3, 0.4]
    percentages = [0.1]
    models = []
    models.append(('LS', LabelSpreading()))
    # models.append(('CART', DecisionTreeClassifier()))
    # models.append(('NB', GaussianNB()))
    # models.append(('SVM', SVC()))
    # evaluate each model in turn
    results = []
    names = []
    for name, model in models:
        for percent in percentages:
            run(name, model, dataset, percent)
    print("bye")
Your Y_train has shape (538, 5) but should be 1d. LabelPropagation doesn't support multi-label or multi-output multi-class right now.
The error message could be more informative, though :-/
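A minimal sketch of what fit will accept, reusing the question's variable names: collapse the one-hot rows back to integer class ids and mark unlabeled samples with a scalar -1:

percent = 0.1  # fraction of labeled points, as in the question's run()
Y_1d = np.argmax(Y, axis=1)  # one-hot (n_samples, 5) -> integer ids (n_samples,)
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y_1d, test_size=0.40, random_state=7)
num_labeled_points = math.floor(percent * len(Y_train))
unlabeled_set = np.arange(len(Y_train))[num_labeled_points:]
Y_train[unlabeled_set] = -1  # scalar marker per sample, not a 5-element vector
lp_model = LabelSpreading(gamma=0.25, max_iter=5)
lp_model.fit(X_train, Y_train)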