!pip install hyperopt
from hyperopt import hp
from xgboost import XGBClassifier
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK
from hyperopt import fmin, tpe, Trials
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings(action='ignore')
dataset=load_breast_cancer()
cancer_df=pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
x_features=cancer_df.iloc[:,:-1]
y_label=cancer_df.iloc[:,-1]
train_data, test_data, train_target, test_target=train_test_split(x_features, y_label, test_size=0.2)
train_data, val_data, train_target, val_target=train_test_split(train_data, train_target, test_size=0.2)
#XGBClassifier
xgb_search_space={'max_depth':hp.quniform('max_depth',5,20,1),
                  'min_child_weight':hp.quniform('min_child_weight',1,2,1),
                  'colsample_bytree':hp.uniform('colsample_bytree',0.5,1),
                  'learning_rate':hp.uniform('learning_rate',0.01,0.2),
                  }
def objection_func(search_space):
    xgb=XGBClassifier(n_estimators=10, max_depth=int(search_space['max_depth']),
                      min_child_weight=int(search_space['min_child_weight']),
                      learning_rate=search_space['learning_rate'],
                      colsample_bytree=search_space['colsample_bytree'],
                      eval_metric='mlogloss'
                      )
    accuracy=cross_val_score(xgb, train_data, train_target, scoring='accuracy', cv=5)
    #accuracy=cross_validate(xgb, train_data, train_target, return_train_score=True, cv=5)
    return {'loss':-1*np.mean(accuracy), 'status':STATUS_OK}
trials=Trials()
best_01=fmin(
    fn=objection_func,
    space=xgb_search_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials
)
print('best:', best_01)
This is the entire code, dataset included. Every trial reports loss: nan and I can't figure out what is causing it. Where am I going wrong? Any help would be appreciated.
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN
from sklearn.feature_selection import SelectKBest, mutual_info_classif
# define dataset
X, Y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
# define pipeline (model is one of the classifiers under evaluation)
model = DecisionTreeClassifier()
steps = [('over', ADASYN()), ('selector', SelectKBest(mutual_info_classif, k=5)), ('model', model)]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, Y, scoring='roc_auc', cv=cv, n_jobs=-1)
score = mean(scores)
print('Mean ROC AUC: %.3f' % mean(scores))
I am implementing this for multiple models (naive Bayes, decision tree, logistic regression, and random forest) and want to apply feature selection, ADASYN oversampling, and cross-validation through a single pipeline, roughly as sketched below.
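This is a sketch of what I am aiming for; the classifier settings are placeholders, and n_features is raised to 20 here so that SelectKBest(k=5) has enough columns to choose from:
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN

X, Y = make_classification(n_samples=10000, n_features=20, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)

# the models I want to compare
models = {
    'naive_bayes': GaussianNB(),
    'decision_tree': DecisionTreeClassifier(),
    'logistic_regression': LogisticRegression(max_iter=1000),
    'random_forest': RandomForestClassifier(),
}

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
for name, model in models.items():
    # oversample, select features, then fit the model inside each CV fold
    pipeline = Pipeline([('over', ADASYN()),
                         ('selector', SelectKBest(mutual_info_classif, k=5)),
                         ('model', model)])
    scores = cross_val_score(pipeline, X, Y, scoring='roc_auc', cv=cv, n_jobs=-1)
    print('%s mean ROC AUC: %.3f' % (name, mean(scores)))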
I'm new to the world of machine learning and more generally to AI.
I am analyzing a dataset containing characteristics of different houses and their prices using Python and JupyterLab.
Here is the dataset in use:
https://www.kaggle.com/datasets/harlfoxem/housesalesprediction
I applied a random forest (scikit-learn) to this dataset and now I would like to plot the model's error bars.
Specifically, I'm using the ForestCI package and applying exactly this code to my case:
http://contrib.scikit-learn.org/forest-confidence-interval/auto_examples/plot_mpg.html
This is my code:
# Regression Forest Example
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
from sklearn.metrics import r2_score
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import sklearn.model_selection as xval
import forestci as fci
#import dataset
mpg_data = pd.read_csv(path_to_dataset)
#drop some useless features
mpg_data=mpg_data.drop('date', axis=1)
mpg_data=mpg_data.drop('yr_built', axis=1)
mpg_data = mpg_data.drop(["id"],axis=1)
#separate mpg data into predictors and outcome variable
mpg_X = mpg_data.drop(labels='price', axis=1)
mpg_y = mpg_data['price']
# remove rows where the data is nan
not_null_sel = np.where(mpg_X.isna().sum(axis=1).values == 0)
mpg_X = mpg_X.values[not_null_sel]
mpg_y = mpg_y.values[not_null_sel]
# split mpg data into training and test set
mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split(
    mpg_X, mpg_y, test_size=0.25, random_state=42)
# Create RandomForestRegressor
mpg_forest = RandomForestRegressor(random_state=42)
mpg_forest.fit(mpg_X_train, mpg_y_train)
mpg_y_hat = mpg_forest.predict(mpg_X_test)
# Plot predicted MPG without error bars
plt.scatter(mpg_y_test, mpg_y_hat)
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()
print(r2_score(mpg_y_test, mpg_y_hat))
# Calculate the variance
mpg_V_IJ_unbiased = fci.random_forest_error(mpg_forest, mpg_X_train, mpg_X_test)
# Plot error bars for predicted MPG using unbiased variance
plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt='o')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()
It seems to work, but when the graphs are plotted neither the error bars nor the reference line appear.
Instead, as shown in the documentation, it should look like the picture here: http://contrib.scikit-learn.org/forest-confidence-interval/auto_examples/plot_mpg.html
You forgot to add this line:
plt.plot([5, 45], [5, 45], 'k--')
Your code should look like this:
plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt='o')
plt.plot([5, 45], [5, 45], 'k--')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()
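Note that the [5, 45] endpoints are taken from the MPG example; since your target is house prices, the reference line would presumably need to span that range instead, for example:
lims = [mpg_y_test.min(), mpg_y_test.max()]  # span of the actual target values
plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt='o')
plt.plot(lims, lims, 'k--')  # y = x reference line over the price range
plt.xlabel('Reported price')
plt.ylabel('Predicted price')
plt.show()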
My code is very basic, just following the first lab from DeepLearning.ai's GAN specialization. However, my code does not produce the same output; what could be the reason for this? Sorry if this is just a silly mistake; this is my first experience with GANs. I begin by creating the Generator and Discriminator classes, my random noise function, and my models. I then run the training loop, but after 3 epochs all of the outputs from the GAN are black.
import torch
from torch import nn
from tqdm.auto import tqdm
from torchvision import transforms
from torchvision.datasets import MNIST
from torchvision.utils import make_grid
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
torch.manual_seed(0)
def show_tensor_images(image_tensor,num_images=25,size=(1,28,28)):
    image_unflat=image_tensor.detach().cpu().view(-1,*size)
    image_grid=make_grid(image_unflat[:num_images],nrow=5)
    plt.imshow(image_grid.permute(1,2,0).squeeze())
    plt.show()
class Generator(nn.Module):
    def __init__(self,z_dim):
        super(Generator,self).__init__()
        self.linear1=nn.Linear(z_dim,128)
        self.bn1=nn.BatchNorm1d(128)
        self.linear2=nn.Linear(128,256)
        self.bn2=nn.BatchNorm1d(256)
        self.linear3=nn.Linear(256,512)
        self.bn3=nn.BatchNorm1d(512)
        self.linear4=nn.Linear(512,1024)
        self.bn4=nn.BatchNorm1d(1024)
        self.linear5=nn.Linear(1024,784)
        self.relu=nn.ReLU(True)
        self.sigmoid=nn.Sigmoid()
    def forward(self,x):
        x=self.linear1(x)
        x=self.bn1(x)
        x=self.relu(x)
        x=self.linear2(x)
        x=self.bn2(x)
        x=self.relu(x)
        x=self.linear3(x)
        x=self.bn3(x)
        x=self.relu(x)
        x=self.linear4(x)
        x=self.bn4(x)
        x=self.relu(x)
        x=self.linear5(x)
        x=self.sigmoid(x)
        return(x)
def get_noise(n_samples,z_dim,device='cpu'):
    return torch.randn(n_samples,z_dim,device=device)
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator,self).__init__()
        self.linear1=nn.Linear(784,512)
        self.linear2=nn.Linear(512,256)
        self.linear3=nn.Linear(256,128)
        self.linear4=nn.Linear(128,1)
        self.relu=nn.LeakyReLU(0.2,True)
    def forward(self,x):
        x=self.linear1(x)
        x=self.relu(x)
        x=self.linear2(x)
        x=self.relu(x)
        x=self.linear3(x)
        x=self.relu(x)
        x=self.linear4(x)
        return(x)
criterion=nn.BCEWithLogitsLoss()
epochs=200
z_dim=64
display_step=500
batch_size=128
lr=0.00001
device='cuda'
dataloader=DataLoader(MNIST('.',download=True,transform=transforms.ToTensor()),batch_size=batch_size,shuffle=True)
gen=Generator(z_dim).to(device)
gen_opt=torch.optim.Adam(gen.parameters(),lr=lr)
disc=Discriminator().to(device)
disc_opt=torch.optim.Adam(disc.parameters(),lr=lr)
def get_disc_loss(gen,disc,criterion,real,num_images,z_dim,device):
    noise=get_noise(num_images,z_dim,device=device)
    gen_out=gen(noise)
    disc_fake_out=disc(gen_out.detach())
    fake_loss=criterion(disc_fake_out,torch.zeros_like(disc_fake_out))
    disc_real_out=disc(real)
    real_loss=criterion(disc_real_out,torch.zeros_like(disc_real_out))
    disc_loss=(fake_loss+real_loss)/2
    return(disc_loss)
def get_gen_loss(gen,disc,criterion,num_images,z_dim,device):
    noise=get_noise(num_images,z_dim,device=device)
    gen_out=gen(noise)
    disc_out=disc(gen_out)
    loss=criterion(disc_out,torch.ones_like(disc_out))
    return loss
cur_step=0
mean_generator_loss=0
mean_discriminator_loss=0
gen_loss=False
error=False
for epoch in range(epochs):
    for x,_ in tqdm(dataloader):
        cur_batch_size=len(x)
        x=x.view(cur_batch_size,-1).to(device)
        disc_opt.zero_grad()
        disc_loss=get_disc_loss(gen,disc,criterion,x,cur_batch_size,z_dim,device)
        disc_loss.backward(retain_graph=True)
        disc_opt.step()
        gen_opt.zero_grad()
        gen_loss=get_gen_loss(gen,disc,criterion,cur_batch_size,z_dim,device)
        gen_loss.backward()
        gen_opt.step()
        mean_discriminator_loss+=disc_loss.item()/display_step
        mean_generator_loss+=gen_loss.item()/display_step
        if cur_step%display_step==0 and cur_batch_size>0:
            print(f"Step {cur_step}: Generator loss: {mean_generator_loss}, discriminator loss: {mean_discriminator_loss}")
            fake_noise = get_noise(cur_batch_size, z_dim, device=device)
            fake = gen(fake_noise)
            show_tensor_images(fake)
            show_tensor_images(x)
            mean_generator_loss = 0
            mean_discriminator_loss = 0
        cur_step += 1
Your discriminator loss is wrong: the labels for the real images should be 1 instead of 0. As written, the discriminator is trained to call everything fake, so the generator never gets a useful gradient pushing its outputs toward realistic digits.
Updated code:
def get_disc_loss(gen,disc,criterion,real,num_images,z_dim,device):
    noise=get_noise(num_images,z_dim,device=device)
    gen_out=gen(noise)
    disc_fake_out=disc(gen_out.detach())
    fake_loss=criterion(disc_fake_out,torch.zeros_like(disc_fake_out))
    disc_real_out=disc(real)
    real_loss=criterion(disc_real_out,torch.ones_like(disc_real_out))
    disc_loss=(fake_loss+real_loss)/2
    return(disc_loss)
After this fix, the output image looks pretty good to me.
I built a Keras regressor using the following code:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as ny
import pandas
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)
X = ny.array([[1,2], [3,4], [5,6], [7,8], [9,10]])
sc_X=StandardScaler()
X_train = sc_X.fit_transform(X)
Y = ny.array([3, 4, 5, 6, 7])
Y=ny.reshape(Y,(-1,1))
sc_Y=StandardScaler()
Y_train = sc_Y.fit_transform(Y)
N = 5
def brain():
    #Create the brain
    br_model=Sequential()
    br_model.add(Dense(3, input_dim=2, kernel_initializer='normal',activation='relu'))
    br_model.add(Dense(2, kernel_initializer='normal',activation='relu'))
    br_model.add(Dense(1,kernel_initializer='normal'))
    #Compile the brain
    br_model.compile(loss='mean_squared_error',optimizer='adam')
    return br_model

def predict(X,sc_X,sc_Y,estimator):
    prediction = estimator.predict(sc_X.fit_transform(X))
    return sc_Y.inverse_transform(prediction)

estimator = KerasRegressor(build_fn=brain, epochs=1000, batch_size=5,verbose=0)
# print "Done"
estimator.fit(X_train,Y_train)
prediction = estimator.predict(X_train)
print(predict(X,sc_X,sc_Y,estimator))

X_test = ny.array([[1.5,4.5], [7,8], [9,10]])
print(predict(X_test,sc_X,sc_Y,estimator))
The issue I face is that the code does not predict the same value for the same input: for example, it predicts 6.64 for [9,10] in the first prediction (X) and 6.49 for [9,10] in the second prediction (X_test).
The full output is this:
[2.9929883 4.0016675 5.0103474 6.0190268 6.6434317]
[3.096634 5.422326 6.4955378]
Why do I get different values and how do I resolve them?
The problem lies in this line of code:
prediction = estimator.predict(sc_X.fit_transform(X))
You are fitting a new scaler every time you predict values for new data; that is where the differences come from. Try:
prediction = estimator.predict(sc_X.transform(X))
This way you reuse the scaler that was already fitted on the training data.
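For reference, here is the question's predict helper with only that one call changed (the reshape note is an assumption about newer scikit-learn versions, which expect 2D input for inverse_transform):
def predict(X, sc_X, sc_Y, estimator):
    # Reuse the scalers already fitted on the training data; do not refit them here.
    prediction = estimator.predict(sc_X.transform(X))
    # On newer scikit-learn versions, prediction may need prediction.reshape(-1, 1)
    # before inverse_transform, which expects a 2D array.
    return sc_Y.inverse_transform(prediction)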
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
import numpy as np
import random
from sklearn.svm import SVC
X=np.random.rand(1000,2)
Y=[random.randint(0,1) for x in range(0,1000)]
svm=BaggingClassifier(SVC(kernel='rbf', random_state=123, gamma=.000001, C=100000, class_weight='balanced'), max_samples=1/5.0, n_estimators=5, n_jobs=-1,random_state=123)
classifier=svm.fit(X,Y)
print(len(svm.estimators_samples_))
print(len(svm.estimators_samples_[0]))  # here I expect 1/5 * 1000 = 200 samples, but the result is 1000
In this code I try to apply BaggingClassifier with an SVM base estimator. As discussed in the scikit-learn documentation, max_samples fixes the maximum number of samples used to train each estimator. However, I notice that each of the estimators (n_estimators=5) seems to use the whole dataset! Is this a bug?
svm.estimators_samples_[0] returns an array whose length equals the length of the data. It is populated with boolean values; the entries equal to True mark the data points (by index) that were used to train that estimator.
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
import numpy as np
import random
from sklearn.svm import SVC
X=np.random.rand(1000,2)
Y=[random.randint(0,1) for x in range(0,1000)]
svm=BaggingClassifier(SVC(kernel='rbf', random_state=123, gamma=.000001, C=100000, class_weight='balanced'), max_samples=1/5.0, n_estimators=5, n_jobs=-1,random_state=123)
classifier=svm.fit(X,Y)
print(len([i for i in svm.estimators_samples_[0] if i == True]))
Running the above code I get:
181
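For completeness, a small sketch (assuming the boolean-mask behaviour described above) of recovering the rows actually used by the first base estimator; the count is below max_samples * n_samples = 200 because bagging draws samples with replacement by default, so some rows are drawn more than once:
import numpy as np

mask = np.asarray(svm.estimators_samples_[0])  # boolean mask over the 1000 rows
X_used = X[mask]                               # unique rows seen by the first estimator
print(mask.sum(), X_used.shape)                # about 181 unique rows out of 200 draws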