How to make subplots consisting of 6 charts to charts from the scikitplot library - subplot

I have six models and want to evaluate them with a ROC chart
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LassoCV
NBC = GaussianNB()
LRE = LogisticRegression(solver='lbfgs')
GBC = GradientBoostingClassifier()
RFC = RandomForestClassifier()
LGBM = LGBMClassifier()
CBC = CatBoostClassifier(verbose=0, n_estimators=100)
classifiers = [NBC,LRE,GBC,RFC,LGBM,CBC]
for cls in classifiers:
cls.fit(X_train, y_train)
now I'm making charts for this
import scikitplot as skplt
for cls in classifiers:
skplt.metrics.plot_roc(y_test, cls.predict_proba(X_test),figsize=(6, 3),title=type(cls).__name__)
plt.tight_layout()
plt.show()
But I want subplot charts! Unfortunately, this does not work out. I need this solution for a whole series of problems with scikitplot charts.
classifiers = [NBC,LRE,GBC,RFC,LGBM,CBC]
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
plt.figure(figsize=(15,7))
grid = plt.GridSpec(2, 3, wspace=0.3, hspace=0.4)
for i in range(6):
col, row = i%3,i//3
ax = plt.subplot(grid[row,col])
ax.title.set_color('blue')
#model = classifiers[i]
#skplt.metrics.plot_roc(y_test, model.predict_proba(X_test),figsize=(6, 3),title=type(cls).__name__)
plt.tight_layout()
plt.show()

you need to specify the ax argument in skplt.metrics.plot_roc
plt.figure(figsize=(15,7))
grid = plt.GridSpec(2, 3, wspace=0.3, hspace=0.4)
for i in range(6):
col, row = i%3,i//3
ax = plt.subplot(grid[row,col])
ax.title.set_color('blue')
model = classifiers[i]
skplt.metrics.plot_roc(y_test, model.predict_proba(X_test), ax=ax, title=type(cls).__name__)
plt.show()

Related

How to predict the outcome of a new patient using this SVC ML model trained on a dataset

Data description:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
np.random.seed(0)
# Alternatively X and y can be obtained directly from the frame attribute:
X = df.drop('OUTCOME', axis=1)
y = df['OUTCOME']
numeric_features = ['Age', 'PCT' , 'CURB 65' , 'pO2' ]
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())])
categorical_features = ['Sex', 'CXR' , 'Hospitalisation in last 3 months' , 'ICU>72hrs', 'Blood']
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant')),
('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)])
from sklearn import linear_model
from sklearn.svm import SVC
clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', SVC())])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Accuracy of the model is 1.0
How to enter new patient's parameter to get a prediction for that patient? New patient is not from this dataset.
i tried to enter new patient information as follows
New_patient = [70,1,0,2,2,10,40,2,20,68,1,2,0,1,1]
clf.predict([New_patient])
output: ValueError: Specifying the columns using strings is only supported for pandas DataFrames

ROC curve is not acutally a curve

I have plotted few ROC curve to calculate the AUC. I am having ROC curve is actually doesn't plots like a curve. I have attached the images for better understanding. If any one can tell me what is wrong in there. I will be obliged. This is one kind of plot I am getting
This is the another type
However I am not getting a curve like this one.
This is the link to my dataset
https://drive.google.com/open?id=1luj8d863_IOA36cQTo772GEWgUsrXlbJ
I will thankful if anyone can help me understand the problem if any or if my curves are correct then why it is not actually in a curve like structure
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from keras.layers import Dense, Input
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Model,Sequential
from keras.utils import np_utils
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from itertools import cycle
from sklearn.metrics import roc_curve, auc
from imblearn.over_sampling import SMOTE
seed = 7
np.random.seed(seed)
dataset = pd.read_csv('dataset/prostate.csv')
labels = dataset.values[:,-1]
features_set = dataset.iloc[:,0:12600]
oversampler = SMOTE(random_state=0)
oversampler_feature_set, oversampler_labels = oversampler.fit_sample(features_set,labels)
feature_df = pd.DataFrame(oversampler_feature_set)
labels_df = pd.DataFrame(oversampler_labels)
scalar = MinMaxScaler()
scaled_data = scalar.fit_transform(feature_df)
pca = PCA(n_components=30)
pca_data = pd.DataFrame(pca.fit_transform(scaled_data))
recreated_df = pd.concat([pca_data,labels_df], axis=1)
train, test = train_test_split(recreated_df,test_size=0.2)
X_train = train.values[:,0:30]
Y_train = train.values[:,-1]
X_test = test.values[:,0:30]
y_test = test.values[:,-1]
def my_model():
model = Sequential()
model.add(Dense(20, input_dim=30,activation='sigmoid'))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
return model
estimator = KerasClassifier(build_fn=my_model, epochs=1000, batch_size=10, shuffle=True,verbose=1)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator,X_train,Y_train, cv=kfold)
results.mean()
estimator.fit(X_train,Y_train)
y_pred = estimator.predict(X_test).ravel()
sensitivity, specificity, thresholds_keras = roc_curve(y_test,y_pred,pos_label=2)
auc_keras = auc(sensitivity,specificity)
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(sensitivity, specificity, label='Keras (area =:.3f})'.format(auc_keras))
plt.xlabel('Specificity')
plt.ylabel('Sensitivity')
plt.title('Prostate')
plt.legend(loc='best')
plt.show()

Random Forest on Categorical Data with low accuracy

I am trying to build a model that given an item, predicts which store it belongs to.
I have a data-set of ~300 records which are supposed to be items in different online stores.
Each record is composed of: Category,Sub Category,Price,Store Identifier(The y variable)
The data seems balanced as every store has around ~10 items.
With the help of #Marcus V. I succeeded encoding the categorical columns correctly. But can not produce better results than 0.52 for a RandomForest with 15 estimators and an entropy criterion.
I feel like much more can be done here. What am I missing?
This is the data: https://pastebin.com/z3eZc0vK
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
class Columns(BaseEstimator, TransformerMixin):
def __init__(self, names=None):
self.names = names
def fit(self, X, y=None, **fit_params):
return self
def transform(self, X):
return X.loc[:,self.names]
dataset = pd.read_csv('data.csv', header=None)
dataset.columns = ["cat1", "cat2", "num1", "target"]
# dataset.columns = ["cat1", "cat2", "target"]
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, 3]
labelencoder_X_0 = LabelEncoder()
X.iloc[:, 0] = labelencoder_X_0.fit_transform(X.iloc[:, 0])
labelencoder_X_1 = LabelEncoder()
X.iloc[:, 1] = labelencoder_X_1.fit_transform(X.iloc[:, 1])
numeric = ["num1"]
categorical = ["cat1", "cat2"]
pipe = Pipeline([
("features", FeatureUnion([
('numeric', make_pipeline(Columns(names=numeric),StandardScaler())),
('categorical', make_pipeline(Columns(names=categorical), OneHotEncoder(sparse=False)))
])),
])
X = pipe.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
classifier = RandomForestClassifier(n_estimators=15, criterion='entropy', random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = classifier.score(X_test, y_test)
print(accuracy)

Using an autoencoder to reduce dimensionality

Here is my version of an autoencoder written using PyTorch :
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import metrics
import datetime
from sklearn.preprocessing import MultiLabelBinarizer
import seaborn as sns
sns.set_style("darkgrid")
from ast import literal_eval
import numpy as np
from sklearn.preprocessing import scale
import seaborn as sns
sns.set_style("darkgrid")
import torch
%matplotlib inline
f = []
f.append(np.random.uniform(0,10,(1 , 10)).flatten())
f.append(np.random.uniform(10,20,(1 , 10)).flatten())
f.append(np.random.uniform(20,30,(1 , 10)).flatten())
x_data = torch.FloatTensor(np.array(f))
x_data
dimensions_input = 10
hidden_layer_nodes = 5
output_dimension = 10
class Model(torch.nn.Module):
def __init__(self):
super(Model, self).__init__()
self.linear = torch.nn.Linear(dimensions_input,hidden_layer_nodes)
self.sigmoid = torch.nn.Sigmoid()
self.linear2 = torch.nn.Linear(hidden_layer_nodes,output_dimension)
def forward(self, x):
l_out1 = self.linear(x)
l_out2 = self.sigmoid(l_out1)
y_pred = self.linear2(l_out2)
return y_pred
model = Model()
criterion = torch.nn.MSELoss(size_average = False)
optim = torch.optim.SGD(model.parameters(), lr = 0.00001)
def train_model():
y_data = x_data.clone()
for i in range(150000):
y_pred = model(x_data)
loss = criterion(y_pred, y_data)
if i % 5000 == 0:
print(loss)
optim.zero_grad()
loss.backward()
optim.step()
Using x_data.clone() I train the network to learn a feature representation of the input data.
I'm attempting to generate hidden layer weights that match the dimensionality of rows of the input data so that each vector of x_data has a corresponding encoding. But the hidden later is of is a vector of size 5. How to change this network so that a matrix is generated that represents a reduced dimensionality of the input data ?

scikit-learn mlxtend EnsembleVoteClassifier with sample_weights

I am trying to fit an EnsembleVoteClassifier according to mlxtend documentation
For normal grid.fit I can use fit_params to set sample_weight, but with the VotingClassifier it does not work. How can this be solved?
from sklearn import datasets
iris = datasets.load_iris()
X, y = iris.data[:, :], iris.target
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
'''Creating a feature-selection-classifier pipeline'''
sfs1 = SequentialFeatureSelector(clf1,
k_features=4,
forward=True,
floating=False,
scoring='accuracy',
verbose=0,
cv=0)
clf1_pipe = Pipeline([('sfs', sfs1),
('logreg', clf1)])
eclf = EnsembleVoteClassifier(clfs=[clf1_pipe, clf2, clf3],
voting='soft')
params = {'pipeline__sfs__k_features': [1, 2, 3],
'pipeline__logreg__C': [1.0, 100.0],
'randomforestclassifier__n_estimators': [20, 200]}
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
sample_weights = [1] * len(iris.target)
grid.fit(iris.data, iris.target,**{'pipeline__logreg__sample_weight': sample_weights})

Resources