scikit-learn mlxtend EnsembleVoteClassifier with sample_weights - machine-learning

I am trying to fit an EnsembleVoteClassifier according to the mlxtend documentation.
With a normal grid.fit I can use fit_params to set sample_weight, but with the VotingClassifier it does not work. How can this be solved?
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector

# Load iris; X and y hold the full feature matrix and the targets.
iris = datasets.load_iris()
X, y = iris.data[:, :], iris.target

# The three base estimators that take part in the soft vote.
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

'''Creating a feature-selection-classifier pipeline'''
sfs1 = SequentialFeatureSelector(
    clf1,
    k_features=4,
    forward=True,
    floating=False,
    scoring='accuracy',
    verbose=0,
    cv=0,
)
clf1_pipe = Pipeline([('sfs', sfs1), ('logreg', clf1)])

eclf = EnsembleVoteClassifier(clfs=[clf1_pipe, clf2, clf3], voting='soft')

# Hyper-parameter grid, addressed through the nested estimator names.
params = {
    'pipeline__sfs__k_features': [1, 2, 3],
    'pipeline__logreg__C': [1.0, 100.0],
    'randomforestclassifier__n_estimators': [20, 200],
}
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)

# Uniform weights, one per sample, routed to the 'logreg' step by name.
# NOTE(review): this is the call the accompanying question reports as not
# working; it is reproduced unchanged.
sample_weights = [1] * len(iris.target)
fit_params = {'pipeline__logreg__sample_weight': sample_weights}
grid.fit(iris.data, iris.target, **fit_params)

Related

Semantic segmentation result is all black: all bits are the same, no mask

When I use the code below to perform semantic segmentation on my own dataset (40 images) and annotations (1 class (myface), with the annotations in COCO JSON format), I get just a black image with no mask, all bits are the same, and the model accuracy in all epochs is 67.87%, but the loss is going down in each epoch:
import cv2
import os
import json
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D
from tensorflow.keras.models import Model
from pycocotools.coco import COCO
from tensorflow.keras.applications import VGG16
from sklearn.utils import compute_sample_weight
# Load every file in the dataset folder as a 256x256 image and stack the
# results into one training array.
folder_path = "D:\\ImageClassification\\face_semantic_segmentation\\dataset"
# NOTE(review): os.listdir returns names in arbitrary order, while the masks
# are built in annotation order -- confirm images and masks actually line up.
filenames = os.listdir(folder_path)
images = []
for filename in filenames:
    img = cv2.imread(os.path.join(folder_path, filename))
    img = cv2.resize(img, (256, 256))
    img = np.array(img)
    images.append(img)
x_train = np.array(images)
# Read the annotation file and collect, per image id, the 'segmentation'
# entry of every annotation that belongs to that image.
with open("D:\\ImageClassification\\face_semantic_segmentation\\annotations.json") as f:
    coco = json.load(f)
annotations = coco['annotations']
masks = {}
for annotation in annotations:
    image_id = annotation['image_id']
    if image_id not in masks:
        masks[image_id] = []
    masks[image_id].append(annotation['segmentation'])

# Rasterise each image's polygons onto a blank canvas, resize the canvas to
# 256x256 and replicate it into three identical channels.
resized_masks = []
for image_id, mask in masks.items():
    # NOTE(review): the 720x1280 canvas is hard-coded -- presumably the size of
    # the source images; confirm.
    mask_img = np.zeros((720, 1280), dtype=np.uint8)
    for segmentation in mask:
        poly = np.array(segmentation).reshape((-1, 1, 2)).astype(np.int32)
        # Pixels inside the polygon are set to 1, so the mask holds only 0 and 1.
        cv2.fillPoly(mask_img, [poly], 1)
    mask_img = cv2.resize(mask_img, (256, 256))
    mask_img = np.stack([mask_img] * 3, axis=-1)
    resized_masks.append(mask_img)
y_train = np.array(resized_masks)
y_train.shape
import matplotlib.pyplot as plt
import numpy as np
# Sum the mask at index 4 over its last (channel) axis and display it.
mask = y_train[4].sum(axis=-1)
plt.imshow(mask)
plt.show()
from segmentation_models import Unet
from segmentation_models import get_preprocessing
from segmentation_models.losses import bce_jaccard_loss
from segmentation_models.metrics import iou_score
from tensorflow.keras.models import Model
# Backbone-specific input preprocessing, applied to the training images.
BACKBONE = 'resnet50'
preprocess_input = get_preprocessing(BACKBONE)
x_train = preprocess_input(x_train)

# U-Net with an ImageNet-initialised encoder and a two-channel sigmoid output.
model = Unet(
    BACKBONE,
    classes=2,
    input_shape=(256, 256, 3),
    encoder_weights='imagenet',
    activation='sigmoid',
)

# Extra 1x1 convolution so the network emits three channels, matching the
# three-channel masks.
# NOTE(review): this stacks a second sigmoid on top of the U-Net's own sigmoid
# output -- confirm that is intended.
x = Conv2D(3, (1, 1), activation='sigmoid')(model.layers[-1].output)
model = Model(inputs=model.input, outputs=x)

model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)
model.fit(x=x_train, y=y_train, batch_size=32, epochs=50)

How to make subplots consisting of 6 charts from the scikitplot library

I have six models and want to evaluate them with a ROC chart
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LassoCV
# Instantiate the six models to compare.
NBC = GaussianNB()
LRE = LogisticRegression(solver='lbfgs')
GBC = GradientBoostingClassifier()
RFC = RandomForestClassifier()
LGBM = LGBMClassifier()
CBC = CatBoostClassifier(verbose=0, n_estimators=100)
classifiers = [NBC, LRE, GBC, RFC, LGBM, CBC]

# Fit every model on the same training split (X_train / y_train are defined
# outside this snippet).
for cls in classifiers:
    cls.fit(X_train, y_train)
Now I'm making charts for this:
import scikitplot as skplt

# Draw one ROC chart per fitted classifier, titled with the model's class name.
# NOTE(review): the original indentation was lost; tight_layout() and show()
# are assumed to belong to the loop body so that each chart is laid out and
# displayed on its own -- confirm.
for cls in classifiers:
    skplt.metrics.plot_roc(y_test, cls.predict_proba(X_test), figsize=(6, 3), title=type(cls).__name__)
    plt.tight_layout()
    plt.show()
But I want subplot charts! Unfortunately, this does not work out. I need this solution for a whole series of problems with scikitplot charts.
classifiers = [NBC, LRE, GBC, RFC, LGBM, CBC]
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve

# Lay out a 2x3 grid of axes, one cell per classifier.
plt.figure(figsize=(15, 7))
grid = plt.GridSpec(2, 3, wspace=0.3, hspace=0.4)
for i in range(6):
    # Fill the grid row by row: i -> (row, col).
    row, col = divmod(i, 3)
    ax = plt.subplot(grid[row, col])
    ax.title.set_color('blue')
    # The plotting call is left disabled here: this is the part the question
    # could not get to draw into the subplot.
    #model = classifiers[i]
    #skplt.metrics.plot_roc(y_test, model.predict_proba(X_test),figsize=(6, 3),title=type(cls).__name__)
plt.tight_layout()
plt.show()
You need to pass the ax argument to skplt.metrics.plot_roc:
# Draw the ROC chart of each classifier into its own cell of a 2x3 grid by
# handing the target axes to plot_roc through its `ax` argument.
plt.figure(figsize=(15, 7))
grid = plt.GridSpec(2, 3, wspace=0.3, hspace=0.4)
# The 2x3 grid is sized for the six entries of `classifiers`.
for i, model in enumerate(classifiers):
    row, col = divmod(i, 3)
    ax = plt.subplot(grid[row, col])
    ax.title.set_color('blue')
    # Title each chart with the model actually plotted; the earlier version
    # used the stale `cls` variable, so every subplot got the same title.
    skplt.metrics.plot_roc(y_test, model.predict_proba(X_test), ax=ax, title=type(model).__name__)
plt.show()

How to predict the outcome of a new patient using this SVC ML model trained on a dataset

Data description:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import linear_model
from sklearn.svm import SVC

# Seed NumPy's global RNG before the train/test split below.
np.random.seed(0)

# Alternatively X and y can be obtained directly from the frame attribute:
X = df.drop('OUTCOME', axis=1)
y = df['OUTCOME']

# Numeric columns: median imputation followed by standardisation.
numeric_features = ['Age', 'PCT' , 'CURB 65' , 'pO2' ]
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: constant-value imputation followed by one-hot encoding
# that ignores categories unseen during fitting.
categorical_features = ['Sex', 'CXR' , 'Hospitalisation in last 3 months' , 'ICU>72hrs', 'Blood']
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns, selected by name, to its transformer.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Full model: preprocessing followed by a support-vector classifier.
clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', SVC())])

# Hold out 20% of the rows, fit on the rest and score on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
The accuracy of the model is 1.0.
How do I enter a new patient's parameters to get a prediction for that patient? The new patient is not from this dataset.
I tried to enter the new patient's information as follows:
# Feature values for one new patient, given as a plain positional list.
# NOTE(review): the ColumnTransformer inside `clf` selects its columns by name
# ('Age', 'Sex', ...), so predict() needs a pandas DataFrame whose column names
# match the training frame; a bare list produces the ValueError quoted below.
# A likely fix is pd.DataFrame([New_patient], columns=X.columns) -- this
# assumes the 15 values are in the same order as X's columns; confirm.
New_patient = [70,1,0,2,2,10,40,2,20,68,1,2,0,1,1]
clf.predict([New_patient])
output: ValueError: Specifying the columns using strings is only supported for pandas DataFrames

ROC curve is not actually a curve

I have plotted a few ROC curves to calculate the AUC, but my ROC curve does not actually look like a curve. I have attached the images for better understanding. If anyone can tell me what is wrong there, I would be obliged. This is one kind of plot I am getting
This is another type
However, I am not getting a curve like this one.
This is the link to my dataset
https://drive.google.com/open?id=1luj8d863_IOA36cQTo772GEWgUsrXlbJ
I would be thankful if anyone could help me understand the problem, if there is one, or, if my curves are correct, why they do not actually have a curve-like shape.
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from keras.layers import Dense, Input
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Model,Sequential
from keras.utils import np_utils
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from itertools import cycle
from sklearn.metrics import roc_curve, auc
from imblearn.over_sampling import SMOTE
# Fix the NumPy RNG so the run is repeatable.
seed = 7
np.random.seed(seed)

# The last column holds the labels; the first 12600 columns are the features.
dataset = pd.read_csv('dataset/prostate.csv')
labels = dataset.values[:, -1]
features_set = dataset.iloc[:, 0:12600]

# Oversample with SMOTE. `fit_resample` is the current name of this method;
# the older `fit_sample` alias no longer exists in recent imbalanced-learn
# releases.
# NOTE(review): resampling, scaling and PCA are all fitted on the full data
# before the train/test split below, so test rows influence them -- confirm
# this is acceptable for the evaluation.
oversampler = SMOTE(random_state=0)
oversampler_feature_set, oversampler_labels = oversampler.fit_resample(features_set, labels)
feature_df = pd.DataFrame(oversampler_feature_set)
labels_df = pd.DataFrame(oversampler_labels)

# Scale every feature to [0, 1], then keep the first 30 principal components.
scalar = MinMaxScaler()
scaled_data = scalar.fit_transform(feature_df)
pca = PCA(n_components=30)
pca_data = pd.DataFrame(pca.fit_transform(scaled_data))

# Re-attach the labels as the last column and split 80/20.
recreated_df = pd.concat([pca_data, labels_df], axis=1)
train, test = train_test_split(recreated_df, test_size=0.2)
X_train = train.values[:, 0:30]
Y_train = train.values[:, -1]
X_test = test.values[:, 0:30]
y_test = test.values[:, -1]
def my_model():
    """Build and compile the binary classifier used by KerasClassifier.

    The network takes 30 input features and has two sigmoid hidden layers
    (20 and 10 units) followed by a single sigmoid output unit. It is
    compiled with binary cross-entropy loss, the Adam optimizer and an
    accuracy metric.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    model.add(Dense(20, input_dim=30, activation='sigmoid'))
    model.add(Dense(10, activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# Wrap the Keras model as a scikit-learn estimator and cross-validate it.
estimator = KerasClassifier(build_fn=my_model, epochs=1000, batch_size=10, shuffle=True, verbose=1)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X_train, Y_train, cv=kfold)
results.mean()  # notebook-style expression; the value is not stored

# Refit on the whole training split and predict the held-out rows.
estimator.fit(X_train, Y_train)
# NOTE(review): predict() presumably returns hard class labels rather than
# scores, so roc_curve only sees a couple of distinct values and the plot has
# very few vertices. Continuous scores, e.g.
# estimator.predict_proba(X_test)[:, 1], would give a smoother curve. Left
# unchanged because this behaviour is what the surrounding question is about.
y_pred = estimator.predict(X_test).ravel()

# roc_curve returns (false positive rate, true positive rate, thresholds).
fpr, tpr, thresholds_keras = roc_curve(y_test, y_pred, pos_label=2)
auc_keras = auc(fpr, tpr)

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
# The label needs a complete '{:.3f}' placeholder; the previous string had an
# unmatched '}' and made str.format raise ValueError.
plt.plot(fpr, tpr, label='Keras (area = {:.3f})'.format(auc_keras))
# Axis labels follow what roc_curve actually returns for each axis.
plt.xlabel('False positive rate (1 - specificity)')
plt.ylabel('True positive rate (sensitivity)')
plt.title('Prostate')
plt.legend(loc='best')
plt.show()

Random Forest on Categorical Data with low accuracy

I am trying to build a model that given an item, predicts which store it belongs to.
I have a data-set of ~300 records which are supposed to be items in different online stores.
Each record is composed of: Category,Sub Category,Price,Store Identifier(The y variable)
The data seems balanced as every store has around ~10 items.
With the help of #Marcus V. I succeeded in encoding the categorical columns correctly, but I cannot produce better results than 0.52 for a RandomForest with 15 estimators and an entropy criterion.
I feel like much more can be done here. What am I missing?
This is the data: https://pastebin.com/z3eZc0vK
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
class Columns(BaseEstimator, TransformerMixin):
    """Transformer that selects columns of its input by label.

    Args:
        names: Column labels passed to ``X.loc[:, names]`` in ``transform``.
    """

    def __init__(self, names=None):
        self.names = names

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; the transformer has no fitted state."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` listed in ``self.names``."""
        return X.loc[:, self.names]
# Load the headerless CSV and name its four columns.
dataset = pd.read_csv('data.csv', header=None)
dataset.columns = ["cat1", "cat2", "num1", "target"]
# dataset.columns = ["cat1", "cat2", "target"]

# Take an explicit copy of the feature columns: the label encoding below
# writes into X, and writing into a slice of `dataset` is a chained
# assignment (SettingWithCopyWarning).
X = dataset.iloc[:, :-1].copy()
y = dataset.iloc[:, 3]

# Integer-encode the two categorical columns in place.
labelencoder_X_0 = LabelEncoder()
X.iloc[:, 0] = labelencoder_X_0.fit_transform(X.iloc[:, 0])
labelencoder_X_1 = LabelEncoder()
X.iloc[:, 1] = labelencoder_X_1.fit_transform(X.iloc[:, 1])

numeric = ["num1"]
categorical = ["cat1", "cat2"]

# Standardise the numeric column and one-hot encode the categorical ones,
# then concatenate both feature groups.
# NOTE(review): newer scikit-learn releases renamed OneHotEncoder's `sparse`
# argument to `sparse_output` -- confirm against the installed version.
pipe = Pipeline([
    ("features", FeatureUnion([
        ('numeric', make_pipeline(Columns(names=numeric), StandardScaler())),
        ('categorical', make_pipeline(Columns(names=categorical), OneHotEncoder(sparse=False))),
    ])),
])
X = pipe.fit_transform(X)

# 80/20 split, then fit and evaluate a 15-tree entropy random forest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
classifier = RandomForestClassifier(n_estimators=15, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = classifier.score(X_test, y_test)
print(accuracy)

Resources