How to plot decision boundaries for Random Forest classifier

How do I go about plotting the decision boundaries for a Random Forest analysis with 10 classes?
I get the error:
ValueError: X has 2 features, but RandomForestClassifier is expecting 240 features as input.
Can you help me get the decision boundaries for the 10 classes if possible? Thanks for your time!
Here is my code:
from sklearn.datasets import make_classification
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
# Generate noisy data
num_trainsamples = 500
num_testsamples = 50
X_train, y_train = make_classification(n_samples=num_trainsamples,
                                        n_features=240,
                                        n_informative=9,
                                        n_redundant=0,
                                        n_repeated=0,
                                        n_classes=10,
                                        n_clusters_per_class=1,
                                        class_sep=9,
                                        flip_y=0.2,
                                        #weights=[0.5,0.5],
                                        random_state=17)
X_test, y_test = make_classification(n_samples=50,
                                      n_features=num_testsamples,
                                      n_informative=9,
                                      n_redundant=0,
                                      n_repeated=0,
                                      n_classes=10,
                                      n_clusters_per_class=1,
                                      class_sep=10,
                                      flip_y=0.2,
                                      #weights=[0.5,0.5],
                                      random_state=17)
model = RandomForestClassifier()
parameter_space = {
    'n_estimators': [10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': np.linspace(10, 50, 11),
}
clf = GridSearchCV(model, parameter_space, cv=5, scoring="accuracy", verbose=True)  # model
my_model = clf.fit(X_train, y_train)
# define bounds of the domain
min1, max1 = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
min2, max2 = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
# define the x and y scale
x1grid = np.arange(min1, max1, 0.1)
x2grid = np.arange(min2, max2, 0.1)
# create all of the lines and rows of the grid
xx, yy = np.meshgrid(x1grid, x2grid)
# flatten each grid to a vector
r1, r2 = xx.flatten(), yy.flatten()
r1, r2 = r1.reshape((len(r1), 1)), r2.reshape((len(r2), 1))
# horizontal stack vectors to create x1,x2 input for the model
grid = np.hstack((r1, r2))
yhat = clf.predict(grid)
# reshape the predictions back into a grid
zz = yhat.reshape(xx.shape)
# plot the grid of x, y and z values as a surface
plt.contourf(xx, yy, zz, cmap='Paired')
# create scatter plot for samples from each class
for class_value in range(2):
    # get row indexes for samples with this class
    row_ix = np.where(y_train == class_value)
    # create scatter of these samples
    plt.scatter(X_train[row_ix, 0], X_train[row_ix, 1], cmap='Paired')
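The predict call fails because the grid has only two columns while the forest was fit on 240 features. A decision surface can only be drawn in two dimensions, so one common workaround (a minimal sketch, not from the original thread) is to project the data to 2-D, for example with PCA, refit the forest on the projection, and evaluate the grid in that space. Note that the plotted boundaries then belong to the 2-D model, not to the original 240-feature model.
# Minimal sketch (assumption: a 2-D PCA projection is acceptable for visualisation)
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_train_2d = pca.fit_transform(X_train)

model_2d = RandomForestClassifier(random_state=17).fit(X_train_2d, y_train)

# build the evaluation grid in the projected space
min1, max1 = X_train_2d[:, 0].min() - 1, X_train_2d[:, 0].max() + 1
min2, max2 = X_train_2d[:, 1].min() - 1, X_train_2d[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(min1, max1, 0.1), np.arange(min2, max2, 0.1))
grid = np.c_[xx.ravel(), yy.ravel()]        # grid now has 2 features, matching the 2-D model

zz = model_2d.predict(grid).reshape(xx.shape)
plt.contourf(xx, yy, zz, cmap='Paired', alpha=0.5)
for class_value in range(10):               # scatter all 10 classes
    row_ix = np.where(y_train == class_value)[0]
    plt.scatter(X_train_2d[row_ix, 0], X_train_2d[row_ix, 1], s=10)
plt.show()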

Related

The number of classes has to be greater than one; got 1 class in SVM

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
x = np.linspace(-5.0, 5.0, 100)
y = np.sqrt(10**2 - x**2)
y=np.hstack([y,-y])
x=np.hstack([x,-x])
x1 = np.linspace(-5.0, 5.0, 100)
y1 = np.sqrt(5**2 - x1**2)
y1=np.hstack([y1,-y1])
x1=np.hstack([x1,-x1])
plt.scatter(y,x)
plt.scatter(y1,x1)
# print(plt.show())
import pandas as pd
df1 =pd.DataFrame(np.vstack([y,x]).T,columns=['X1','X2'])
df1['Y']=0
df2 =pd.DataFrame(np.vstack([y1,x1]).T,columns=['X1','X2'])
df2['Y']=1
df1.merge(df2)
# We need to find components for the Polynomical Kernel
#X1,X2,X1_square,X2_square,X1*X2
df1['X1_Square']= df1['X1']**2
df1['X2_Square']= df1['X2']**2
df1['X1*X2'] = (df1['X1'] *df1['X2'])
# print(df1.head())
### Independent and Dependent features
X = df1[['X1','X2','X1_Square','X2_Square','X1*X2']]
y = df1['Y']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=0)
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
classifier = SVC(kernel="linear")
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
accuracy_score(y_test, y_pred)
ValueError: The number of classes has to be greater than one; got 1 class
I don't know how to resolve this error. Maybe there is an error in the merge of the two data frames, or maybe I need to append df1 and df2 instead, but I tried that and it doesn't work.
The error occurs because y ends up containing only the value 0: df1.merge(df2) does not combine the two frames the way you expect, so by the line y = df1['Y'] the frame still holds only class 0.
You can replace the df1.merge(df2) line with:
df1 = pd.concat([df1,df2])
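For completeness, a minimal sketch of the corrected flow (reusing x, y, x1, y1 from the question); the key point is that both classes must be present in the frame before y is extracted:
# Sketch: stack the two classes instead of merging them
df1 = pd.DataFrame(np.vstack([y, x]).T, columns=['X1', 'X2'])
df1['Y'] = 0
df2 = pd.DataFrame(np.vstack([y1, x1]).T, columns=['X1', 'X2'])
df2['Y'] = 1
df1 = pd.concat([df1, df2], ignore_index=True)
print(df1['Y'].value_counts())   # both class 0 and class 1 are now present
# the polynomial-feature construction and the SVC fit from the question
# then run unchanged on this combined frame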

How to forecast actual future values using XGBoost?

I have a solar irradiation dataset with around 61,000+ rows and 2 columns, and I have built a model with XGBoost to predict future values.
I have split the data into two parts, train and test, and trained the model accordingly. I have also made predictions on the test data set, and everything works fine. But now I want to produce an actual forecast. How can I do that?
import os
import pandas as pd
import numpy as np
import xgboost
import matplotlib.pyplot as plt
from xgboost import plot_importance, XGBRegressor  # XGBRegressor import was missing
from sklearn import metrics
# Dataset
df=pd.read_csv('Readings_last_7yr.csv')
df.index = pd.to_datetime(df['Date'], format='%Y-%m-%d %H:%M:%S')
## Copied the dataset
df2 = df.copy()
del df2['Date']
## Test Train Split
from pandas import read_csv
from matplotlib import pyplot
# series = read_csv('sunspots.csv', header=0, index_col=0)
X = df2
train_size = int(len(X) * 0.75)
train, test = X[0:train_size], X[train_size:len(X)]
print('Observations: %d' % (len(X)))
print('Training Observations: %d' % (len(train)))
print('Testing Observations: %d' % (len(test)))
pyplot.plot(train)
pyplot.plot(test)
pyplot.show()
## Creating Features
def create_features(df, target_variable):
    df['date'] = df.index
    df['hour'] = df['date'].dt.hour
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.weekofyear
    X = df[['hour', 'dayofweek', 'quarter', 'month', 'year',
            'dayofyear', 'dayofmonth', 'weekofyear']]
    if target_variable:
        y = df[target_variable]
        return X, y
    return X
## METRICS
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def timeseries_evaluation_metrics_func(y_true, y_pred):
    print(f'MSE is : {metrics.mean_squared_error(y_true, y_pred)}')
    print(f'MAE is : {metrics.mean_absolute_error(y_true, y_pred)}')
    print(f'RMSE is : {np.sqrt(metrics.mean_squared_error(y_true, y_pred))}')
    print(f'MAPE is : {mean_absolute_percentage_error(y_true, y_pred)}')
    print(f'R2 is : {metrics.r2_score(y_true, y_pred)}', end='\n\n')
train_copy = train.copy()
test_copy = test.copy()
trainX, trainY = create_features(train_copy, target_variable='Irr')
testX, testY = create_features(test_copy, target_variable='Irr')
xgb = XGBRegressor(objective= 'reg:linear', n_estimators=1000)
xgb
xgb.fit(trainX, trainY,
        eval_set=[(trainX, trainY), (testX, testY)],
        early_stopping_rounds=50,
        verbose=False)
# Predictions
predicted_results = xgb.predict(testX)
# Metrics
timeseries_evaluation_metrics_func(testY, predicted_results)
# Plotting graph for test and Predicted
plt.figure(figsize=(13,8))
plt.plot(list(testY))
plt.plot(list(predicted_results))
plt.title("Actual vs Predicted")
plt.ylabel("Irr")
plt.legend(('Actual','predicted'))
plt.show()
# Making graph of predicted on the whole dataframe
test['Prediction'] = predicted_results
Irr_all = pd.concat([test, train], sort=False)
Irr_all = Irr_all.rename(columns={'Irradiation':'Original_Value'})
Overview_Complete_Data_And_Prediction = Irr_all[['Irr','Prediction']].plot(figsize=(12, 5))
I am getting the results shown in the plot produced above.
So now I want to predict future values from the year 2021 through 2023.
FUTURE PREDICTION
dti = pd.date_range("2021-01-01 00:30:00", periods=20000, freq="H")
df_future_dates = pd.DataFrame(dti, columns = ['Date'])
df_future_dates['Irr'] = np.nan
df_future_dates.index = pd.to_datetime(df_future_dates['Date'], format='%Y-%m-%d %H:%M:%S')
df_future_dates_copy = df_future_dates.copy()
testX_future, testY_future = create_features(df_future_dates, target_variable='Irr')
xgb = XGBRegressor(objective= 'reg:linear', n_estimators=1000)
xgb
## Now here I have used train and test from above
xgb.fit(trainX, trainY,
        eval_set=[(trainX, trainY), (testX, testY)],
        early_stopping_rounds=50,
        verbose=False)
predicted_results_future = xgb.predict(testX_future)
# Graph
plt.figure(figsize=(13,8))
plt.plot(list(predicted_results_future))
plt.title("Predicted")
plt.ylabel("Irr")
plt.legend(['predicted'])
plt.show()
df_future_dates_copy['Prediction'] = predicted_results_future
Irr_all_future = pd.concat([df2, df_future_dates_copy], sort=False)
# Future Graph
Overview_Complete_Data_And_Prediction_future = Irr_all_future[['Irr','Prediction']].plot(figsize=(15, 5))
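A note on the forecasting step (not from the original post): because the features are purely calendar-based, the future frame can be scored directly; it is also common to refit on the full history first rather than on the train split only. A minimal sketch, reusing create_features, df2 and df_future_dates from above:
# Sketch (assumption: refit on all available history before forecasting;
# 'reg:squarederror' replaces the deprecated 'reg:linear' alias)
X_all, y_all = create_features(df2.copy(), target_variable='Irr')

xgb_full = XGBRegressor(objective='reg:squarederror', n_estimators=1000)
xgb_full.fit(X_all, y_all)

# the future frame has no target, so only the feature matrix is returned
X_future = create_features(df_future_dates.copy(), target_variable=None)
future_forecast = xgb_full.predict(X_future)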

How to plot Confusion Matrix

I would appreciate some assistance with plotting a confusion matrix for my model. The code is displayed below:
import os
import glob
from sklearn.model_selection import train_test_split
import shutil
from tensorflow.keras import callbacks
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from my_utils import create_generators
from CNN_models import amazon_model
import tensorflow as tf
import matplotlib.pyplot as plt
if __name__ == "__main__":
    path_to_train = "data\\train"
    path_to_val = "data\\val"
    path_to_test = "data\\test"
    batch_size = 128
    epochs = 5
    lr = 0.0001

    train_generator, val_generator, test_generator = create_generators(batch_size, path_to_train, path_to_val, path_to_test)
    nbr_classes = train_generator.num_classes

    TRAIN = True
    TEST = False

    if TRAIN:
        path_to_save_model = './Models'
        ckpt_saver = ModelCheckpoint(
            path_to_save_model,
            monitor="val_accuracy",
            mode='max',
            save_best_only=True,
            save_freq='epoch',
            verbose=1
        )
        early_stop = EarlyStopping(monitor="val_accuracy", patience=5)
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs")
        csv_logger = tf.keras.callbacks.CSVLogger('first_model_training.log', separator=",", append=False)

        model = amazon_model(nbr_classes)
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr, amsgrad=True)
        model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
        history = model.fit(train_generator,
                            epochs=epochs,
                            batch_size=batch_size,
                            validation_data=val_generator,
                            callbacks=[ckpt_saver, early_stop, tensorboard_callback, csv_logger]
                            )
        acc = history.history['accuracy']
        print(acc)

        model.save("first_model.h5")

        from matplotlib.pyplot import figure
        figure(figsize=(8, 6))
        plt.plot(history.history['accuracy'])
        plt.plot(history.history['val_accuracy'])
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train', 'val'], loc='upper left')
        plt.savefig('./plots/accuracy', dpi=200)
        plt.show()

        figure(figsize=(8, 6))
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.savefig('./plots/loss', dpi=200)
        plt.show()

    if TEST:
        model = tf.keras.models.load_model('./Models')
        model.summary()

        print("Evaluating validation set: ")
        model.evaluate(val_generator)

        print("Evaluating test set: ")
        model.evaluate(test_generator)
Sorry if this is a bit of a newbie question, but I would love to know what I need to add to the above code to make it plot a confusion matrix for my model after it runs.
I'm able to plot the graphs of both accuracy and loss for a few epochs, but I want to include a confusion matrix before running for more epochs. (Accuracy and loss plots omitted.)
Try this
The basic concept is to get prediction results from your model using X_test and then to compare these predictions to the real y_test results.
# 'Fake' and 'Real' are your dependent features for your classification use case,
# where Fake == 0 and Real == 1. It is important that you use this form for the matrix.
('Fake', 'Real') == (0, 1)
# Data handling
import pandas as pd
# Exploratory Data Analysis & Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
# Model improvement and Evaluation
from sklearn import metrics
from sklearn.metrics import confusion_matrix
# Plotting confusion matrix
matrix = pd.DataFrame(metrics.confusion_matrix(y_test, y_prediction),
                      index=('Fake', 'Real'),
                      columns=('Fake', 'Real'))
print(matrix)
# Visualising confusion matrix
plt.figure(figsize = (16,14),facecolor='white')
heatmap = sns.heatmap(matrix, annot = True, annot_kws = {'size': 20}, fmt = 'd', cmap = 'YlGnBu')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation = 0, ha = 'right', fontsize = 18, weight='bold')
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation = 0, ha = 'right', fontsize = 18, weight='bold')
plt.title('Confusion Matrix\n', fontsize = 18, color = 'darkblue')
plt.ylabel('True label', fontsize = 14)
plt.xlabel('Predicted label', fontsize = 14)
plt.show()
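Since the question uses Keras generators rather than X_test/y_test arrays, here is a minimal sketch of how the same idea could be applied to test_generator (assumptions: the generator is a Keras directory iterator created with shuffle=False and exposes .classes and .class_indices, as ImageDataGenerator iterators do):
# Sketch for a generator-based workflow (assumes shuffle=False on test_generator)
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

probs = model.predict(test_generator)          # class probabilities, one row per image
y_pred = np.argmax(probs, axis=1)              # predicted class indices
y_true = test_generator.classes                # true class indices, in generator order

cm = confusion_matrix(y_true, y_pred)
labels = list(test_generator.class_indices.keys())
ConfusionMatrixDisplay(cm, display_labels=labels).plot(cmap='YlGnBu')
plt.show()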

How to extract coefficients from fitted pipeline for penalized logistic regression?

I have a set of training data that consists of X, which is a set of n columns of data (features), and Y, which is one column of target variable.
I am trying to train my model with logistic regression using the following pipeline:
pipeline = sklearn.pipeline.Pipeline([
    ('logistic_regression', LogisticRegression(penalty='none', C=10))
])
My goal is to obtain the values of each of the n coefficients corresponding to the features, under the assumption of a linear model (y = coeff_0 + coeff_1*x1 + ... + coeff_n*xn).
What I tried was to train this pipeline on my data with model = pipeline.fit(X, Y). So I think that I now have the model that contains the coefficients I want. However, I don't know how to access them. I'm looking for something like model.best_params_('logistic_regression').
Does anyone know how to extract the fitted coefficients from a model like this?
Have a look at the scikit-learn documentation for Pipeline, this example is inspired by it:
from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline
# generate some data to play with
X, y = make_classification(n_informative=5, n_redundant=0, random_state=42)
# ANOVA SVM-C
anova_filter = SelectKBest(f_regression, k=5)
clf = svm.SVC(kernel='linear')
anova_svm = Pipeline([('anova', anova_filter), ('svc', clf)])
anova_svm.set_params(anova__k=10, svc__C=.1).fit(X, y)
# access coefficients
print(anova_svm['svc'].coef_)
.coef_ does the job; .best_params_ is usually associated with GridSearch, i.e. hyperparameter optimization.
In your specific case try: model['logistic_regression'].coef_.
Example to get the coefs from a pipeline.
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
X, y = load_iris(return_X_y=True)
pipeline = Pipeline([('lr', LogisticRegression(penalty='l2', C=10))])
pipeline.fit(X, y)
pipeline['lr'].coef_
array([[-0.42923513, 2.08235619, -4.28084811, -1.97174699],
[ 1.06321671, -0.08077595, -0.46911772, -2.3221883 ],
[-0.63398158, -2.00158024, 4.74996583, 4.29393529]])
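As a small follow-up (a sketch, assuming the fitted pipeline from the example above), the coefficients can be paired with the iris feature names to make them easier to read:
# Sketch: label each coefficient column with its feature name
import pandas as pd
from sklearn.datasets import load_iris

feature_names = load_iris().feature_names
coef_table = pd.DataFrame(pipeline['lr'].coef_, columns=feature_names)
print(coef_table)   # one row per class, one column per feature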
Here is how to visualize the coefficients and measure model accuracy. I used baby weight, height, and gestation period to predict preterm birth.
# Assumes a DataFrame `df` with columns 'bwt_lbs', 'height_ft', 'gestation_wks' and 'PreTerm'
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('lr', LogisticRegression(penalty='l2', C=10))])
scaler = StandardScaler()
#X = np.array(df['gestation_wks']).reshape(-1, 1)
X = scaler.fit_transform(df[['bwt_lbs', 'height_ft', 'gestation_wks']])
y = np.array(df['PreTerm'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
pipeline.fit(X_train, y_train)
y_pred_prob = pipeline.predict_proba(X_test)
predictions = pipeline.predict(X_test)
print(predictions)
sns.countplot(x=predictions, orient='h')
plt.show()
#print(predictions[:,0])
print(pipeline['lr'].coef_)
print(pipeline['lr'].intercept_)
print('Coefficients close to zero will contribute little to the end result')
num_err = np.sum(y != pipeline.predict(X))
print("Number of errors:", num_err)

def my_loss(y, w):
    s = 0
    for i in range(y.size):
        # Get the true and predicted target values for example 'i'
        y_i_true = y[i]
        y_i_pred = w[i]
        s = s + (y_i_true - y_i_pred)**2
    return s

print("Loss:", my_loss(y_test, predictions))

fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob[:, 1])
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

accuracy = round(pipeline['lr'].score(X_train, y_train) * 100, 2)
print("Model Accuracy={accuracy}".format(accuracy=accuracy))

cm = confusion_matrix(y_test, predictions)
print(cm)

How to encode categorical data for use with Semi-supervised algorithm LabelPropagation

I am attempting to use the anneal.arff dataset with Python scikit-learn's semi-supervised algorithm LabelPropagation. The anneal dataset is categorical data, so I preprocessed it so that the output class for each instance
looks like [0. 0. 1. 0. 0.]. This is a numeric list that encodes the output class
as 5 possible values, with 0's everywhere and 1. in the position of the corresponding class. This is what I would expect.
For semi-supervised learning, most of the training data must be unlabeled, so
I modified the training set so that the unlabeled data has output [-1, -1, -1, -1, -1]. I previously tried just using -1, but the code emits the same error as shown below.
I train the classifier as follows, Y_train includes labeled and "unlabeled" data:
lp_model = LabelSpreading(gamma=0.25, max_iter=5)
lp_model.fit(X, Y_train)
I receive the error shown below after calling the fit method:
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\semi_supervised\label_propagation.py", line 221, in fit
X, y = check_X_y(X, y)
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 526, in check_X_y
y = column_or_1d(y, warn=True)
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 562, in column_or_1d
raise ValueError("bad input shape {0}".format(shape))
ValueError: bad input shape (538, 5)
This suggests that something is wrong with the shape of my Y_train list,
but this is the shape I intended. What am I doing wrong?
Can LabelPropagation take training data in this form, or does it only
accept unlabeled data as a scalar -1?
--- edit ---
Here is the code that generates the error. I'm sorry about the confusion over algorithms--I want to use both LabelSpreading and LabelPropagation, and choosing one or the other doesn't fix this error.
from scipy.io import arff
import pandas as pd
import numpy as np
import math
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from copy import deepcopy
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
f = "../../Documents/UCI/anneal.arff"
dataAsRecArray, meta = arff.loadarff(f)
dataset_raw = pd.DataFrame.from_records(dataAsRecArray)
dataset = pd.get_dummies(dataset_raw)
class_names = [col for col in dataset.columns if 'class_' in col]
print (dataset.shape)
number_of_output_columns = len(class_names)
print (number_of_output_columns)
def run(name, model, dataset, percent):
    # Split-out validation dataset
    array = dataset.values
    X = array[:, 0:-number_of_output_columns]
    Y = array[:, -number_of_output_columns:]
    validation_size = 0.40
    seed = 7
    X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
    num_samples = len(Y_train)
    num_labeled_points = math.floor(percent*num_samples)
    indices = np.arange(num_samples)
    unlabeled_set = indices[num_labeled_points:]
    Y_train[unlabeled_set] = [-1, -1, -1, -1, -1]
    lp_model = LabelSpreading(gamma=0.25, max_iter=5)
    lp_model.fit(X_train, Y_train)
    """
    predicted_labels = lp_model.transduction_[unlabeled_set]
    print(predicted_labels[:10])
    """
if __name__ == "__main__":
    #percentages = [0.1, 0.2, 0.3, 0.4]
    percentages = [0.1]
    models = []
    models.append(('LS', LabelSpreading()))
    #models.append(('CART', DecisionTreeClassifier()))
    #models.append(('NB', GaussianNB()))
    #models.append(('SVM', SVC()))
    # evaluate each model in turn
    results = []
    names = []
    for name, model in models:
        for percent in percentages:
            run(name, model, dataset, percent)
    print("bye")
Your Y_train has shape (538, 5) but should be 1d. LabelPropagation doesn't support multi-label or multi-output multi-class right now.
The error message could be more informative, though :-/
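A minimal sketch of how the labels could be reshaped into the 1-D form LabelSpreading/LabelPropagation expect (an illustration, not from the original answer): collapse the one-hot columns to a single integer class per row, then mark unlabeled rows with a scalar -1 inside run(), in place of the Y_train[unlabeled_set] = [-1, -1, -1, -1, -1] assignment.
# Sketch: replace the multi-output labels with 1-D integer labels
y_train_1d = np.argmax(Y_train, axis=1)     # collapse each one-hot row to its class index
y_train_1d[unlabeled_set] = -1              # a scalar -1 marks unlabeled samples

lp_model = LabelSpreading(gamma=0.25, max_iter=5)
lp_model.fit(X_train, y_train_1d)           # y is now 1-D, as the estimator expects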
