Anomaly Detection by DBSCAN - machine-learning

I am using DBSCAN on my training dataset in order to find outliers and remove them before training my model. The training set has 7697 rows and 8 columns. Here is my code:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Scale the selected features, then cluster with DBSCAN
X = StandardScaler().fit_transform(X_train[all_features])
model = DBSCAN(eps=0.3, min_samples=10).fit(X)
print(model)
# Rows labelled -1 are noise points; drop them from the training set
X_train_1 = X_train.drop(X_train[model.labels_ == -1].index).copy()
X_train_1.reset_index(drop=True, inplace=True)
Q-1: Out of these 7 features, some are discrete and some are continuous. Is it OK to scale both the discrete and the continuous features, or only the continuous ones? (See the sketch after Q-2.)
Q-2: Do I need to map the clusters learned from the training data onto the test data? (One possible approach is sketched after the answer's example below.)
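Regarding Q-1, here is a minimal sketch of one option, assuming hypothetical continuous_features and discrete_features lists that together make up all_features: scale only the continuous columns with a ColumnTransformer and pass the discrete ones through unchanged.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# continuous_features / discrete_features are hypothetical subsets of all_features
preprocess = ColumnTransformer(
    transformers=[("scale_continuous", StandardScaler(), continuous_features)],
    remainder="passthrough",  # leave the discrete columns untouched
)
X_scaled = preprocess.fit_transform(X_train[all_features])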

DBSCAN will handle those outliers for you. That's what it was built for. See the example below and post back if you have additional questions.
import seaborn as sns
import pandas as pd
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN
from matplotlib import cm

# Load the Titanic data and drop rows with missing values
titanic = sns.load_dataset('titanic')
titanic = titanic.copy()
titanic = titanic.dropna()
titanic['age'].plot.hist(
    bins=50,
    title="Histogram of the age variable"
)

# Classic z-score outlier flagging, for comparison with DBSCAN
titanic["age_zscore"] = zscore(titanic["age"])
titanic["is_outlier"] = titanic["age_zscore"].apply(
    lambda x: x <= -2.5 or x >= 2.5
)
titanic[titanic["is_outlier"]]

# Scale the two features of interest to [0, 1]
ageAndFare = titanic[["age", "fare"]]
ageAndFare.plot.scatter(x="age", y="fare")
scaler = MinMaxScaler()
ageAndFare = scaler.fit_transform(ageAndFare)
ageAndFare = pd.DataFrame(ageAndFare, columns=["age", "fare"])
ageAndFare.plot.scatter(x="age", y="fare")

# DBSCAN labels noise points as -1, which serves as the outlier flag
outlier_detection = DBSCAN(
    eps=0.5,
    metric="euclidean",
    min_samples=3,
    n_jobs=-1)
clusters = outlier_detection.fit_predict(ageAndFare)
clusters

# Colour the scatter plot by cluster label
cmap = cm.get_cmap('Accent')
ageAndFare.plot.scatter(
    x="age",
    y="fare",
    c=clusters,
    cmap=cmap,
    colorbar=False
)
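Regarding Q-2 from the question: scikit-learn's DBSCAN has no predict method, so the fitted clustering cannot be applied to the test set directly. A minimal sketch of one common workaround, assuming X and model from the question's snippet and a hypothetical X_test_scaled (the test features scaled with the same scaler): assign each test point the label of its nearest training core sample, and treat points farther than eps away as noise.
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Index the DBSCAN core samples found on the training data
core_mask = np.zeros(len(model.labels_), dtype=bool)
core_mask[model.core_sample_indices_] = True
nn = NearestNeighbors(n_neighbors=1).fit(X[core_mask])

# X_test_scaled is assumed to be the test features, scaled with the same scaler
dist, idx = nn.kneighbors(X_test_scaled)
test_labels = model.labels_[core_mask][idx.ravel()]
test_labels[dist.ravel() > 0.3] = -1  # farther than eps from any core sample -> noise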

Related

How to plot decision boundaries for Random Forest classifier

How to go about plotting the decision boundaries for a Random Forest analysis with 10 classes?
I get the error:
ValueError: X has 2 features, but RandomForestClassifier is expecting
240 features as input.
Can you help me get the decision boundaries for the 10 classes if possible? Thanks for your time!
Here is my code:
from sklearn.datasets import make_classification
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
# Generate noisy data
num_trainsamples = 500
num_testsamples = 50
X_train, y_train = make_classification(n_samples=num_trainsamples,
                                        n_features=240,
                                        n_informative=9,
                                        n_redundant=0,
                                        n_repeated=0,
                                        n_classes=10,
                                        n_clusters_per_class=1,
                                        class_sep=9,
                                        flip_y=0.2,
                                        #weights=[0.5,0.5],
                                        random_state=17)
X_test, y_test = make_classification(n_samples=50,
                                      n_features=num_testsamples,
                                      n_informative=9,
                                      n_redundant=0,
                                      n_repeated=0,
                                      n_classes=10,
                                      n_clusters_per_class=1,
                                      class_sep=10,
                                      flip_y=0.2,
                                      #weights=[0.5,0.5],
                                      random_state=17)
model = RandomForestClassifier()
parameter_space = {
    'n_estimators': [10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': np.linspace(10, 50, 11),
}
clf = GridSearchCV(model, parameter_space, cv=5, scoring="accuracy", verbose=True)  # model
my_model = clf.fit(X_train, y_train)
# define bounds of the domain
min1, max1 = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
min2, max2 = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
# define the x and y scale
x1grid = np.arange(min1, max1, 0.1)
x2grid = np.arange(min2, max2, 0.1)
# create all of the lines and rows of the grid
xx, yy = np.meshgrid(x1grid, x2grid)
# flatten each grid to a vector
r1, r2 = xx.flatten(), yy.flatten()
r1, r2 = r1.reshape((len(r1), 1)), r2.reshape((len(r2), 1))
# horizontal stack vectors to create x1,x2 input for the model
grid = np.hstack((r1, r2))
yhat = clf.predict(grid)
# reshape the predictions back into a grid
zz = yhat.reshape(xx.shape)
# plot the grid of x, y and z values as a surface
plt.contourf(xx, yy, zz, cmap='Paired')
# create scatter plot for samples from each class
for class_value in range(2):
    # get row indexes for samples with this class
    row_ix = np.where(y_train == class_value)
    # create scatter of these samples
    plt.scatter(X_train[row_ix, 0], X_train[row_ix, 1], cmap='Paired')
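A decision surface can only be drawn over the same feature space the model was trained on, so a 2-D meshgrid cannot be scored by a forest trained on 240 features. One common workaround is to project the data to two dimensions first (for example with PCA) and fit a classifier on that projection; the sketch below, reusing X_train, y_train and the imports from the snippet above, visualises boundaries in the projected space, not in the original 240-D space.
from sklearn.decomposition import PCA

# Project to 2-D, fit a forest on the projection, and score a 2-D grid
pca = PCA(n_components=2).fit(X_train)
X2d = pca.transform(X_train)
rf2d = RandomForestClassifier(random_state=17).fit(X2d, y_train)

xx, yy = np.meshgrid(np.arange(X2d[:, 0].min() - 1, X2d[:, 0].max() + 1, 0.1),
                     np.arange(X2d[:, 1].min() - 1, X2d[:, 1].max() + 1, 0.1))
zz = rf2d.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contourf(xx, yy, zz, cmap='Paired', alpha=0.5)
plt.scatter(X2d[:, 0], X2d[:, 1], c=y_train, cmap='Paired', s=10)
plt.show()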

How to forecast actual future values using XGBoost?

So I have a solar irradiation dataset with around 61,000+ rows and 2 columns, and I have built a model using XGBoost to predict future values.
I have split the data into two parts, train and test, and trained the model accordingly. I have also made predictions on the test set, and everything works fine. But now I want to produce the actual forecast. How can I do that?
import os
import pandas as pd
import numpy as np
import xgboost
import matplotlib.pyplot as plt
from xgboost import plot_importance, XGBRegressor
from sklearn import metrics
# Dataset
df = pd.read_csv('Readings_last_7yr.csv')
df.index = pd.to_datetime(df['Date'], format='%Y-%m-%d %H:%M:%S')
## Copy the dataset and drop the raw Date column (it is kept as the index)
df2 = df.copy()
del df2['Date']
## Test Train Split
from pandas import read_csv
from matplotlib import pyplot
# series = read_csv('sunspots.csv', header=0, index_col=0)
X = df2
train_size = int(len(X) * 0.75)
train, test = X[0:train_size], X[train_size:len(X)]
print('Observations: %d' % (len(X)))
print('Training Observations: %d' % (len(train)))
print('Testing Observations: %d' % (len(test)))
pyplot.plot(train)
pyplot.plot(test)
pyplot.show()
## Creating Features
def create_features(df, target_variable):
    df['date'] = df.index
    df['hour'] = df['date'].dt.hour
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.weekofyear
    X = df[['hour','dayofweek','quarter','month','year',
            'dayofyear','dayofmonth','weekofyear']]
    if target_variable:
        y = df[target_variable]
        return X, y
    return X
## METRICS
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def timeseries_evaluation_metrics_func(y_true, y_pred):
    print(f'MSE is : {metrics.mean_squared_error(y_true, y_pred)}')
    print(f'MAE is : {metrics.mean_absolute_error(y_true, y_pred)}')
    print(f'RMSE is : {np.sqrt(metrics.mean_squared_error(y_true, y_pred))}')
    print(f'MAPE is : {mean_absolute_percentage_error(y_true, y_pred)}')
    print(f'R2 is : {metrics.r2_score(y_true, y_pred)}', end='\n\n')
train_copy = train.copy()
test_copy = test.copy()
trainX, trainY = create_features(train_copy, target_variable='Irr')
testX, testY = create_features(test_copy, target_variable='Irr')
xgb = XGBRegressor(objective= 'reg:linear', n_estimators=1000)
xgb
xgb.fit(trainX, trainY,
        eval_set=[(trainX, trainY), (testX, testY)],
        early_stopping_rounds=50,
        verbose=False)
# Predictions
predicted_results = xgb.predict(testX)
# Metrics
timeseries_evaluation_metrics_func(testY, predicted_results)
# Plotting graph for test and Predicted
plt.figure(figsize=(13,8))
plt.plot(list(testY))
plt.plot(list(predicted_results))
plt.title("Actual vs Predicted")
plt.ylabel("Irr")
plt.legend(('Actual','predicted'))
plt.show()
# Making graph of predicted on the whole dataframe
test['Prediction'] = predicted_results
Irr_all = pd.concat([test, train], sort=False)
Irr_all = Irr_all.rename(columns={'Irradiation':'Original_Value'})
Overview_Complete_Data_And_Prediction = Irr_all[['Irr','Prediction']].plot(figsize=(12, 5))
I am getting the results shown in the plots above. So now I want to predict future values from the year 2021 until 2023.
FUTURE PREDICTION
dti = pd.date_range("2021-01-01 00:30:00", periods=20000, freq="H")
df_future_dates = pd.DataFrame(dti, columns = ['Date'])
df_future_dates['Irr'] = np.nan
df_future_dates.index = pd.to_datetime(df_future_dates['Date'], format='%Y-%m-%d %H:%M:%S')
df_future_dates_copy = df_future_dates.copy()
testX_future, testY_future = create_features(df_future_dates, target_variable='Irr')
xgb = XGBRegressor(objective= 'reg:linear', n_estimators=1000)
xgb
## Now here I have used train and test from above
xgb.fit(trainX, trainY,
        eval_set=[(trainX, trainY), (testX, testY)],
        early_stopping_rounds=50,
        verbose=False)
predicted_results_future = xgb.predict(testX_future)
# Graph
plt.figure(figsize=(13,8))
plt.plot(list(predicted_results_future))
plt.title("Predicted")
plt.ylabel("Irr")
plt.legend(('predicted'))
plt.show()
df_future_dates_copy['Prediction'] = predicted_results_future
Irr_all_future = pd.concat([df2, df_future_dates_copy], sort=False)
# Future Graph
Overview_Complete_Data_And_Prediction_future = Irr_all_future[['Irr','Prediction']].plot(figsize=(15, 5))
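Since the model is trained only on calendar features (hour, day of week, month, and so on), it can score any future timestamp directly, and there is no need to refit it for the future frame. A minimal sketch under that assumption, reusing the xgb model already fitted on trainX/trainY and the create_features helper above:
# Build calendar features for the future horizon and predict with the already-fitted model
future_index = pd.date_range("2021-01-01 00:30:00", "2023-12-31 23:30:00", freq="H")
df_future = pd.DataFrame(index=future_index)
df_future['Irr'] = np.nan
futureX, _ = create_features(df_future, target_variable='Irr')

df_future['Prediction'] = xgb.predict(futureX)
df_future['Prediction'].plot(figsize=(15, 5), title="Forecast 2021-2023")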

Why does Mean-Shift Clustering return 1 for anomaly detection in images? Which parameters are important for this algorithm?

I want to implement an anomaly detection algorithm for images and used K-Means clustering, but the number of clusters is not known. I tried the elbow method, but it is very difficult for me to analyze, and I also used the Mean Shift algorithm, but it returns 1 for the number of clusters. What kind of algorithm should I use then? Or is it a good idea to use the Mean Shift algorithm for 5000 images?
Here is the code:
import random, cv2, os, sys, shutil
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
import keras
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets import make_blobs
class img_clustering:
    def __init__(self, folder_path="data", max_examples=None):
        paths = os.listdir(folder_path)
        if max_examples == None:
            self.max_examples = len(paths)
        else:
            if max_examples > len(paths):
                self.max_examples = len(paths)
            else:
                self.max_examples = max_examples
        self.n_clusters = n_clusters
        self.folder_path = folder_path
        random.shuffle(paths)
        self.image_paths = paths[:self.max_examples]
        self.use_imagenets = use_imagenets

    def load_images(self):
        self.images = []
        for image in self.image_paths:
            img = cv2.imread(self.folder_path + "/" + image)
            img = cv2.resize(img, (224, 224))
            self.images.append(img)
        self.images = np.float32(self.images)
        self.images /= 255

    def get_new_imagevectors(self):
        model1 = keras.applications.MobileNetV2(input_shape=(224, 224, 3), alpha=1.0,
                                                include_top=False, weights='imagenet',
                                                pooling=None)
        model1.summary()
        pred = model1.predict(self.images)
        images_temp = pred.reshape(self.images.shape[0], -1)
        model2 = PCA(n_components=None, random_state=728)
        model2.fit(images_temp)
        self.images_new = model2

    def clustering(self):
        X, _ = make_blobs(n_samples=10000, centers=self.images_new, cluster_std=0.6)
        bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
        model = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=True,
                          n_jobs=None, seeds=None)
        model.fit(X)
        labels = model.labels_
        cluster_centers = model.cluster_centers_
        labels_unique = np.unique(labels)
        n_clusters_ = len(labels_unique)
        predictions = model.predict(self.images_new)
        print("\n")
        print("The number of clusters:", n_clusters_)
        print("The number of labels_unique:", labels_unique)
        print("\n")

if __name__ == '__main__':
    data_path = "Path"  # path of the folder that contains the images
    max_examples = 5000
    strt = img_clustering(data_path, max_examples)
    strt.load_images()
    strt.get_new_imagevectors()
    strt.clustering()
The elbow method is supposed to show the optimal k, but I can't tell what the optimal k is from this plot.
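One alternative to eyeballing an elbow plot is the silhouette score, which gives a single number for each candidate k. A minimal sketch, assuming a hypothetical features array holding the PCA-reduced image vectors (for example the output of model2.transform(images_temp) above):
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Evaluate a range of candidate cluster counts and keep the best-scoring one
scores = {}
for k in range(2, 15):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(features)
    scores[k] = silhouette_score(features, labels)

best_k = max(scores, key=scores.get)
print("Best k by silhouette score:", best_k, scores[best_k])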

Scikit-Learn's Logistic Regression severely overfits digit classification training data

I'm using Scikit-Learn's Logistic Regression algorithm to perform digit classification. The dataset I'm using is Scikit-Learn's load_digits.
Below is a simplified version of my code:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import learning_curve
from sklearn.datasets import load_digits
digits = load_digits()
model = LogisticRegression(solver='lbfgs',
                           penalty='none',
                           max_iter=1e5,
                           multi_class='auto')
model.fit(digits.data, digits.target)
predictions = model.predict(digits.data)
df_cm = pd.DataFrame(confusion_matrix(digits.target, predictions))
ax = sns.heatmap(df_cm, annot = True, cbar = False, cmap = 'Blues_r', fmt='d', annot_kws = {"size": 10})
ax.set_ylim(0,10)
plt.title("Confusion Matrix")
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
train_size = [0.2, 0.4, 0.6, 0.8, 1]
training_size, training_score, validation_score = learning_curve(
    model, digits.data, digits.target, cv=5,
    train_sizes=train_size, scoring='neg_mean_squared_error')
training_scores_mean = - training_score.mean(axis = 1)
validation_score_mean = - validation_score.mean(axis = 1)
plt.plot(training_size, validation_score_mean)
plt.plot(training_size, training_scores_mean)
plt.legend(["Validation error", "Training error"])
plt.ylabel("MSE")
plt.xlabel("Training set size")
plt.show()
### EDIT ###
# With L2 regularization
model = LogisticRegression(solver='lbfgs',
                           penalty='l2',  # Changing penalty to l2
                           max_iter=1e5,
                           multi_class='auto')
model.fit(digits.data, digits.target)
predictions = model.predict(digits.data)
df_cm = pd.DataFrame(confusion_matrix(digits.target, predictions))
ax = sns.heatmap(df_cm, annot = True, cbar = False, cmap = 'Blues_r', fmt='d', annot_kws = {"size": 10})
ax.set_ylim(0,10)
plt.title("Confusion Matrix with L2 regularization")
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
training_size, training_score, validation_score = learning_curve(
    model, digits.data, digits.target, cv=5,
    train_sizes=train_size, scoring='neg_mean_squared_error')
training_scores_mean = - training_score.mean(axis = 1)
validation_score_mean = - validation_score.mean(axis = 1)
plt.plot(training_size, validation_score_mean)
plt.plot(training_size, training_scores_mean)
plt.legend(["Validation error", "Training error"])
plt.title("Learning curve with L2 regularization")
plt.ylabel("MSE")
plt.xlabel("Training set size")
plt.show()
# With L2 regularization and best C
from sklearn.model_selection import GridSearchCV
C = {'C': [1e-3, 1e-2, 1e-1, 1, 10]}
model_l2 = GridSearchCV(LogisticRegression(random_state=0, solver='lbfgs', penalty='l2', max_iter=1e5, multi_class='auto'),
                        param_grid=C, cv=5, iid=False, scoring='neg_mean_squared_error')
model_l2.fit(digits.data, digits.target)
best_C = model_l2.best_params_.get("C")
print(best_C)
model_reg = LogisticRegression(solver='lbfgs',
                               penalty='l2',
                               C=best_C,
                               max_iter=1e5,
                               multi_class='auto')
model_reg.fit(digits.data, digits.target)
predictions = model_reg.predict(digits.data)
df_cm = pd.DataFrame(confusion_matrix(digits.target, predictions))
ax = sns.heatmap(df_cm, annot = True, cbar = False, cmap = 'Blues_r', fmt='d', annot_kws = {"size": 10})
ax.set_ylim(0,10)
plt.title("Confusion Matrix with L2 regularization and best C")
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
training_size, training_score, validation_score = learning_curve(
    model_reg, digits.data, digits.target, cv=5,
    train_sizes=train_size, scoring='neg_mean_squared_error')
training_scores_mean = - training_score.mean(axis = 1)
validation_score_mean = - validation_score.mean(axis = 1)
plt.plot(training_size, validation_score_mean)
plt.plot(training_size, training_scores_mean)
plt.legend(["Validation error", "Training error"])
plt.title("Learning curve with L2 regularization and best C")
plt.ylabel("MSE")
plt.xlabel("Training set size")
plt.show()
As can be seen from the confusion matrix for the training data and from the last plot, generated using learning_curve, the error on the training set is always 0:
[Learning curve plot]
It seems to me that the model is massively overfitting, and I can't make sense of it. I've tried this using the MNIST dataset as well, and the same thing happens.
How can I solve this?
-- EDIT --
I added the code for L2 regularization above, along with the best value for the hyperparameter C.
With L2 regularization, the model still overfits the data:
[Learning curve with L2 regularization]
With the best C hyperparameter the error on the training data is no longer zero, but the algorithm still overfits:
[Learning curve with L2 regularization and best C]
Still don't understand what's happening...
Use a regularization term (penalty) instead of 'none'.
model = LogisticRegression(solver='lbfgs',
                           penalty='l2',
                           max_iter=1e5,
                           multi_class='auto')
You find the optimal value for C by doing a validation curve.
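A minimal sketch of such a validation curve, assuming digits from load_digits as in the question; the best C is the value with the highest mean cross-validated score:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import validation_curve

digits = load_digits()
C_range = np.logspace(-4, 2, 7)
# Cross-validated train/validation scores for each candidate C
train_scores, val_scores = validation_curve(
    LogisticRegression(solver='lbfgs', penalty='l2', max_iter=int(1e5), multi_class='auto'),
    digits.data, digits.target,
    param_name='C', param_range=C_range, cv=5)

best_C = C_range[val_scores.mean(axis=1).argmax()]
print("Best C:", best_C)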

Binary Classification using logistic regression with Tensorflow

I just took an ML course and am trying to get better at TensorFlow. To that end, I purchased the book by Nishant Shukla (ML with TensorFlow) and am trying to run the two-feature example with a different data set.
With the fake dataset in the book, my code runs fine. However, with data I used in the ML course, the code refuses to converge. With a really small learning rate it does converge, but the learned weights are wrong.
I am also attaching a plot of the feature data. It should not be a feature scaling issue, as the values of both features vary between 30 and 100 units.
I am really struggling with how opaque TensorFlow is; any help would be appreciated:
""" Solution for simple logistic regression model
"""
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
import numpy as np
import tensorflow as tf
import time
import matplotlib.pyplot as plt
# Define parameters for the model
learning_rate = 0.0001
training_epochs = 300
data = np.loadtxt('ex2data1.txt', delimiter=',')
x1s = np.array(data[:,0]).astype(np.float32)
x2s = np.array(data[:,1]).astype(np.float32)
ys = np.array(data[:,2]).astype(np.float32)
print('Plotting data with + indicating (y = 1) examples and o \n indicating (y = 0) examples.\n')
color = ['red' if l == 0 else 'blue' for l in ys]
myplot = plt.scatter(x1s, x2s, color = color)
# Put some labels
plt.xlabel("Exam 1 score")
plt.ylabel("Exam 2 score")
# Specified in plot order
plt.show()
# Step 2: Create datasets
X1 = tf.placeholder(tf.float32, shape=(None,), name="x1")
X2 = tf.placeholder(tf.float32, shape=(None,), name="x2")
Y = tf.placeholder(tf.float32, shape=(None,), name="y")
w = tf.Variable(np.random.rand(3,1), name='w', dtype='float32',trainable=True)
y_model = tf.sigmoid(w[2]*X2 + w[1]*X1 + w[0])
cost = tf.reduce_mean(-tf.log(y_model*Y + (1-y_model)*(1-Y)))
train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
writer = tf.summary.FileWriter('./graphs/logreg', tf.get_default_graph())
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    prev_error = 0.0
    for epoch in range(training_epochs):
        error, loss = sess.run([cost, train_op], feed_dict={X1: x1s, X2: x2s, Y: ys})
        print("epoch = ", epoch, "loss = ", loss)
        if abs(prev_error - error) < 0.0001:
            break
        prev_error = error
    w_val = sess.run(w, {X1: x1s, X2: x2s, Y: ys})
    print("w learned = ", w_val)
    writer.close()
sess.close()
Both X1 and X2 range from roughly 20 to 100. However, once I scaled them, the solution converged just fine.
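A minimal sketch of that scaling step, under the assumption that it is applied to x1s and x2s from the question before they are fed to the placeholders (standardising to zero mean and unit variance; min-max scaling works as well):
# Standardise each feature before feeding it to the model
x1s = (x1s - x1s.mean()) / x1s.std()
x2s = (x2s - x2s.mean()) / x2s.std()
# The rest of the training loop is unchanged; with the features on a comparable
# scale, gradient descent converges at a normal learning rate.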
