Implement Logistic Regression with L2 regularization using SGD, without using sklearn - machine-learning

import math
from math import log10
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn import linear_model
from sklearn.model_selection import train_test_split
def sigmoid(w, x, b):
    """Return the logistic function 1 / (1 + e^(-z)) for z = x . w + b.

    The exponential is evaluated with ``math.exp``, so z has to be a
    scalar. When z is a large negative number, ``math.exp(-z)`` raises
    ``OverflowError: math range error``.
    """
    z = np.dot(x, w) + b
    return 1 / (1 + math.exp(-z))
def l2_regularizer(w):
    """Return the sum of the squared entries of ``w`` (0.0 for an empty ``w``)."""
    # Start from 0.0 so the result is a float even when ``w`` is empty.
    return sum((weight ** 2 for weight in w), 0.0)
def compute_log_loss(X_train, y_train, w, b, alpha):
    """Return the mean base-10 log loss of the model ``(w, b)`` on the data.

    Parameters
    ----------
    X_train : sequence of feature vectors, one per sample.
    y_train : sequence of 0/1 labels aligned with ``X_train``.
    w, b : weight vector and intercept forwarded to ``sigmoid``.
    alpha : small positive bound; every predicted probability is clipped
        to ``[alpha, 1 - alpha]`` before the logarithm is taken.

    Returns
    -------
    float
        ``-(1/n) * sum(y*log10(p) + (1-y)*log10(1-p))`` over the n samples.

    Raises
    ------
    ZeroDivisionError
        If ``X_train`` is empty.
    """
    # Use the data's own length instead of a module-level sample count.
    n_samples = len(X_train)
    total = 0.0
    for features, label in zip(X_train, y_train):
        # Clip the probability, not the features: clipping the inputs would
        # distort the data, and it would not keep log10 away from 0.
        prob = min(max(sigmoid(w, features, b), alpha), 1 - alpha)
        total += label * log10(prob) + (1 - label) * log10(1 - prob)
    return (-1 / n_samples) * total
# Generate a synthetic two-class dataset (50,000 samples, 15 features) with a fixed seed.
X, y = make_classification(n_samples=50000, n_features=15, n_informative=10, n_redundant=5,
n_classes=2, weights=[0.7], class_sep=0.7, random_state=15)
# Hold out 25% of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=15)
# Start from an all-zero weight vector and a zero intercept.
w = np.zeros_like(X_train[0])
b = 0
# eta0 scales every per-sample update below; alpha appears in the weight-shrink
# factor and is also passed to compute_log_loss.
eta0 = 0.0001
alpha = 0.0001
N = len(X_train)
n_epochs = 3
# W[k] and B[k] hold the weights and intercept after k epochs; index 0 is the initial state.
W=[]
B=[]
W.append(w)
B.append(b)
# loss_list[k] is the training log loss measured with W[k], B[k].
loss_list=[]
log_loss_train=0.0
log_loss_train=compute_log_loss(X_train,y_train,w,b,alpha)
loss_list.append(log_loss_train)
print(loss_list)
# range(1, n_epochs) yields n_epochs - 1 passes over the data (2 passes for n_epochs = 3).
for epoch in range(1,n_epochs):
grad_loss=0.0
grad_intercept=0.0
# NOTE(review): indentation was lost in this listing; the statements up to the
# B.append call are presumably the body of this per-sample loop - confirm.
for i in range(N):
# Previous epoch's weights multiplied by the shrink factor (1 - alpha*eta0/N).
first_term_grad_loss=((1-((alpha*eta0)/N))*W[epoch-1])
# eta0 * x_i * (y_i - predicted probability) for sample i.
second_term_grad_loss=((eta0*X_train[i])*(y_train[i]-sigmoid(W[epoch-1],X_train[i],B[epoch-1])))
# NOTE(review): both terms are accumulated for every sample, so the previous
# weights are added once per sample; W can therefore grow by roughly a factor
# of N per epoch, which looks like the cause of the OverflowError raised in
# sigmoid - TODO confirm.
grad_loss+=(first_term_grad_loss+second_term_grad_loss)
first_term_grad_intercept=B[epoch-1]
second_term_grad_intercept=(eta0*(y_train[i]-sigmoid(W[epoch-1],X_train[i],B[epoch-1])))
# The previous intercept is likewise re-added on every accumulation.
grad_intercept+=(first_term_grad_intercept+second_term_grad_intercept)
# Store the accumulated sums as the parameters for this epoch.
B.append(grad_intercept)
W.append(grad_loss)
log_loss_train=0.0
log_loss_train=compute_log_loss(X_train,y_train,W[epoch],B[epoch],alpha)
loss_list.append(log_loss_train)
print(loss_list)
I am getting a math range error while calculating the sigmoid, and I do not understand how to handle it. The sigmoid calculation throws the error, possibly because an intermediate value becomes too large.
File "C:\Users\SUMO.spyder-py3-dev\temp.py", line 12, in sigmoid return(1/(1+math.exp(-(np.dot(x,w)+b)))) OverflowError: math range error.

First, you need to identify whether your hypothesis is positive or negative. Then handle the positive and negative cases separately, as shown below.
def sigmoid(w, x, b):
    """Return the logistic function of z = x . w + b without overflowing.

    Parameters
    ----------
    w : weight vector.
    x : feature vector of the same length as ``w``.
    b : scalar intercept.

    Returns
    -------
    float
        ``1 / (1 + e^(-z))``, computed so that ``math.exp`` only ever
        receives a non-positive argument.
    """
    hypothesis = np.dot(x, w) + b
    if hypothesis < 0:
        # exp(hypothesis) <= 1 here, so it cannot overflow. The form
        # e^z / (1 + e^z) is used instead of 1 - 1 / (1 + e^z): the latter
        # cancels to exactly 0.0 once e^z drops below machine epsilon,
        # which would later break log(sigmoid(...)).
        exp_h = math.exp(hypothesis)
        return exp_h / (1 + exp_h)
    # hypothesis >= 0, so exp(-hypothesis) <= 1 and cannot overflow either.
    return 1 / (1 + math.exp(-hypothesis))

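Try using np.exp() instead of math.exp(-(np.dot(x,w)+b)): math.exp works only on scalar values and raises OverflowError when the result is too large, whereas np.exp() works on NumPy arrays and returns inf (with a RuntimeWarning) on overflow, so 1/(1+inf) evaluates to 0.0 instead of raising.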

Related

Unexpected behaviour (inflated results on random-data) in scikit-learn with nested cross-validation

When trying to train/evaluate a support vector machine in scikit-learn, I am experiencing some unexpected behaviour, and I am wondering whether I am doing something wrong or whether this is a possible bug.
In a very specific subset of circumstances, nested cross-validation using GridSearchCV and an SVM provides inflated predictive results, even with randomly generated data.
For instance, see this code:
from sklearn import svm
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold, LeaveOneOut
from sklearn.metrics import roc_auc_score, brier_score_loss
from tqdm import tqdm
import pandas as pd
# Experiment: run nested cross-validation on purely random data and record
# Brier score and AUC for each combination of outer CV, inner CV and model.
N = 20
N_FEATURES = 50
# Grid of values for the regularisation parameter C searched by GridSearchCV.
param_grid = {'C': [1e-5, 1e-3, 1, 1e3, 1e5]}
scores = []
# 100 independent repetitions, each with freshly drawn data.
for z in tqdm(range(100)):
# Features and labels are drawn independently of each other.
X = np.random.uniform(size=(N, N_FEATURES))
y = np.random.binomial(1, 0.5, size=N)
# For the first 10 repetitions the labels are replaced by an exactly balanced,
# shuffled 0/1 vector.
# NOTE(review): indentation was lost in this listing; the permutation line is
# presumably part of this branch - confirm.
if z < 10:
y = np.array([0, 1] * int(N/2))
y = np.random.permutation(y)
# Try every combination of outer splitter, inner splitter and model.
for skf_outer in [StratifiedKFold(n_splits=5), LeaveOneOut()]:
for skf_inner in [5, LeaveOneOut()]:
for model in [svm.SVC(probability=True), LogisticRegression()]:
# Out-of-fold predictions and true labels collected over the outer folds.
y_pred, y_real = [], []
for train_index, test_index in skf_outer.split(X, y):
X_train, X_test = X[train_index], X[test_index, :]
y_train, y_test = y[train_index], y[test_index]
# Inner loop: choose C on the outer-training part only.
clf = GridSearchCV(
model, param_grid, cv=skf_inner, n_jobs=-1, scoring='neg_brier_score'
)
clf.fit(X_train, y_train)
# Keep column 1 of predict_proba for the held-out samples.
predictions = clf.predict_proba(X_test)[:, 1]
y_pred.extend(predictions)
y_real.extend(y_test)
# One row per (repetition, outer CV, inner CV, model): label mean, Brier score, AUC.
scores.append([str(skf_outer), str(skf_inner), str(model), np.mean(y), brier_score_loss(np.array(y_real), np.array(y_pred)), roc_auc_score(np.array(y_real), np.array(y_pred))])
df_scores = pd.DataFrame(scores)
df_scores.columns = ['skf_outer', 'skf_inner', 'model', 'y_label', 'brier', 'auc']
# Flag the repetitions whose labels were exactly balanced.
df_scores['y_0.5'] = df_scores['y_label'] == 0.5
# Average the metrics per configuration, separately for balanced and unbalanced labels.
df_scores = df_scores.groupby(['skf_outer', 'skf_inner', 'model', 'y_0.5']).mean()
print(df_scores)
In the following circumstances:
Both in the inner- and outerloop of the CV, LeaveOneOut() is used
The SVM is used
The y labels are balanced (i.e. the mean of y is 0.5)
The predictions are much better than expected by random chance (AUC > 0.9, sometimes even 1; Brier score of 0.15 or lower). I can replicate this when generating more samples, more features, etc. - the issue stays the same. Swapping the SVM for LogisticRegression (as shown in the analysis above) leads to the expected results (AUC 0.5, Brier score of 0.25). And for the other scenarios (no LOO-CV in either the inner or the outer loop, or a different distribution of y labels), the results are as expected.
Can anyone replicate this? Am I missing something obvious?
I've replicated this with an older version of sklearn (0.24.0) and the newest one (1.2.0).

The number of classes has to be greater than one; got 1 class in SVM

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Points on a circle of radius 10: y = sqrt(10^2 - x^2) for x in [-5, 5],
# then mirrored by appending the negated coordinates.
x = np.linspace(-5.0, 5.0, 100)
y = np.sqrt(10**2 - x**2)
y=np.hstack([y,-y])
x=np.hstack([x,-x])
# Same construction for a circle of radius 5.
x1 = np.linspace(-5.0, 5.0, 100)
y1 = np.sqrt(5**2 - x1**2)
y1=np.hstack([y1,-y1])
x1=np.hstack([x1,-x1])
plt.scatter(y,x)
plt.scatter(y1,x1)
# print(plt.show())
import pandas as pd
# Radius-10 points get label 0, radius-5 points get label 1.
df1 =pd.DataFrame(np.vstack([y,x]).T,columns=['X1','X2'])
df1['Y']=0
df2 =pd.DataFrame(np.vstack([y1,x1]).T,columns=['X1','X2'])
df2['Y']=1
# NOTE(review): the result of merge is not assigned, so df1 is left unchanged
# and still contains only the label-0 rows.
df1.merge(df2)
# We need to find components for the Polynomical Kernel
#X1,X2,X1_square,X2_square,X1*X2
df1['X1_Square']= df1['X1']**2
df1['X2_Square']= df1['X2']**2
df1['X1*X2'] = (df1['X1'] *df1['X2'])
# print(df1.head())
### Independent and Dependent features
# Both the features and the target are read from df1 only.
X = df1[['X1','X2','X1_Square','X2_Square','X1*X2']]
y = df1['Y']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size = 0.25,
random_state = 0)
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Linear-kernel SVM on the explicit degree-2 polynomial features.
classifier = SVC(kernel="linear")
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
accuracy_score(y_test, y_pred)
ValueError: The number of classes has to be greater than one; got 1 class
I don't know how to resolve this error. Maybe there is an error in the merge of the two data frames, or maybe I need to append df2 to df1, but I tried that and it doesn't work.
The error occurs because y contains only the value 0: df1.merge(df2) returns a new DataFrame that is never assigned, so the line y = df1['Y'] still reads only the labels of df1.
You can replace the line df1.merge(df2) with:
df1 = pd.concat([df1,df2])

label_binarize Does not fit for sklearn Naive Bayes classifier showing bad input shape

I was trying to create a ROC curve for a multiclass problem using Naive Bayes, but it ends with
ValueError: bad input shape.
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.naive_bayes import BernoulliNB
from scipy import interp
# Import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Binarize the output
# NOTE(review): after this call y is presumably a 2-D indicator matrix with one
# column per class rather than a 1-D label vector - confirm.
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]
# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
# Append 200 * n_features columns of standard-normal noise to the original features.
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
random_state=0)
# Learn to predict each class against the other
classifier = BernoulliNB(alpha=1.0, binarize=6, class_prior=None, fit_prior=True)
# y_train comes from the binarized y above; this is the call the question's
# "bad input shape" ValueError is reported for.
y_score = classifier.fit(X_train, y_train).predict(X_test)
raise ValueError("bad input shape {0}".format(shape))
ValueError: bad input shape (75, 6)
The error is caused by binarizing the y variable. The estimator can work with the original class labels (even string values) by itself.
Remove the following lines:
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]
You are good to go!
To get the predicted probabilities for roc_curve, use the following:
classifier.fit(X_train, y_train)
y_score = classifier.predict_proba(X_test)
y_score.shape
# (75, 3)

raise ValueError("Unknown label type: %s" % repr(ys)) ValueError: Unknown label type: (array

I'm trying out a machine learning approach, but I'm having some problems. This is my code:
import sys
import scipy
import numpy
import matplotlib
import pandas
import sklearn
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Load the CSV and convert the whole table to a float64 array, so the target
# column is float-valued as well.
dataset = pandas.read_csv('Libro111.csv')
array = numpy.asarray(dataset,dtype=numpy.float64) #all values are float64
# Column 0 is used as the target; columns 1..48 are the features.
X = array[:,1:49]
Y = array[:,0]
validation_size = 0.2
# NOTE(review): the seed is a float here; random_state is usually given as an
# int - confirm that this is accepted.
seed = 7.0
# Hold out 20% of the rows for validation.
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
scoring = 'accuracy'
# (short name, estimator) pairs to compare; all of them are classifiers.
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
results = []
names = []
# Score every model with 10-fold cross-validation and print mean and standard deviation.
# NOTE(review): indentation was lost in this listing; the statements below are
# presumably all inside this loop - confirm.
for name, model in models:
kfold = model_selection.KFold(n_splits=10, random_state=seed)
cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
And then I get two different errors.
For Logistic Regression:
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 172, in check_classification_targets
raise ValueError("Unknown label type: %r" % y_type)
ValueError: Unknown label type: 'continuous'
I found someone who had the same problems but I couldn't sort it out yet..
And (most important):
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 97, in unique_labels
raise ValueError("Unknown label type: %s" % repr(ys))
ValueError: Unknown label type: (array([ 0.5, 0. , 1. , 1. , 0.5, 0.5, 1. , 0.5, 0. , 0.5, 1. ,
0. , 0. , 0. , 1. , 1......
In both cases the error occurs when I execute the "cv_results" line. I hope you can help me.
"ValueError: Unknown label type: 'continuous'" means that your "Y" values are not class labels (in classification data, multiple rows share the same integer value, and each integer represents a class). Therefore, you cannot use "DecisionTreeClassifier", "KNeighborsClassifier", "LogisticRegression" (do not be fooled by its name: LogisticRegression is a classification method) or any other classification method on them. Your "Y" values are treated as 'continuous' because they are float numbers with non-integer values such as 0.5, so as they stand you can only use regression methods (e.g. "RandomForestRegressor").
Here are two solutions:
a) Group Y values into bins (classes). Apply classification modeling to your data.
b) If you prefer your predictions to have values (float numbers), You need to use the regression machine learning methods to predict Y values.
By the way, the "scoring = 'accuracy'" evaluation method is for classification modeling.

How to encode categorical data for use with Semi-supervised algorithm LabelPropagation

I am attempting to use the anneal.arff dataset with Python scikit-learn's semi-supervised algorithm LabelPropagation. The anneal dataset is categorical data, so I preprocessed it so that the output class for each instance
looks like [0. 0. 1. 0. 0.]. This is a numeric list that encodes the output class
as 5 possible values with 0's everywhere, and 1. in the position of the corresponding class. This is what I would expect.
For semi-supervised learning, most of the training data must be unlabeled, so
I modified the training set so that the unlabeled data has output [-1, -1, -1, -1, -1]. I previously tried just using -1, but the code emits the same error as shown below.
I train the classifier as follows, Y_train includes labeled and "unlabeled" data:
lp_model = LabelSpreading(gamma=0.25, max_iter=5)
lp_model.fit(X, Y_train)
I receive the error shown below after calling the fit method:
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\semi_supervised\label_propagation.py", line 221, in fit
X, y = check_X_y(X, y)
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 526, in check_X_y
y = column_or_1d(y, warn=True)
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 562, in column_or_1d
raise ValueError("bad input shape {0}".format(shape))
ValueError: bad input shape (538, 5)
This suggests that something is wrong with the shape of my Y_train list,
but this is the correct shape. What am I doing wrong?
Can LabelPropagation take training data in this form, or does it only
accept unlabeled data marked with a scalar -1?
--- edit ---
Here is the code that generates the error. I'm sorry about the confusion over algorithms--I want to use both LabelSpreading and LabelPropagation, and choosing one or the other doesn't fix this error.
from scipy.io import arff
import pandas as pd
import numpy as np
import math
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from copy import deepcopy
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
# Load the ARFF file into a DataFrame.
f = "../../Documents/UCI/anneal.arff"
dataAsRecArray, meta = arff.loadarff(f)
dataset_raw = pd.DataFrame.from_records(dataAsRecArray)
# One-hot encode the data with pandas.get_dummies.
dataset = pd.get_dummies(dataset_raw)
# Columns whose name contains 'class_' are treated as the output columns.
class_names = [col for col in dataset.columns if 'class_' in col]
print (dataset.shape)
number_of_output_columns = len(class_names)
print (number_of_output_columns)
def run(name, model, dataset, percent):
    """Fit a LabelSpreading model on a partially unlabeled training split.

    ``dataset`` is split 60/40 into training and validation parts. The
    first ``floor(percent * n_train)`` training rows keep their one-hot
    labels and the remaining rows are overwritten with -1 in every output
    column. ``name`` and ``model`` are accepted but not used: a fresh
    ``LabelSpreading(gamma=0.25, max_iter=5)`` is always fitted.
    """
    # Split-out validation dataset
    values = dataset.values
    X = values[:, 0:-number_of_output_columns]
    Y = values[:, -number_of_output_columns:]
    validation_size = 0.40
    seed = 7
    X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
        X, Y, test_size=validation_size, random_state=seed
    )

    # Everything after the first num_labeled_points rows is marked unlabeled.
    num_samples = len(Y_train)
    num_labeled_points = math.floor(percent * num_samples)
    unlabeled_set = np.arange(num_samples)[num_labeled_points:]
    Y_train[unlabeled_set] = [-1, -1, -1, -1, -1]

    # Y_train is still a 2-D block of output columns at this point.
    lp_model = LabelSpreading(gamma=0.25, max_iter=5)
    lp_model.fit(X_train, Y_train)
    # Disabled inspection of the transduced labels:
    # predicted_labels = lp_model.transduction_[unlabeled_set]
    # print(predicted_labels[:10])
# Script entry point: call run() for every (model, labeled-fraction) pair.
if __name__ == "__main__":
#percentages = [0.1, 0.2, 0.3, 0.4]
# Fraction of the training rows that keep their labels.
percentages = [0.1]
models = []
models.append(('LS', LabelSpreading()))
#models.append(('CART', DecisionTreeClassifier()))
#models.append(('NB', GaussianNB()))
#models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
# NOTE(review): run() builds its own LabelSpreading instance, so the model
# object passed here does not affect the fit.
for name, model in models:
for percent in percentages:
run(name, model, dataset, percent)
print ("bye")
Your Y_train has shape (538, 5) but should be 1d. LabelPropagation doesn't support multi-label or multi-output multi-class right now.
The error message could be more informative, though :-/

Resources