K Means Clustering with Python - machine-learning

FutureWarning: Unlike other reduction functions (e.g. skew,kurtosis), the default behavior of mode typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of keepdims will become False, the axis over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set keepdims to True or False to avoid this warning.
mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
This is my code and I am not able to get rid of this warning:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df = pd.read_csv('Classified Data',index_col=0)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df.drop('TARGET CLASS',axis=1))
scaled_features = scaler.transform(df.drop('TARGET CLASS',axis=1))
df_feat = pd.DataFrame(scaled_features,columns=df.columns[:-1])
from sklearn.model_selection import train_test_split
X = df_feat
y = df['TARGET CLASS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
predictions = knn.predict(X_test)
The warning shows up after running the last line of code.
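The warning is raised inside scikit-learn's own call to scipy.stats.mode during predict, not by the code above, so it cannot be fixed by editing this notebook. Upgrading scikit-learn to a release that passes keepdims explicitly should remove it; until then, one sketch of a workaround is to filter the warning locally around the call (assuming knn and X_test from the code above):
import warnings

# The FutureWarning comes from scikit-learn's internal stats.mode call,
# so silence it around predict() instead of editing library code.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", FutureWarning)
    predictions = knn.predict(X_test)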

Related

The number of classes has to be greater than one; got 1 class in SVM

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
x = np.linspace(-5.0, 5.0, 100)
y = np.sqrt(10**2 - x**2)
y=np.hstack([y,-y])
x=np.hstack([x,-x])
x1 = np.linspace(-5.0, 5.0, 100)
y1 = np.sqrt(5**2 - x1**2)
y1=np.hstack([y1,-y1])
x1=np.hstack([x1,-x1])
plt.scatter(y,x)
plt.scatter(y1,x1)
# print(plt.show())
import pandas as pd
df1 =pd.DataFrame(np.vstack([y,x]).T,columns=['X1','X2'])
df1['Y']=0
df2 =pd.DataFrame(np.vstack([y1,x1]).T,columns=['X1','X2'])
df2['Y']=1
df1.merge(df2)
# We need to find components for the Polynomial Kernel
#X1,X2,X1_square,X2_square,X1*X2
df1['X1_Square']= df1['X1']**2
df1['X2_Square']= df1['X2']**2
df1['X1*X2'] = (df1['X1'] *df1['X2'])
# print(df1.head())
### Independent and Dependent features
X = df1[['X1','X2','X1_Square','X2_Square','X1*X2']]
y = df1['Y']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=0)
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
classifier = SVC(kernel="linear")
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
accuracy_score(y_test, y_pred)
ValueError: The number of classes has to be greater than one; got 1 class
I don't know how to resolve this error. Maybe there is an error in the merge of the two data frames, or I need to append df1 and df2, but I tried that and it doesn't work.
The error occurs because y contains only the single value 0: df1.merge(df2) returns a new frame without modifying df1, so at the line y = df1['Y'] the target still comes from the class-0 rows alone.
You can replace the df1.merge(df2) line with:
df1 = pd.concat([df1,df2])
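With that change, the polynomial features computed afterwards are derived from the combined frame, so X and y contain both classes. A minimal sketch of the corrected flow, reusing the frames defined above:
import pandas as pd

# Stack both classes into one frame; df1.merge(df2) returned a new
# frame that was never assigned, so df1 kept only class 0.
df1 = pd.concat([df1, df2], ignore_index=True)

# Polynomial-kernel components, now computed for both classes.
df1['X1_Square'] = df1['X1'] ** 2
df1['X2_Square'] = df1['X2'] ** 2
df1['X1*X2'] = df1['X1'] * df1['X2']

X = df1[['X1', 'X2', 'X1_Square', 'X2_Square', 'X1*X2']]
y = df1['Y']  # now contains both 0 and 1, so SVC sees two classes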

ufunc 'isnan' not supported for the input types / could not convert string to float: for dates

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
appl = pd.read_csv('appl.csv') #apple stock prices taken from finance.yahoo.com
x = appl[['Date', 'Open','High','Low','Close']]
y = pd.to_datetime(appl['Date'])
import numpy as np
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=0)
model = LinearRegression()
#model.fit(np.isnan(x_train),np.isnan(y_train))
model.fit(x_train,y_train) #<--- receive error on this line
model.coef_
model.intercept_
When I call model.fit(x_train, y_train) I get the error ufunc 'isnan' not supported for the input types, and with model.fit(np.isnan(x_train), np.isnan(y_train)) I get could not convert string to float:
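Both messages point at non-numeric data reaching sklearn: the Date strings in x and the datetime target in y. A minimal sketch of one way to make everything numeric, assuming appl.csv has the columns used above (the choice of Close as the target is an assumption for illustration; a datetime target cannot be regressed on directly):
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

appl = pd.read_csv('appl.csv')

# Dates arrive as strings; convert them to numeric ordinals so that
# every column handed to sklearn is numeric.
appl['Date'] = pd.to_datetime(appl['Date']).map(pd.Timestamp.toordinal)

x = appl[['Date', 'Open', 'High', 'Low']]
y = appl['Close']  # numeric target chosen for illustration

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
model = LinearRegression().fit(x_train, y_train)
print(model.coef_, model.intercept_)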

ValueError when making predictions with LinearRegression

I have started learning ML.
This is my code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Import the dataset
dataset = pd.read_csv('Salary_Data.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 1].values
# Split the data set into Training Set and Test Set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = \
    train_test_split(X, Y, test_size=1/3, random_state=0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# Fitting Simple Linear Regression to Training Set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train , Y_train)
# Predicting the Test set Results
y_pred = regressor.predict(X_test)
I am getting the error:
ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.
for the last line. How do I resolve this?
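The shape (0, 1) in the message means the array reaching predict() has one column but zero rows, i.e. X_test is empty, which usually traces back to Salary_Data.csv containing fewer rows than expected. A minimal diagnostic sketch (illustrative prints, placed before the fit):
# Check how many samples survive each step; X_test must be non-empty
# for regressor.predict(X_test) to work.
print(dataset.shape)                # rows and columns read from the CSV
print(X.shape, Y.shape)             # extracted feature/target arrays
print(X_train.shape, X_test.shape)  # split sizes with test_size=1/3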

Implement Logistic Regression with L2 regularization using SGD: without using sklearn

import math
from math import log10
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn import linear_model
from sklearn.model_selection import train_test_split
def sigmoid(w, x, b):
    return 1 / (1 + math.exp(-(np.dot(x, w) + b)))

def l2_regularizer(w):
    l2_reg_sum = 0.0
    for i in range(len(w)):
        l2_reg_sum += w[i] ** 2
    return l2_reg_sum

def compute_log_loss(X_train, y_train, w, b, alpha):
    loss = 0.0
    X_train = np.clip(X_train, alpha, 1 - alpha)
    for i in range(N):
        loss += ((y_train[i] * log10(sigmoid(w, X_train[i], b)))
                 + ((1 - y_train[i]) * log10(1 - sigmoid(w, X_train[i], b))))
    # loss = -1*np.mean(actual*np.log(predicted)+(1-actual))*np.log(1-predicted)
    # loss = -1*np.mean(y_train*np.log(sigmoid(w,X_proba,b))+(1-y_train))*np.log(1-sigmoid(w,X_proba,b))
    loss = (-1 / N) * loss
    return loss
X, y = make_classification(n_samples=50000, n_features=15, n_informative=10, n_redundant=5,
                           n_classes=2, weights=[0.7], class_sep=0.7, random_state=15)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=15)
w = np.zeros_like(X_train[0])
b = 0
eta0 = 0.0001
alpha = 0.0001
N = len(X_train)
n_epochs = 3
W=[]
B=[]
W.append(w)
B.append(b)
loss_list=[]
log_loss_train=0.0
log_loss_train=compute_log_loss(X_train,y_train,w,b,alpha)
loss_list.append(log_loss_train)
print(loss_list)
for epoch in range(1, n_epochs):
    grad_loss = 0.0
    grad_intercept = 0.0
    for i in range(N):
        first_term_grad_loss = (1 - ((alpha * eta0) / N)) * W[epoch - 1]
        second_term_grad_loss = (eta0 * X_train[i]) * (y_train[i] - sigmoid(W[epoch - 1], X_train[i], B[epoch - 1]))
        grad_loss += (first_term_grad_loss + second_term_grad_loss)
        first_term_grad_intercept = B[epoch - 1]
        second_term_grad_intercept = (eta0 * (y_train[i] - sigmoid(W[epoch - 1], X_train[i], B[epoch - 1])))
        grad_intercept += (first_term_grad_intercept + second_term_grad_intercept)
    B.append(grad_intercept)
    W.append(grad_loss)
    log_loss_train = compute_log_loss(X_train, y_train, W[epoch], B[epoch], alpha)
    loss_list.append(log_loss_train)
    print(loss_list)
I am getting a math range error while calculating the sigmoid and I am not able to understand how to handle it. The sigmoid calculation is throwing the error, perhaps because of some large intermediate value.
File "C:\Users\SUMO.spyder-py3-dev\temp.py", line 12, in sigmoid
    return(1/(1+math.exp(-(np.dot(x,w)+b))))
OverflowError: math range error
First, you need to check whether the hypothesis is positive or negative, and then handle the two cases separately, as below. That way the argument passed to math.exp() is never a large positive number, so it cannot overflow.
def sigmoid(w, x, b):
    hypothesis = np.dot(x, w) + b
    if hypothesis < 0:
        return 1 - 1 / (1 + math.exp(hypothesis))
    return 1 / (1 + math.exp(-hypothesis))
You can also use np.exp() instead of math.exp(-(np.dot(x,w)+b)): math.exp works only on scalar values and raises OverflowError, while np.exp() works on NumPy arrays and overflows to inf with a warning instead of raising.
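A vectorized variant of the same idea, as a sketch (stable_sigmoid is a hypothetical helper name): np.abs keeps the exponent non-positive, so neither branch of the np.where can overflow.
import numpy as np

def stable_sigmoid(w, x, b):
    # exp() only ever sees a non-positive argument, so it cannot overflow.
    h = np.dot(x, w) + b
    e = np.exp(-np.abs(h))
    return np.where(h >= 0, 1 / (1 + e), e / (1 + e))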

label_binarize Does not fit for sklearn Naive Bayes classifier showing bad input shape

I was trying to create a ROC curve for multiclass classification using Naive Bayes, but it ends with
ValueError: bad input shape.
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.naive_bayes import BernoulliNB
from scipy import interp
# Import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Binarize the output
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]
# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                    random_state=0)
# Learn to predict each class against the other
classifier = BernoulliNB(alpha=1.0, binarize=6, class_prior=None, fit_prior=True)
y_score = classifier.fit(X_train, y_train).predict(X_test)
raise ValueError("bad input shape {0}".format(shape))
ValueError: bad input shape (75, 6)
The error is because of binarizing the y variable; the estimator can work with the original class labels directly.
Remove the following lines:
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]
You are good to go!
To get the predicted probabilities for roc_curve, use the following:
classifier.fit(X_train, y_train)
y_score = classifier.predict_proba(X_test)
y_score.shape
# (75, 3)
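From those probabilities the per-class ROC curves follow the usual one-vs-rest pattern; a sketch, assuming the variables above, where only y_test is binarized for scoring:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

# Binarize only the test labels for scoring; the classifier itself
# stays fitted on the original labels.
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])

for i in range(y_test_bin.shape[1]):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    plt.plot(fpr, tpr, label='class %d (AUC = %.2f)' % (i, auc(fpr, tpr)))
plt.legend()
plt.show()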
