How to use Recursive Feature Elimination? - machine-learning

I am new to ML and have been trying feature selection with the RFE approach. My dataset has 5K records and it's a binary classification problem. This is the code I am following, based on an online tutorial:
# no of features
nof_list = np.arange(1, 13)
high_score = 0
# Variable to store the optimum features
nof = 0
score_list = []
for n in range(len(nof_list)):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    model = RandomForestClassifier()
    rfe = RFE(model, nof_list[n])
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    model.fit(X_train_rfe, y_train)
    score = model.score(X_test_rfe, y_test)
    score_list.append(score)
    if score > high_score:
        high_score = score
        nof = nof_list[n]
print("Optimum number of features: %d" % nof)
print("Score with %d features: %f" % (nof, high_score))
I encounter the error below. Can someone please help?
TypeError Traceback (most recent call last)
<ipython-input-332-a23dfb331001> in <module>
9 model = RandomForestClassifier()
10 rfe = RFE(model,nof_list[n])
---> 11 X_train_rfe = rfe.fit_transform(X_train,y_train)
12 X_test_rfe = rfe.transform(X_test)
13 model.fit(X_train_rfe,y_train)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
554 Training set.
555
--> 556 y : numpy array of shape [n_samples]
557 Target values.
558
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\feature_selection\_base.py in transform(self, X)
75 X = check_array(X, dtype=None, accept_sparse='csr',
76 force_all_finite=not tags.get('allow_nan', True))
---> 77 mask = self.get_support()
78 if not mask.any():
79 warn("No features were selected: either the data is"
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\feature_selection\_base.py in get_support(self, indices)
44 values are indices into the input feature vector.
45 """
---> 46 mask = self._get_support_mask()
47 return mask if not indices else np.where(mask)[0]
48
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\feature_selection\_rfe.py in _get_support_mask(self)
269
270 def _get_support_mask(self):
--> 271 check_is_fitted(self)
272 return self.support_
273
TypeError: check_is_fitted() missing 1 required positional argument: 'attributes'

What is your sklearn version?
The following (using artificial data) should work fine:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
X = np.random.rand(100,20)
y = np.ones((X.shape[0]))
# no of features
nof_list = np.arange(1, 13)
high_score = 0
# Variable to store the optimum features
nof = 0
score_list = []
for n in range(len(nof_list)):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    model = RandomForestClassifier()
    rfe = RFE(model, nof_list[n])
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    model.fit(X_train_rfe, y_train)
    score = model.score(X_test_rfe, y_test)
    score_list.append(score)
    if score > high_score:
        high_score = score
        nof = nof_list[n]
print("Optimum number of features: %d" % nof)
print("Score with %d features: %f" % (nof, high_score))
Optimum number of features: 1
Score with 1 features: 1.000000
Versions tested:
sklearn.__version__
'0.20.4'
sklearn.__version__
'0.21.3'
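For context: check_is_fitted() required a positional attributes argument up to sklearn 0.21, while the feature_selection modules in your traceback call it without one, as 0.22+ does, so an error like yours usually points to a broken or mixed installation rather than the RFE code itself. A hedged sketch of what I would check, plus the keyword form of the RFE argument, which is safer across versions since the second positional parameter has moved over the years:
import sklearn
print(sklearn.__version__)
# if the installed modules disagree with this version, reinstall:
#   pip install --upgrade scikit-learn

# keyword form works across recent releases
rfe = RFE(model, n_features_to_select=nof_list[n])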

Related

Show the visualization of the dataset before and after classification

I am trying to create an ML model using SVM on a dataset with 23 features, where the output is 0 or 1 (two classes). My goal is to show a visualization before and after the classification.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
ds= pd.read_csv("dataset_sdn.csv")
ds= ds.fillna(0)
X = ds.iloc[: , [4,5,6,7,8,9,10,11,12,13,14,17,18,19,20,21]] # Input Features
Y = ds.iloc[:, 22] # OutPut
X_Train, X_Test, Y_Train, Y_Test = train_test_split (X, Y, test_size=0.25, random_state=0)
sc_X = StandardScaler()
X_Train = sc_X.fit_transform(X_Train)
X_Test = sc_X.transform(X_Test)
Then I tried to visualize X and Y:
import matplotlib.pyplot as plt
plt.scatter(X,Y)
plt.show()
But I got this error:
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [11], in <cell line: 2>()
      1 import matplotlib.pyplot as plt
----> 2 plt.scatter(X,Y)
      3 plt.show()

File ~\AppData\Roaming\Python\Python310\site-packages\matplotlib\pyplot.py:2817, in scatter(x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, edgecolors, plotnonfinite, data, **kwargs)
   2812 @_copy_docstring_and_deprecators(Axes.scatter)
   2813 def scatter(
   2814         x, y, s=None, c=None, marker=None, cmap=None, norm=None,
   2815         vmin=None, vmax=None, alpha=None, linewidths=None, *,
   2816         edgecolors=None, plotnonfinite=False, data=None, **kwargs):
-> 2817     __ret = gca().scatter(
   2818         x, y, s=s, c=c, marker=marker, cmap=cmap, norm=norm,
   2819         vmin=vmin, vmax=vmax, alpha=alpha, linewidths=linewidths,
   2820         edgecolors=edgecolors, plotnonfinite=plotnonfinite,
   2821         **({"data": data} if data is not None else {}), **kwargs)
   2822 sci(__ret)
   2823 return __ret

File ~\AppData\Roaming\Python\Python310\site-packages\matplotlib\__init__.py:1414, in _preprocess_data.<locals>.inner(ax, data, *args, **kwargs)
   1411 @functools.wraps(func)
   1412 def inner(ax, *args, data=None, **kwargs):
   1413     if data is None:
-> 1414         return func(ax, *map(sanitize_sequence, args), **kwargs)
   1416 bound = new_sig.bind(ax, *args, **kwargs)
   1417 auto_label = (bound.arguments.get(label_namer)
   1418               or bound.kwargs.get(label_namer))

File ~\AppData\Roaming\Python\Python310\site-packages\matplotlib\axes\_axes.py:4368, in Axes.scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, edgecolors, plotnonfinite, **kwargs)
   4366 y = np.ma.ravel(y)
   4367 if x.size != y.size:
-> 4368     raise ValueError("x and y must be the same size")
   4370 if s is None:
   4371     s = (20 if rcParams['_internal.classic_mode'] else
   4372          rcParams['lines.markersize'] ** 2.0)

ValueError: x and y must be the same size
Then I started the classification:
classifier = SVC (kernel='rbf', C=1, random_state=0,)
classifier.fit(X_Train, Y_Train)
Y_pred = classifier.predict(X_Test)
print (Y_pred)
Then I tried the code below, but it did not work:
from mlxtend.plotting import plot_decision_regions
plot_decision_regions(X=X_Test, y=Y_Test, clf=classifier, legend=1)
I get this error
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [10], in <cell line: 2>()
1 from mlxtend.plotting import plot_decision_regions
----> 2 plot_decision_regions(X=Y_Test, y=X_Test, clf=classifier,legend=2)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\mlxtend\plotting\decision_regions.py:148, in plot_decision_regions(X, y, clf, feature_index, filler_feature_values, filler_feature_ranges, ax, X_highlight, zoom_factor, legend, hide_spines, markers, colors, scatter_kwargs, contourf_kwargs, contour_kwargs, scatter_highlight_kwargs)
44 def plot_decision_regions(
45 X,
46 y, (...)
65 scatter_highlight_kwargs=None,
66 ):
67 """Plot decision regions of a classifier.
68
69 Please note that this functions assumes that class labels are (...)
145
146 """
--> 148 check_Xy(X, y, y_int=True) # Validate X and y arrays
149 dim = X.shape[1]
151 if ax is None:
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\mlxtend\utils\checking.py:16, in check_Xy(X, y, y_int)
12 def check_Xy(X, y, y_int=True):
13
14 # check types
15 if not isinstance(X, np.ndarray):
---> 16 raise ValueError("X must be a NumPy array. Found %s" % type(X))
17 if not isinstance(y, np.ndarray):
18 raise ValueError("y must be a NumPy array. Found %s" % type(y))
ValueError: X must be a NumPy array. Found <class 'pandas.core.series.Series'>
So, please advise how to show the visualization. Thanks in advance.
See here:
X = ds.iloc[: , [4,5,6,7,8,9,10,11,12,13,14,17,18,19,20,21]] # Input Features
Y = ds.iloc[:, 22] # OutPut
and then here
plt.scatter(X,Y)
Your X is multi-dimensional, with 16 features, hence the scatter plot does not work: you can only plot two variables in a 2D scatter plot.
So you have to plot each feature against Y, e.g.,
plt.scatter(X.iloc[:,0], Y)
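As a hedged sketch (assuming the X and Y defined in the question, i.e. a 16-column DataFrame and a target Series), you could put all sixteen feature-vs-target scatter plots in one grid:
import matplotlib.pyplot as plt

fig, axes = plt.subplots(4, 4, figsize=(16, 12))
for i, ax in enumerate(axes.ravel()):
    ax.scatter(X.iloc[:, i], Y, s=5)   # one feature against the target
    ax.set_title(str(X.columns[i]))
plt.tight_layout()
plt.show()
As for the plot_decision_regions error: your traceback shows the arguments swapped (X=Y_Test, y=X_Test). Passing X=X_Test (already a NumPy array after StandardScaler) and y=Y_Test.to_numpy() gets past the type check, but with 16 features you would still have to pick two of them via feature_index and pin the rest with filler_feature_values, since decision regions can only be drawn in two dimensions.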

ValueError: X has 10 features, but DecisionTreeClassifier is expecting 11 features as input

I am a beginner; can someone please tell me where I made a mistake in this code?
The dataset used is the Kaggle Titanic dataset.
The error shows up in the 9th cell; the rest run fine on their own.
In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
In [2]:
train_data.dtypes
In [3]:
train_data.isna().sum()
In [4]:
train_data = train_data.fillna(value = {'Age' :0, 'Embarked' :'u'})
In [5]:
train_data.isna().sum()
In [6]:
train_data.shape
In [7]:
test_data = test_data.fillna(value = {'Age' :0, 'Fare' :0})
In [8]:
test_data.shape
In [9]: (the error occurs in this cell; I have specified the features to be used, so why does it say the classifier expects 11 features?)
y = train_data["Survived"]
features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Fare", "Embarked"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])
model = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")
Error traceback:
ValueError                                Traceback (most recent call last)
<ipython-input-11-a7ceba9b896f> in <module>
7 model = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=1)
8 model.fit(X, y)
----> 9 predictions = model.predict(X_test)
10
11 output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
c:\python39\lib\site-packages\sklearn\ensemble\_forest.py in predict(self, X)
628 The predicted classes.
629 """
--> 630 proba = self.predict_proba(X)
631
632 if self.n_outputs_ == 1:
c:\python39\lib\site-packages\sklearn\ensemble\_forest.py in predict_proba(self, X)
672 check_is_fitted(self)
673 # Check data
--> 674 X = self._validate_X_predict(X)
675
676 # Assign chunk of trees to jobs
c:\python39\lib\site-packages\sklearn\ensemble\_forest.py in _validate_X_predict(self, X)
420 check_is_fitted(self)
421
--> 422 return self.estimators_[0]._validate_X_predict(X, check_input=True)
423
424 @property
c:\python39\lib\site-packages\sklearn\tree\_classes.py in _validate_X_predict(self, X, check_input)
405 """Validate the training data on predict (probabilities)."""
406 if check_input:
--> 407 X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr",
408 reset=False)
409 if issparse(X) and (X.indices.dtype != np.intc or
c:\python39\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
435
436 if check_params.get('ensure_2d', True):
--> 437 self._check_n_features(X, reset=reset)
438
439 return out
c:\python39\lib\site-packages\sklearn\base.py in _check_n_features(self, X, reset)
363
364 if n_features != self.n_features_in_:
--> 365 raise ValueError(
366 f"X has {n_features} features, but {self.__class__.__name__} "
367 f"is expecting {self.n_features_in_} features as input.")
ValueError: X has 10 features, but DecisionTreeClassifier is expecting 11 features as input
You don't have the same number of features in your train set and your test set because you apply the function pd.get_dummies() to each of them separately: the test set contains a category value that never appears in the train set, so the dummy columns don't line up.
To solve this, the best way is to use OneHotEncoder from the module sklearn.preprocessing with the parameter handle_unknown="ignore":
from sklearn.preprocessing import OneHotEncoder

oneh = OneHotEncoder(handle_unknown="ignore")
X = oneh.fit_transform(train_data[features])   # fit on the train set only, then encode it
X_test = oneh.transform(test_data[features])   # same columns applied to the test set
Moreover, it is not a good idea to have different preprocessing workflows for the train and the test set (your separate fillna() calls, for instance).
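Going one step further, a Pipeline guarantees that the train and test sets always pass through identical preprocessing. A minimal sketch, assuming the train_data, test_data, features and y variables from the question:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

pipe = Pipeline([
    ("encode", OneHotEncoder(handle_unknown="ignore")),
    ("forest", RandomForestClassifier(n_estimators=100, max_depth=8, random_state=1)),
])
pipe.fit(train_data[features], y)                # encoder fitted on the train set only
predictions = pipe.predict(test_data[features])  # identical encoding at predict time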

Unable to get output using CNN model

I am trying to use a CNN-LSTM model on this dataset, which I've stored in a dataframe named df. There are 11 columns in total, but I am only showing 9 of them here. All columns contain numerical values only.
Area book_hotel votes location hotel_type Total_Price Facilities Dine rate
6 0 0 1 163 400 22 7 4.4
19 1 2 7 122 220 28 11 4.6
X=df.drop(['rate'],axis=1)
Y=df['rate']
x_train, x_test, y_train, y_test = train_test_split(np.asarray(X), np.asarray(Y), test_size=0.33, shuffle= True)
x_train has shape (3350,10) and
x_test has shape (1650, 10)
# The known number of output classes.
num_classes = 10
# Input image dimensions
input_shape = (10,)
# Convert class vectors to binary class matrices. This uses 1 hot encoding.
y_train_binary = keras.utils.to_categorical(y_train, num_classes)
y_test_binary = keras.utils.to_categorical(y_test, num_classes)
x_train = x_train.reshape(3350, 10,1)
x_test = x_test.reshape(1650, 10,1)
input_layer = Input(shape=(10, 1))
conv1 = Conv1D(filters=32,
               kernel_size=8,
               strides=1,
               activation='relu',
               padding='same')(input_layer)
lstm1 = LSTM(32, return_sequences=True)(conv1)
output_layer = Dense(1, activation='sigmoid')(lstm1)
model = Model(inputs=input_layer, outputs=output_layer)
model.summary()
model.compile(loss='mse', optimizer='adam')
Finally, when I try to fit the model with this input:
model.fit(x_train,y_train)
ValueError Traceback (most recent call last)
<ipython-input-170-4719cf73997a> in <module>()
----> 1 model.fit(x_train,y_train)
2 frames
/usr/local/lib/python3.6/dist-packages/keras/engine/training_utils.py in standardize_input_data(data, names, shapes, check_batch_axis, exception_prefix)
133 ': expected ' + names[i] + ' to have ' +
134 str(len(shape)) + ' dimensions, but got array '
--> 135 'with shape ' + str(data_shape))
136 if not check_batch_axis:
137 data_shape = data_shape[1:]
ValueError: Error when checking target: expected dense_2 to have 3 dimensions, but got array with shape (3350, 1)
Can someone please help me resolve this error?
I see some problems in your code...
the last dimension of the output must equal the number of classes, and for a multiclass task you need a softmax activation: Dense(num_classes, activation='softmax')
you must set return_sequences=False in your last LSTM cell, because you need a 2D output, not a 3D one
you must use categorical_crossentropy as the loss function with one-hot encoded targets
Here is a complete dummy example...
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, LSTM, Dense
from tensorflow.keras.models import Model

num_classes = 10
n_sample = 1000
X = np.random.uniform(0,1, (n_sample,10,1))
y = tf.keras.utils.to_categorical(np.random.randint(0,num_classes, n_sample))
input_layer = Input(shape=(10, 1))
conv1 = Conv1D(filters=32,
               kernel_size=8,
               strides=1,
               activation='relu',
               padding='same')(input_layer)
lstm1 = LSTM(32, return_sequences=False)(conv1)
output_layer = Dense(num_classes, activation='softmax')(lstm1)
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=5)
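At prediction time the model now returns one softmax probability per class, so hard labels are recovered with an argmax over the last axis:
probs = model.predict(X)              # shape (n_sample, num_classes)
pred_classes = probs.argmax(axis=-1)  # integer class labels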

Simple classification using scikit-learn not working

This is the code that I used to solve a classification problem pertaining to credit card fraud detection:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
df = pd.read_csv(r'C:\Users\SVISHWANATH\Downloads\creditcard.csv')
f = df.drop(['Class'], axis = 1)
g = df.Class
g.values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(f, g, stratify = g)
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)
knn.predict(y_test)
For some reason, even though I call reshape, the above code results in an error. This is the error:
ValueError Traceback (most recent call last)
<ipython-input-37-d24a7d3e9bd3> in <module>
12 knn = KNeighborsClassifier(n_neighbors = 5)
13 knn.fit(X_train, y_train)
---> 14 knn.predict(y_test)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py in predict(self, X)
171 Class labels for each data sample.
172 """
--> 173 X = check_array(X, accept_sparse='csr')
174
175 neigh_dist, neigh_ind = self.kneighbors(X)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
622 "Reshape your data either using array.reshape(-1, 1) if "
623 "your data has a single feature or array.reshape(1, -1) "
--> 624 "if it contains a single sample.".format(array))
625
626 # in the future np.flexible dtypes will be handled like object dtypes
ValueError: Expected 2D array, got 1D array instead:
array=[0 0 0 ... 0 0 0].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
y_test are the results you're trying to predict (i.e. classes). You need to predict from the available data, i.e. data you would have when trying to classify, which would be everything else except the classes: in your case that is X_test, so you need to change knn.predict(y_test) to knn.predict(X_test). You can then use y_test to compare your predictions and see how accurate they are.
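The end of the script would then look something like this (a minimal sketch; accuracy_score is just one possible metric):
from sklearn.metrics import accuracy_score

y_pred = knn.predict(X_test)           # predict from the held-out features
print(accuracy_score(y_test, y_pred))  # fraction of correct predictions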

Error in making train/test sets from iris data with sklearn.train_test_split()

I'm trying to use a simple command, train_test_split, on the iris dataset and SVM for prediction, but when I call fit as follows:
dat_iris = datasets.load_iris()
x1 = dat_iris.data[:,2]
y1 = dat_iris.target
x_train, y_train, x_test, y_test = train_test_split(x1, y1, test_size=0.3,
                                                    random_state=0)
svm_model = SVC(kernel='linear',C=1.0, random_state=0)
svm_model.fit(x_train,y_train)
y_pred = svm_model.predict(x_train)
but the following error appears:
ValueError Traceback (most recent call last)
<ipython-input-245-120527f222b3> in <module>()
7
8 svm_model = SVC(kernel='linear',C=1.0, random_state=0)
----> 9 svm_model.fit(x_train,y_train)
10 y_pred = svm_model.predict(x_train)
11 metrics.classification_report(y_pred, y_train)
~/anaconda3/lib/python3.6/site-packages/sklearn/svm/base.py in fit(self, X, y, sample_weight)
147 self._sparse = sparse and not callable(self.kernel)
148
--> 149 X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr')
150 y = self._validate_targets(y)
151
~/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
550 y = y.astype(np.float64)
551
--> 552 check_consistent_length(X, y)
553
554 return X, y
~/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
171 if len(uniques) > 1:
172 raise ValueError("Found input variables with inconsistent numbers of"
--> 173 " samples: %r" % [int(l) for l in lengths])
174
175
ValueError: Found input variables with inconsistent numbers of samples: [105, 45]
This may arise because of the sizes of the target or the inputs. How can I resolve this problem?
You mixed the order of the return arguments.
It should be:
X_train, X_test, y_train, y_test = train_test_split(x1, y1, test_size=0.3,
                                                    random_state=0)
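Note, too, that x1 = dat_iris.data[:,2] is a 1-D array, and SVC.fit expects a 2-D feature matrix, so the single feature has to be reshaped into a column as well. A hedged sketch of the corrected snippet:
x1 = dat_iris.data[:, 2].reshape(-1, 1)  # single feature as a 2-D column
X_train, X_test, y_train, y_test = train_test_split(x1, y1, test_size=0.3,
                                                    random_state=0)
svm_model = SVC(kernel='linear', C=1.0, random_state=0)
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_train)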
