show the visualization of the dataset before and after classification - machine-learning

I am trying to create an ML model using SVM on a dataset with 23 features, where the output is 0 or 1, i.e. two classes. My goal is to show a visualization before and after the classification.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
ds= pd.read_csv("dataset_sdn.csv")
ds= ds.fillna(0)
X = ds.iloc[: , [4,5,6,7,8,9,10,11,12,13,14,17,18,19,20,21]] # Input Features
Y = ds.iloc[:, 22] # Output
X_Train, X_Test, Y_Train, Y_Test = train_test_split (X, Y, test_size=0.25, random_state=0)
sc_X = StandardScaler()
X_Train = sc_X.fit_transform(X_Train)
X_Test = sc_X.transform(X_Test)
Then I tried to visualize X and Y:
import matplotlib.pyplot as plt
plt.scatter(X,Y)
plt.show()
But I got an error:
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [11], in <cell line: 2>()
      1 import matplotlib.pyplot as plt
----> 2 plt.scatter(X,Y)
      3 plt.show()

File ~\AppData\Roaming\Python\Python310\site-packages\matplotlib\pyplot.py:2817, in scatter(x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, edgecolors, plotnonfinite, data, **kwargs)
-> 2817 __ret = gca().scatter(...)

File ~\AppData\Roaming\Python\Python310\site-packages\matplotlib\__init__.py:1414, in _preprocess_data.<locals>.inner(ax, data, *args, **kwargs)
-> 1414 return func(ax, *map(sanitize_sequence, args), **kwargs)

File ~\AppData\Roaming\Python\Python310\site-packages\matplotlib\axes\_axes.py:4368, in Axes.scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, edgecolors, plotnonfinite, **kwargs)
   4367 if x.size != y.size:
-> 4368     raise ValueError("x and y must be the same size")

ValueError: x and y must be the same size
Then I started the classification:
classifier = SVC(kernel='rbf', C=1, random_state=0)
classifier.fit(X_Train, Y_Train)
Y_pred = classifier.predict(X_Test)
print (Y_pred)
Then I tried to use the code below, but it did not work:
from mlxtend.plotting import plot_decision_regions
plot_decision_regions(X=X_Test, y=Y_Test, clf=classifier, legend=1)
I get this error
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [10], in <cell line: 2>()
      1 from mlxtend.plotting import plot_decision_regions
----> 2 plot_decision_regions(X=Y_Test, y=X_Test, clf=classifier, legend=2)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\mlxtend\plotting\decision_regions.py:148, in plot_decision_regions(X, y, clf, feature_index, filler_feature_values, filler_feature_ranges, ax, X_highlight, zoom_factor, legend, hide_spines, markers, colors, scatter_kwargs, contourf_kwargs, contour_kwargs, scatter_highlight_kwargs)
--> 148 check_Xy(X, y, y_int=True)  # Validate X and y arrays
    149 dim = X.shape[1]

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\mlxtend\utils\checking.py:16, in check_Xy(X, y, y_int)
     15 if not isinstance(X, np.ndarray):
---> 16     raise ValueError("X must be a NumPy array. Found %s" % type(X))

ValueError: X must be a NumPy array. Found <class 'pandas.core.series.Series'>
So, please advise me on how to show the visualization. Thanks in advance.

See here:
X = ds.iloc[: , [4,5,6,7,8,9,10,11,12,13,14,17,18,19,20,21]] # Input Features
Y = ds.iloc[:, 22] # Output
and then here
plt.scatter(X,Y)
Your X is multi-dimensional, with 16 feature columns, and hence the scatter plot is not working: you can only plot 2 variables in a 2D scatter plot.
So you have to plot each feature against Y separately, e.g.,
plt.scatter(X.iloc[:,0], Y)
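For example, a minimal sketch that extends this to all 16 selected features in one figure (the 4x4 grid and the styling are assumptions, not the original poster's code):
import matplotlib.pyplot as plt

fig, axes = plt.subplots(4, 4, figsize=(16, 16))  # 16 features -> 4x4 grid
for i, ax in enumerate(axes.ravel()):
    ax.scatter(X.iloc[:, i], Y, c=Y, cmap='coolwarm', s=5)  # color points by class label
    ax.set_xlabel(X.columns[i])
    ax.set_ylabel('class')
plt.tight_layout()
plt.show()
As for the plot_decision_regions error: note that the pasted traceback was produced with X and y swapped (X=Y_Test, y=X_Test). Even with the arguments in the right order, mlxtend expects NumPy arrays (with an integer y) and can only draw a decision surface over two feature dimensions. A sketch under those assumptions, retraining the classifier on just the first two scaled features for illustration:
from mlxtend.plotting import plot_decision_regions
from sklearn.svm import SVC

clf2 = SVC(kernel='rbf', C=1, random_state=0)
clf2.fit(X_Train[:, [0, 1]], Y_Train)  # X_Train is already a NumPy array after scaling
plot_decision_regions(X=X_Test[:, [0, 1]],
                      y=Y_Test.to_numpy().astype(int),  # mlxtend wants an integer NumPy array
                      clf=clf2, legend=2)
plt.show()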

Related

ValueError: Input X contains NaN

I'm trying to classify my traffic using an SVM, as below:
import pandas as pd # for process the DataSet
import matplotlib.pyplot as plt
ds= pd.read_csv("dataset_sdn.csv") # to read the dataset with name (ds)
ds.fillna(0)
ds
ds output
X = ds.iloc[: , [4,5,6,7,8,9,10,11,12,13,14,17,18,19,20,21]] # Input Features
Y = ds.iloc[:, 22] # Output
print (X)
print (Y)
X output
Y output
from sklearn.model_selection import train_test_split
X_Train, X_Test, Y_Train, Y_Test = train_test_split (X, Y, test_size=0.25, random_state=0)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_Train = sc_X.fit_transform(X_Train)
X_Test = sc_X.transform(X_Test)
from sklearn.svm import SVC
classifier = SVC (kernel='linear', random_state=0)
classifier.fit(X_Train, Y_Train)
Y_pred = classifier.predict(X_Test)
Here, in this last step, I get the error message:
ValueError                                Traceback (most recent call last)
Input In [43], in <cell line: 3>()
      1 from sklearn.svm import SVC
      2 classifier = SVC (kernel='linear', random_state=0)
----> 3 classifier.fit(X_Train, Y_Train)
      5 # The output predect
      6 Y_pred = classifier.predict(X_Test)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:173, in BaseLibSVM.fit(self, X, y, sample_weight)
    171 check_consistent_length(X, y)
    172 else:
--> 173     X, y = self._validate_data(X, y, dtype=np.float64, order="C",
                                       accept_sparse="csr", accept_large_sparse=False)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:596, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
--> 596 X, y = check_X_y(X, y, **check_params)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py:1074, in check_X_y(X, y, ...)
-> 1074 X = check_array(X, ...)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py:899, in check_array(array, ...)
--> 899 _assert_all_finite(array, ...)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py:146, in _assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name)
--> 146 raise ValueError(msg_err)

ValueError: Input X contains NaN. SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
So, please, any advice on how to solve this error? As far as I can tell there isn't any NaN value in the dataset.
You are not replacing the old dataframe with the new one: fillna returns a new dataframe instead of modifying ds in place.
Use this:
ds = ds.fillna(0)
OR
ds.fillna(0, inplace=True)
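A quick check that the replacement actually took effect (a small sketch using the same ds):
ds = ds.fillna(0)              # fillna returns a new dataframe; it must be reassigned
print(ds.isna().sum().sum())   # prints 0 once no NaN values remain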

ValueError: X has 10 features, but DecisionTreeClassifier is expecting 11 features as input

I am a beginner, so please can someone tell me where I made a mistake in this code?
The data set used is the Kaggle Titanic data set.
The error shows up in the 9th cell; the rest run fine on their own.
In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
In [2]:
train_data.dtypes
In [3]:
train_data.isna().sum()
In [4]:
train_data = train_data.fillna(value = {'Age' :0, 'Embarked' :'u'})
In [5]:
train_data.isna().sum()
In [6]:
train_data.shape
In [7]:
test_data = test_data.fillna(value = {'Age' :0, 'Fare' :0})
In [8]:
test_data.shape
In [9]: # I have specified the features to be used in this cell, so why does it say the classifier expects 11 features?
y = train_data["Survived"]
features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Fare", "Embarked"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])
model = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")
error trace back
ValueError                                Traceback (most recent call last)
<ipython-input-11-a7ceba9b896f> in <module>
      7 model = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=1)
      8 model.fit(X, y)
----> 9 predictions = model.predict(X_test)
     10
     11 output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})

c:\python39\lib\site-packages\sklearn\ensemble\_forest.py in predict(self, X)
--> 630     proba = self.predict_proba(X)

c:\python39\lib\site-packages\sklearn\ensemble\_forest.py in predict_proba(self, X)
--> 674     X = self._validate_X_predict(X)

c:\python39\lib\site-packages\sklearn\ensemble\_forest.py in _validate_X_predict(self, X)
--> 422     return self.estimators_[0]._validate_X_predict(X, check_input=True)

c:\python39\lib\site-packages\sklearn\tree\_classes.py in _validate_X_predict(self, X, check_input)
--> 407     X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False)

c:\python39\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
--> 437     self._check_n_features(X, reset=reset)

c:\python39\lib\site-packages\sklearn\base.py in _check_n_features(self, X, reset)
    364 if n_features != self.n_features_in_:
--> 365     raise ValueError(
    366         f"X has {n_features} features, but {self.__class__.__name__} "
    367         f"is expecting {self.n_features_in_} features as input.")

ValueError: X has 10 features, but DecisionTreeClassifier is expecting 11 features as input
You don't have the same number of features in your train set and your test set because you call pd.get_dummies() on the train set and on the test set separately: your test set contains a value that is not in your train set, so the two calls produce different dummy columns.
To solve this issue, the best way is to use OneHotEncoder from the sklearn.preprocessing module with the parameter handle_unknown="ignore":
from sklearn.preprocessing import OneHotEncoder
oneh = OneHotEncoder(handle_unknown="ignore")
oneh.fit(train_data[features])
X = oneh.transform(train_data[features])      # use the same fitted encoder for both sets
X_test = oneh.transform(test_data[features])
Moreover, it is not a good idea to have a different preprocessing workflow for the train set and the test set (your fillna() values, in this case); a pipeline keeps the two consistent, as sketched below.
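A minimal sketch of that pipeline approach, assuming the features list and dataframes from the question (the split into categorical and numeric columns is an assumption about the Titanic data):
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

categorical = ["Sex", "Embarked"]  # assumed to be the only non-numeric columns
preprocess = ColumnTransformer(
    [("onehot", OneHotEncoder(handle_unknown="ignore"), categorical)],
    remainder="passthrough",       # numeric columns pass through unchanged
)
model = Pipeline([
    ("prep", preprocess),
    ("forest", RandomForestClassifier(n_estimators=100, max_depth=8, random_state=1)),
])
model.fit(train_data[features], y)               # the encoder is fitted on the train set only
predictions = model.predict(test_data[features]) # and reused, unchanged, on the test set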

Simple classification using scikit-learn not working

This is the code that I used to solve a classification problem pertaining to credit card fraud detection:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
df = pd.read_csv(r'C:\Users\SVISHWANATH\Downloads\creditcard.csv')
f = df.drop(['Class'], axis = 1)
g = df.Class
g.values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(f, g, stratify = g)
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)
knn.predict(y_test)
For some reason, even though I call reshape, the above code results in an error. This is the error:
ValueError Traceback (most recent call last)
<ipython-input-37-d24a7d3e9bd3> in <module>
12 knn = KNeighborsClassifier(n_neighbors = 5)
13 knn.fit(X_train, y_train)
---> 14 knn.predict(y_test)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py in predict(self, X)
171 Class labels for each data sample.
172 """
--> 173 X = check_array(X, accept_sparse='csr')
174
175 neigh_dist, neigh_ind = self.kneighbors(X)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
622 "Reshape your data either using array.reshape(-1, 1) if "
623 "your data has a single feature or array.reshape(1, -1) "
--> 624 "if it contains a single sample.".format(array))
625
626 # in the future np.flexible dtypes will be handled like object dtypes
ValueError: Expected 2D array, got 1D array instead:
array=[0 0 0 ... 0 0 0].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
y_test are the results you're trying to predict (i.e. classes). You need to predict from the available data, i.e. data you would have when trying to classify, which would be everything else except the classes: in your case that is X_test, so you need to change knn.predict(y_test) to knn.predict(X_test). You can then use y_test to compare your predictions and see how accurate they are.
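A short sketch of that fix, with accuracy_score as one possible way to compare (the metric choice is an assumption):
from sklearn.metrics import accuracy_score

predictions = knn.predict(X_test)           # predict from the held-out features
print(accuracy_score(y_test, predictions))  # fraction of correct predictions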

How to use sklearn.inspection.permutation_importance for clustering algorithm

import numpy as np
from sklearn.datasets import make_classification
from sklearn.cluster import KMeans
from sklearn.inspection import permutation_importance  # needed for the call below

X, y = make_classification(n_samples=1000,
                           n_features=4,
                           n_informative=3,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)
km = KMeans(n_clusters=3).fit(X)
result = permutation_importance(km, X, y, scoring='homogeneity_score',
                                n_repeats=10, random_state=0, n_jobs=-1)
result
In the real problem I don't have y (true labels), so I tried y=None to make it unsupervised learning. But it does not work. I got:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-72-81045ae9cb66> in <module>()
----> 1 result = permutation_importance(km, X, y=None, scoring='homogeneity_score', n_repeats=10, random_state=0, n_jobs=-1)
5 frames
/usr/local/lib/python3.6/dist-packages/sklearn/metrics/cluster/_supervised.py in check_clusterings(labels_true, labels_pred)
53 if labels_true.ndim != 1:
54 raise ValueError(
---> 55 "labels_true must be 1D: shape is %r" % (labels_true.shape,))
56 if labels_pred.ndim != 1:
57 raise ValueError(
ValueError: labels_true must be 1D: shape is ()
Does anyone know how to implement this without true labels?
First of all, it is trivial to prove that k-means is invariant to a permutation of the features, because the sum it minimizes is permutation invariant.
If you still want to experiment, try using an array of 0s as y, maybe? A sketch follows below.
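A sketch of that idea: pass an array of 0s as a dummy y and score the clustering itself with a custom scorer that ignores y, so no true labels are needed (the silhouette-based scorer is an assumption, not part of the question):
import numpy as np
from sklearn.inspection import permutation_importance
from sklearn.metrics import silhouette_score

def silhouette_scorer(estimator, X, y=None):
    # Score the clustering itself; the dummy y is never used.
    return silhouette_score(X, estimator.predict(X))

dummy_y = np.zeros(len(X))  # placeholder so permutation_importance's signature is satisfied
result = permutation_importance(km, X, dummy_y, scoring=silhouette_scorer,
                                n_repeats=10, random_state=0)
print(result.importances_mean)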

index 400 is out of bounds for axis 1 with size 368

I'm trying to build a face recognition model using a CNN. I have an image dataset that consists of 368 classes, and each class contains 15 images. I'm trying to use keras.utils.to_categorical on the image labels (the class names, which are numbers from 1-368) to set a specific label to 1 while setting the others to zero during training. Here is what I have done so far:
import cv2
import numpy as np
import os
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from pathlib import Path
from sklearn.model_selection import train_test_split
SEED = 44000
data = []
label = []
BASE_FOLDER = r'C:\Users\Desktop\images\\'
folders = os.listdir(BASE_FOLDER)
for folder in folders:
    for file in os.listdir(BASE_FOLDER + folder + '//'):
        img = cv2.imread(BASE_FOLDER + folder + '//' + file)
        data.append(img)
        label.append(folder)
train_data, test_data, train_label, test_label = train_test_split(data, label, test_size=0.1, random_state=SEED)
train_data = np.array(train_data, dtype=np.float32)
test_data = np.array(test_data, dtype=np.float32)
train_data = train_data / 180 # to make the array values between 0-1. image size is 180 X 180
test_data = test_data / 180
train_label = list(map(int, train_label))
train_label = keras.utils.to_categorical(train_label, 368)
but I am getting this error:
IndexError Traceback (most recent call last)
<ipython-input-69-f087078ef22a> in <module>
----> 1 train_label = keras.utils.to_categorical(train_label, 368)
~\Miniconda3\envs\tf_gpu\lib\site-packages\keras\utils\np_utils.py in to_categorical(y, num_classes, dtype)
32 n = y.shape[0]
33 categorical = np.zeros((n, num_classes), dtype=dtype)
---> 34 categorical[np.arange(n), y] = 1
35 output_shape = input_shape + (num_classes,)
36 categorical = np.reshape(categorical, output_shape)
IndexError: index 400 is out of bounds for axis 1 with size 368
What is the meaning of this error, and how do I solve it?
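The traceback shows why: to_categorical builds an (n, num_classes) zero matrix and uses each label directly as a column index (categorical[np.arange(n), y] = 1), so every label must lie in 0..367, yet at least one folder name maps to 400. One possible fix is to remap the raw folder names onto contiguous 0-based indices first, e.g. with LabelEncoder (this encoder step is an assumption, not code from the question):
from sklearn.preprocessing import LabelEncoder
import keras

encoder = LabelEncoder()
train_label_idx = encoder.fit_transform(train_label)  # arbitrary ints -> 0..367
train_label = keras.utils.to_categorical(train_label_idx, 368)
# the test labels would need encoder.transform(...) so both splits share one mapping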
