ValueError: Input X contains NaN - machine-learning

I'm trying to classify my traffic using an SVM model, as shown below:
# Load dataset_sdn.csv into a DataFrame and attempt to zero-fill missing values.
import pandas as pd # used to load and process the dataset
import matplotlib.pyplot as plt
ds= pd.read_csv("dataset_sdn.csv") # read the dataset into a DataFrame named ds
# NOTE: fillna returns a new DataFrame by default; the result is not assigned
# here, so ds itself keeps whatever NaN values it had.
ds.fillna(0)
ds # display the DataFrame (notebook cell output)
ds output
# Pick the model inputs and the target from ds by column position.
X = ds.iloc[: , [4,5,6,7,8,9,10,11,12,13,14,17,18,19,20,21]] # input features: 16 columns selected by position
Y = ds.iloc[:, 22] # output (target): the column at position 22
print (X)
print (Y)
X output
Y output
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; random_state=0 makes the split repeatable.
X_Train, X_Test, Y_Train, Y_Test = train_test_split (X, Y, test_size=0.25, random_state=0)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
# Fit the scaler on the training split only, then reuse it on the test split.
X_Train = sc_X.fit_transform(X_Train)
X_Test = sc_X.transform(X_Test)
from sklearn.svm import SVC
classifier = SVC (kernel='linear', random_state=0)
# NOTE: StandardScaler keeps NaN values in its output, so any NaN still present
# in X reaches SVC.fit on the next line.
classifier.fit(X_Train, Y_Train)
Y_pred = classifier.predict(X_Test)
In this last step I get the following error message:
ValueError Traceback (most recent call
last) Input In [43], in <cell line: 3>()
1 from sklearn.svm import SVC
2 classifier = SVC (kernel='linear', random_state=0)
----> 3 classifier.fit(X_Train, Y_Train)
5 # The output predect
6 Y_pred = classifier.predict(X_Test)
File
~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm_base.py:173,
in BaseLibSVM.fit(self, X, y, sample_weight)
171 check_consistent_length(X, y)
172 else:
--> 173 X, y = self._validate_data(
174 X,
175 y,
176 dtype=np.float64,
177 order="C",
178 accept_sparse="csr",
179 accept_large_sparse=False,
180 )
182 y = self._validate_targets(y)
184 sample_weight = np.asarray(
185 [] if sample_weight is None else sample_weight, dtype=np.float64
186 )
File
~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:596,
in BaseEstimator._validate_data(self, X, y, reset,
validate_separately, **check_params)
594 y = check_array(y, input_name="y", **check_y_params)
595 else:
--> 596 X, y = check_X_y(X, y, **check_params)
597 out = X, y
599 if not no_val_X and check_params.get("ensure_2d", True):
File
~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py:1074,
in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order,
copy, force_all_finite, ensure_2d, allow_nd, multi_output,
ensure_min_samples, ensure_min_features, y_numeric, estimator) 1069
estimator_name = _check_estimator_name(estimator) 1070 raise
ValueError( 1071 f"{estimator_name} requires y to be
passed, but the target y is None" 1072 )
-> 1074 X = check_array( 1075 X, 1076 accept_sparse=accept_sparse, 1077
accept_large_sparse=accept_large_sparse, 1078 dtype=dtype,
1079 order=order, 1080 copy=copy, 1081
force_all_finite=force_all_finite, 1082 ensure_2d=ensure_2d,
1083 allow_nd=allow_nd, 1084
ensure_min_samples=ensure_min_samples, 1085
ensure_min_features=ensure_min_features, 1086
estimator=estimator, 1087 input_name="X", 1088 ) 1090 y =
_check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator) 1092 check_consistent_length(X, y)
File
~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py:899,
in check_array(array, accept_sparse, accept_large_sparse, dtype,
order, copy, force_all_finite, ensure_2d, allow_nd,
ensure_min_samples, ensure_min_features, estimator, input_name)
893 raise ValueError(
894 "Found array with dim %d. %s expected <= 2."
895 % (array.ndim, estimator_name)
896 )
898 if force_all_finite:
--> 899 _assert_all_finite(
900 array,
901 input_name=input_name,
902 estimator_name=estimator_name,
903 allow_nan=force_all_finite == "allow-nan",
904 )
906 if ensure_min_samples > 0:
907 n_samples = _num_samples(array)
File
~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py:146,
in _assert_all_finite(X, allow_nan, msg_dtype, estimator_name,
input_name)
124 if (
125 not allow_nan
126 and estimator_name (...)
130 # Improve the error message on how to handle missing values in
131 # scikit-learn.
132 msg_err += (
133 f"\n{estimator_name} does not accept missing values"
134 " encoded as NaN natively. For supervised learning, you might want" (...)
144 "#estimators-that-handle-nan-values"
145 )
--> 146 raise ValueError(msg_err)
148 # for object dtype data, we only check for NaNs (GH-13254)
149 elif X.dtype == np.dtype("object") and not allow_nan:
ValueError: Input X contains NaN. SVC does not accept missing values
encoded as NaN natively. For supervised learning, you might want to
consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor
which accept missing values encoded as NaNs natively. Alternatively,
it is possible to preprocess the data, for instance by using an
imputer transformer in a pipeline or drop samples with missing values.
See https://scikit-learn.org/stable/modules/impute.html You can find a
list of all estimators that handle NaN values at the following page:
https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
So, please, any advice on how to solve this error? As far as I can see, there aren't any NaN values in the dataset.

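ds.fillna(0) returns a new DataFrame and leaves ds unchanged, so you are not replacing the old DataFrame with the filled one and the NaN values are still there.
Assign the result back (or fill in place). Use this: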
ds = ds.fillna(0)
OR
ds.fillna(0, inplace=True)

Related

show the visualization of the dataset before and after classification

I am trying to create an ML model using an SVM on a dataset with 23 features; the output should be 0 or 1, i.e. two classes. My goal is to show a visualization of the data before and after the classification.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Load the dataset, zero-fill missing values, then split and scale the features.
ds= pd.read_csv("dataset_sdn.csv")
ds= ds.fillna(0) # the filled DataFrame is assigned back to ds
X = ds.iloc[: , [4,5,6,7,8,9,10,11,12,13,14,17,18,19,20,21]] # input features: 16 columns selected by position
Y = ds.iloc[:, 22] # output (target): the column at position 22
# Hold out 25% of the rows for testing; random_state=0 makes the split repeatable.
X_Train, X_Test, Y_Train, Y_Test = train_test_split (X, Y, test_size=0.25, random_state=0)
sc_X = StandardScaler()
# Fit the scaler on the training split only, then reuse it on the test split.
X_Train = sc_X.fit_transform(X_Train)
X_Test = sc_X.transform(X_Test)
Then I tried to visualize X and Y:
import matplotlib.pyplot as plt
# NOTE: X holds 16 feature columns while Y is a single column, so the two
# arguments do not contain the same number of values.
plt.scatter(X,Y)
plt.show()
But I got this error:
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Input In [11], in <cell line: 2>()
1 import matplotlib.pyplot as plt
----> 2 plt.scatter(X,Y)
3 plt.show()
File ~\AppData\Roaming\Python\Python310\site-packages\matplotlib\pyplot.py:2817, in scatter(x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, edgecolors, plotnonfinite, data, **kwargs) 2812 @_copy_docstring_and_deprecators(Axes.scatter) 2813 def scatter( 2814 x, y, s=None, c=None, marker=None, cmap=None, norm=None, 2815 vmin=None, vmax=None, alpha=None, linewidths=None, *, 2816 edgecolors=None, plotnonfinite=False, data=None,
**kwargs):
-> 2817 __ret = gca().scatter( 2818 x, y, s=s, c=c, marker=marker, cmap=cmap, norm=norm, 2819 vmin=vmin, vmax=vmax, alpha=alpha, linewidths=linewidths, 2820 edgecolors=edgecolors, plotnonfinite=plotnonfinite, 2821
**({"data": data} if data is not None else {}), **kwargs) 2822 sci(__ret) 2823 return __ret
File ~\AppData\Roaming\Python\Python310\site-packages\matplotlib\__init__.py:1414, in _preprocess_data.<locals>.inner(ax, data, *args, **kwargs) 1411 @functools.wraps(func) 1412 def inner(ax, *args, data=None,
**kwargs): 1413 if data is None:
-> 1414 return func(ax, *map(sanitize_sequence, args), **kwargs) 1416 bound = new_sig.bind(ax, *args, **kwargs) 1417 auto_label = (bound.arguments.get(label_namer) 1418 or bound.kwargs.get(label_namer))
File ~\AppData\Roaming\Python\Python310\site-packages\matplotlib\axes\_axes.py:4368, in Axes.scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, edgecolors, plotnonfinite, **kwargs) 4366 y = np.ma.ravel(y) 4367 if x.size != y.size:
-> 4368 raise ValueError("x and y must be the same size") 4370 if s is None: 4371 s = (20 if rcParams['_internal.classic_mode'] else 4372 rcParams['lines.markersize'] ** 2.0)
ValueError: x and y must be the same size
Then I started the classification:
# Train an RBF-kernel SVM on the scaled training split and predict the test split.
classifier = SVC (kernel='rbf', C=1, random_state=0,)
classifier.fit(X_Train, Y_Train)
Y_pred = classifier.predict(X_Test)
print (Y_pred) # show the predicted labels for the test split
Then I tried to use the code below, but it did not work:
from mlxtend.plotting import plot_decision_regions
# NOTE(review): Y_Test comes from train_test_split on a pandas column, so it is
# presumably still a pandas Series here; the mlxtend check shown in the
# traceback below only accepts NumPy arrays. Confirm the argument types.
plot_decision_regions(X=X_Test, y=Y_Test, clf=classifier, legend=1)
I get this error:
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Input In [10], in <cell line: 2>()
1 from mlxtend.plotting import plot_decision_regions
----> 2 plot_decision_regions(X=Y_Test, y=X_Test, clf=classifier,legend=2)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\mlxtend\plotting\decision_regions.py:148, in plot_decision_regions(X, y, clf, feature_index, filler_feature_values, filler_feature_ranges, ax, X_highlight, zoom_factor, legend, hide_spines, markers, colors, scatter_kwargs, contourf_kwargs, contour_kwargs, scatter_highlight_kwargs)
44 def plot_decision_regions(
45 X,
46 y, (...)
65 scatter_highlight_kwargs=None,
66 ):
67 """Plot decision regions of a classifier.
68
69 Please note that this functions assumes that class labels are (...)
145
146 """
--> 148 check_Xy(X, y, y_int=True) # Validate X and y arrays
149 dim = X.shape[1]
151 if ax is None:
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\mlxtend\utils\checking.py:16, in check_Xy(X, y, y_int)
12 def check_Xy(X, y, y_int=True):
13
14 # check types
15 if not isinstance(X, np.ndarray):
---> 16 raise ValueError("X must be a NumPy array. Found %s" % type(X))
17 if not isinstance(y, np.ndarray):
18 raise ValueError("y must be a NumPy array. Found %s" % type(y))
ValueError: X must be a NumPy array. Found <class 'pandas.core.series.Series'>
So, please advise me on how to show the visualization. Thanks in advance.
See here:
X = ds.iloc[: , [4,5,6,7,8,9,10,11,12,13,14,17,18,19,20,21]] # Input Features
Y = ds.iloc[:, 22] # OutPut
and then here
plt.scatter(X,Y)
Your X is multi-dimensional: it has 16 feature columns, whereas Y is a single column, so plt.scatter(X, Y) fails with "x and y must be the same size". A 2D scatter plot can only show two variables at a time.
So you have to plot each feature against Y separately, e.g.,
plt.scatter(X.iloc[:,0], Y)

Decision Tree - Exporting image via Graphviz error

I'm trying to build a Decision Tree using gridsearch and a pipeline, but I get an error when I try to export the image using graphviz. I looked online and couldn't find anything; one potential problem would've been if I didn't use the best_estimator_ instance, but I did in this case.
Everything works (getting accuracy and other metrics) except the exporting graph part.
# Grid-search a StandardScaler + DecisionTreeClassifier pipeline over criterion
# and max_depth, then try to export the result to a Graphviz .dot file.
# (The indentation of the function body was lost when this snippet was pasted.)
def TreeOpt(X, y):
# Hold out 20% of the rows; only the training split is used below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
std_scl = StandardScaler()
dec_tree = tree.DecisionTreeClassifier()
pipe = Pipeline(steps=[('std_slc', std_scl),
('dec_tree', dec_tree)])
criterion = ['gini', 'entropy']
max_depth = list(range(1,15))
# Parameters of a pipeline step are addressed as <step name>__<parameter name>.
parameters = dict(dec_tree__criterion=criterion,
dec_tree__max_depth=max_depth)
tree_gs = GridSearchCV(pipe, parameters)
tree_gs.fit(X_train, y_train)
# NOTE: best_estimator_ is the whole Pipeline, not the DecisionTreeClassifier
# step inside it; the NotFittedError in the traceback below is raised for this
# Pipeline argument.
export_graphviz(
tree_gs.best_estimator_,
out_file=("dec_tree.dot"),
feature_names=None,
class_names=None,
filled=True)
But I get
<ipython-input-2-bb91ec6ba0d9> in <module>
37 filled=True)
38
---> 39 DecTreeOptimizer(X = df.drop(['quality'], axis=1), y = df.quality)
40
<ipython-input-2-bb91ec6ba0d9> in DecTreeOptimizer(X, y)
30 print("Best score: " + str(tree_GS.best_score_))
31
---> 32 export_graphviz(
33 tree_GS.best_estimator_,
34 out_file=("dec_tree.dot"),
~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_export.py in export_graphviz(decision_tree, out_file, max_depth, feature_names, class_names, label, filled, leaves_parallel, impurity, node_ids, proportion, rotate, rounded, special_characters, precision)
767 """
768
--> 769 check_is_fitted(decision_tree)
770 own_file = False
771 return_string = False
~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py in check_is_fitted(estimator, attributes, msg, all_or_any)
1096
1097 if not attrs:
-> 1098 raise NotFittedError(msg % {'name': type(estimator).__name__})
1099
1100
NotFittedError: This Pipeline instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
After a long search, I finally found the answer here: "Plot best decision tree with pipeline and GridsearchCV".
The best_estimator_ attribute returns the whole fitted Pipeline rather than the decision tree itself, so I just had to pull the tree step out of it like this: best_estimator_[1] (and then I found a whole lot of other problems with my code, but that's part 2).
I will leave this here in case anyone else is going to need it. Cheers!

ValueError: X has 10 features, but DecisionTreeClassifier is expecting 11 features as input

I am a beginner; please, can someone tell me where I made a mistake in this code?
The dataset used is the Kaggle Titanic dataset.
The error shows up in the 9th cell; the rest of the cells run fine on their own.
In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
In [2]:
train_data.dtypes
In [3]:
train_data.isna().sum()
In [4]:
train_data = train_data.fillna(value = {'Age' :0, 'Embarked' :'u'})
In [5]:
train_data.isna().sum()
In [6]:
train_data.shape
In [7]:
test_data = test_data.fillna(value = {'Age' :0, 'Fare' :0})
In [8]:
test_data.shape
In [9]: (In this cell I have specified the features to be used, so why does it still say the classifier expects 11 features?)
# Build dummy-encoded feature matrices, fit a random forest and write a submission file.
y = train_data["Survived"]
features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Fare", "Embarked"]
# NOTE: get_dummies is applied to the train and test frames independently, so
# each result only gets columns for the categories present in that frame and
# the two column sets are not guaranteed to match.
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])
model = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)
# One row per test passenger: its id and the predicted Survived label.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")
Error traceback:
ValueError Traceback (most recent call last) <ipython-input-11-a7ceba9b896f> in <module>
7 model = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=1)
8 model.fit(X, y)
----> 9 predictions = model.predict(X_test)
10
11 output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
c:\python39\lib\site-packages\sklearn\ensemble\_forest.py in predict(self, X)
628 The predicted classes.
629 """
--> 630 proba = self.predict_proba(X)
631
632 if self.n_outputs_ == 1:
c:\python39\lib\site-packages\sklearn\ensemble\_forest.py in predict_proba(self, X)
672 check_is_fitted(self)
673 # Check data
--> 674 X = self._validate_X_predict(X)
675
676 # Assign chunk of trees to jobs
c:\python39\lib\site-packages\sklearn\ensemble\_forest.py in
_validate_X_predict(self, X)
420 check_is_fitted(self)
421
--> 422 return self.estimators_[0]._validate_X_predict(X, check_input=True)
423
424 @property
c:\python39\lib\site-packages\sklearn\tree\_classes.py in
_validate_X_predict(self, X, check_input)
405 """Validate the training data on predict (probabilities)."""
406 if check_input:
--> 407 X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr",
408 reset=False)
409 if issparse(X) and (X.indices.dtype != np.intc or
c:\python39\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
435
436 if check_params.get('ensure_2d', True):
--> 437 self._check_n_features(X, reset=reset)
438
439 return out
c:\python39\lib\site-packages\sklearn\base.py in
_check_n_features(self, X, reset)
363
364 if n_features != self.n_features_in_:
--> 365 raise ValueError(
366 f"X has {n_features} features, but {self.__class__.__name__} "
367 f"is expecting {self.n_features_in_} features as input.")
ValueError: X has 10 features, but DecisionTreeClassifier is expecting 11 features as input
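You don't have the same number of features in your train set and in your test set because you call pd.get_dummies() on the train set and on the test set separately, so each one only gets columns for the categories it actually contains. Here the train set ends up with one more column than the test set (11 versus 10): most likely, filling the missing Embarked values with 'u' in the train set only creates an extra category, and therefore an extra dummy column, that the test set does not have.
To solve this issue, the best way is to use OneHotEncoder from the sklearn.preprocessing module with the parameter handle_unknown="ignore", fitted on the train set and reused for the test set: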
from sklearn.preprocessing import OneHotEncoder
# Learn the categories from the training data only; with handle_unknown="ignore"
# a category seen only at transform time is encoded as all zeros instead of
# raising an error.
oneh = OneHotEncoder(handle_unknown="ignore")
# The model must be trained on the encoder's output (not on the get_dummies
# matrix), otherwise train and test still have different columns.
X = oneh.fit_transform(train_data[features])
# Reuse the fitted encoder so X_test has exactly the same columns as X.
X_test = oneh.transform(test_data[features])
# NOTE: as written, every column in `features` is one-hot encoded, numeric ones
# (Age, Fare, ...) included; restrict the encoder to the categorical columns
# (e.g. with a ColumnTransformer) if the numeric features should stay numeric.
Moreover, it is not a good idea to use a different preprocessing workflow for the train set and the test set (the two different fillna() calls in your case).

Simple classification using scikit-learn not working

This is the code that I used to solve a classification problem pertaining to credit card fraud detection:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# Load the credit-card data, split it, fit a 5-nearest-neighbours classifier and predict.
df = pd.read_csv(r'C:\Users\SVISHWANATH\Downloads\creditcard.csv')
f = df.drop(['Class'], axis = 1) # features: every column except Class
g = df.Class # target: the Class column
# NOTE: reshape returns a new array; the result is discarded, so g is unchanged.
g.values.reshape(-1,1)
# stratify=g keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(f, g, stratify = g)
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)
# NOTE: predict is given the labels (y_test) here rather than the features (X_test).
knn.predict(y_test)
For some reason, even though I call reshape, the above code results in an error. This is the error:
ValueError Traceback (most recent call last)
<ipython-input-37-d24a7d3e9bd3> in <module>
12 knn = KNeighborsClassifier(n_neighbors = 5)
13 knn.fit(X_train, y_train)
---> 14 knn.predict(y_test)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py in predict(self, X)
171 Class labels for each data sample.
172 """
--> 173 X = check_array(X, accept_sparse='csr')
174
175 neigh_dist, neigh_ind = self.kneighbors(X)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
622 "Reshape your data either using array.reshape(-1, 1) if "
623 "your data has a single feature or array.reshape(1, -1) "
--> 624 "if it contains a single sample.".format(array))
625
626 # in the future np.flexible dtypes will be handled like object dtypes
ValueError: Expected 2D array, got 1D array instead:
array=[0 0 0 ... 0 0 0].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
y_test are the results you're trying to predict (i.e. classes). You need to predict from the available data, i.e. data you would have when trying to classify, which would be everything else except the classes: in your case that is X_test, so you need to change knn.predict(y_test) to knn.predict(X_test). You can then use y_test to compare your predictions and see how accurate they are.

Error in making train - test sets from iris data by sklearn.train_test_split()

I'm trying to use a simple command, train_test_split, on the iris dataset and use an SVM for prediction, but when I call "fit" as follows:
# Split one iris feature and the target, then fit a linear SVM on the training part.
dat_iris = datasets.load_iris()
x1 = dat_iris.data[:,2] # a single feature column, selected as a 1-D array
y1 = dat_iris.target
# NOTE: for two input arrays train_test_split returns
# (x_train, x_test, y_train, y_test); the names below are bound in a different
# order, so "y_train" receives the test part of x1 and "x_test" the train part of y1.
x_train,y_train,x_test,y_test = train_test_split(x1, y1, test_size = 0.3,
random_state=0)
svm_model = SVC(kernel='linear',C=1.0, random_state=0)
svm_model.fit(x_train,y_train)
y_pred = svm_model.predict(x_train)
but the following error appears:
ValueError Traceback (most recent call last)
<ipython-input-245-120527f222b3> in <module>()
7
8 svm_model = SVC(kernel='linear',C=1.0, random_state=0)
----> 9 svm_model.fit(x_train,y_train)
10 y_pred = svm_model.predict(x_train)
11 metrics.classification_report(y_pred, y_train)
~/anaconda3/lib/python3.6/site-packages/sklearn/svm/base.py in fit(self, X, y, sample_weight)
147 self._sparse = sparse and not callable(self.kernel)
148
--> 149 X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr')
150 y = self._validate_targets(y)
151
~/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
550 y = y.astype(np.float64)
551
--> 552 check_consistent_length(X, y)
553
554 return X, y
~/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
171 if len(uniques) > 1:
172 raise ValueError("Found input variables with inconsistent numbers of"
--> 173 " samples: %r" % [int(l) for l in lengths])
174
175
ValueError: Found input variables with inconsistent numbers of samples: [105, 45]
This may arise because of the size of the target or inputs; how can I resolve this problem?
You mixed up the order of the returned values: train_test_split returns both splits of the first array, followed by both splits of the second.
It should be:
# train_test_split returns the train and test parts of each input in turn:
# (x1 train, x1 test, y1 train, y1 test). The names match the lower-case ones
# used by the rest of the question's code (svm_model.fit(x_train, y_train)).
x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size = 0.3,
                                                    random_state=0)

Resources