Getting error while trying to fit to GridSearchCV - machine-learning

I am trying to fit a ridge regression model to my data using a pipeline and GridSearchCV.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
X = transformed_data.iloc[:, :-1]
y = transformed_data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)
params = {}
params['ridge__alpha'] = np.arange(0, 100, 1).tolist()
t = [('labelenc',LabelEncoder() , [0]), ('stand', StandardScaler(), [1,2,3,4,5,6]), ('poly'),PolynomialFeatures(degree=2),[1,2,3,4,5,6] ]
transformer = ColumnTransformer(transformers=t)
pipe = Pipeline(steps=[('t', transformer), ('m',Ridge())])
#grid_ridge2_r2 = GridSearchCV(pipe, params, cv=10, scoring='r2', n_jobs=-1)
#results_ridge2_r2 = grid_ridge2_r2.fit(X_train,y_train)
grid_ridge2_rmse = GridSearchCV(pipe, params, cv=10, scoring='neg_root_mean_squared_error', n_jobs=-1)
results_ridge2_rmse = grid_ridge2_rmse.fit(X_train,y_train)
I keep getting
ValueError: too many values to unpack (expected 3)
on the last line, grid_ridge2_rmse.fit(X_train, y_train). My intuition is that something is wrong with how I am splitting the dataset.

There are a few errors in your pipeline.
The immediate cause of the ValueError is the third transformer entry: ('poly'),PolynomialFeatures(degree=2),[1,2,3,4,5,6] adds three separate items to the list instead of a single ('poly', PolynomialFeatures(degree=2), [1,2,3,4,5,6]) tuple, so ColumnTransformer cannot unpack each entry into (name, transformer, columns).
Next, LabelEncoder cannot be used inside a scikit-learn pipeline, as it is meant to transform y, not X. Assuming you want to encode a categorical feature, it should be replaced by OrdinalEncoder.
Finally, grid parameters have to follow the naming convention <step>__<hyperparameter>. Since the Ridge step in your pipeline is named m, the parameter should be m__alpha, not ridge__alpha.
The pipeline parameters can be seen using pipe.get_params().
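For example, a quick way to list every name the grid will accept (a sketch, output abbreviated):
# Print the tunable parameter names; the Ridge hyperparameters
# appear under the step name 'm', e.g. m__alpha.
for name in pipe.get_params():
    print(name)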
I would do as follows:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures, OrdinalEncoder, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
import numpy as np
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)
params = {'m__alpha' : np.arange(0, 100, 1).tolist()}
t = [
    ('labelenc', OrdinalEncoder(), [0]),
    ('stand', StandardScaler(), [1,2,3,4,5,6]),
    ('poly', PolynomialFeatures(degree=2), [1,2,3,4,5,6])
]
transformer = ColumnTransformer(transformers=t)
pipe = Pipeline(steps=[('t', transformer), ('m',Ridge())])
grid_ridge2_rmse = GridSearchCV(pipe, params, cv=10, scoring='neg_root_mean_squared_error', n_jobs=-1)
results_ridge2_rmse = grid_ridge2_rmse.fit(X_train,y_train)
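Once the search has run, the selected alpha and its cross-validated score can be read off the fitted search object; note the score is the negated RMSE because of the chosen scoring:
# Best hyperparameter combination found by the grid search
print(results_ridge2_rmse.best_params_)
# Mean cross-validated RMSE of the best model (undo the sign flip)
print(-results_ridge2_rmse.best_score_)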

Related

Kubeflow Pipeline Training Component Failing | Unknown return type: <class 'inspect._empty'>

I am running an ML pipeline and the training component/step (see code below) continues to fail with the following error: "RuntimeError: Unknown return type: <class 'inspect._empty'>. Must be one of str, int, float, a subclass of Artifact, or a NamedTuple collection of these types."
Any ideas on what might be causing the issue/error and how to resolve it?
Thank you!
@component(
    # this component builds an XGBoost regressor with xgboost
    packages_to_install=["google-cloud-bigquery", "xgboost", "pandas", "sklearn", "joblib", "pyarrow", "db_dtypes"],
    base_image="python:3.9",
    output_component_file="create_xgb_model_xgboost.yaml"
)
def build_xgb_xgboost(project: str,
                      bq_dataset: str,
                      test_view_name: str,
                      bq_location: str,
                      metrics: Output[Metrics],
                      model: Output[Model]
                      ):
    from google.cloud import bigquery
    import numpy as np
    import xgboost as xgb
    import pandas as pd
    from xgboost import XGBRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import mean_squared_error as MSE
    from sklearn.metrics import mean_absolute_error
    import joblib
    import pyarrow
    import db_dtypes

    client = bigquery.Client(project=project)
    view_uri = f"{project}.{bq_dataset}.{test_view_name}"  # replace view_name with test_view_name
    build_df_for_xgboost = '''
        SELECT * FROM `{view_uri}`
    '''.format(view_uri=view_uri)
    job_config = bigquery.QueryJobConfig()
    df_1 = client.query(build_df_for_xgboost).to_dataframe()
    # client.query(build_df_for_xgboost, job_config=job_config).to_dataframe()
    df = df_1.drop(['int64_field_0'], axis=1)

    def onehot_encode(df, column):
        df = df.copy()
        dummies = pd.get_dummies(df[column], prefix=column)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
        return df

    # Binary encoding
    df['preferred_foot'] = df['preferred_foot'].replace({'left': 0, 'right': 1})
    # One-hot encoding
    for column in ['attacking_work_rate', 'defensive_work_rate']:
        df = onehot_encode(df, column=column)
    # Split df into X and y
    y = df['overall_rating']
    X = df.drop('overall_rating', axis=1)
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    # Define the model
    bst = XGBRegressor(
        objective='reg:linear',
        learning_rate=0.1,
        alpha=0.001
    )
    # Fit the model
    bst.fit(X_train, y_train)
    # Predict on the test set
    y_pred = bst.predict(X_test)
    rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
    mae = mean_absolute_error(y_test, y_pred)
    metrics.log_metric("RMSE", rmse)
    metrics.log_metric("framework", "xgboost")
    metrics.log_metric("dataset_size", len(df))
    metrics.log_metric("MAE", mae)
    joblib.dump(bst, model.path + ".joblib")
I think this might just be a bug in the version of KFP v2 SDK code you're using.
I mostly use the stable KFPv1 methods to avoid problems.
from kfp.components import InputPath, OutputPath, create_component_from_func

def train_xgboost_model(
    project: str,
    bq_dataset: str,
    test_view_name: str,
    bq_location: str,
    metrics_path: OutputPath(Metrics),
    model_path: OutputPath(Model),
):
    import json
    from pathlib import Path

    metrics = {
        ...
    }
    Path(metrics_path).write_text(json.dumps(metrics))
    dump(bst, model_path)

train_xgboost_model_op = create_component_from_func(
    func=train_xgboost_model,
    packages_to_install=["google-cloud-bigquery", "xgboost", "pandas", "sklearn", "joblib", "pyarrow", "db_dtypes"],
    base_image="python:3.9",
    output_component_file="create_xgb_model_xgboost.yaml",
)
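For completeness, a hypothetical sketch of how the resulting op could be wired into and compiled as a KFP v1 pipeline (the pipeline name and output file are placeholders):
import kfp
from kfp import dsl

@dsl.pipeline(name="xgb-training-pipeline")
def xgb_pipeline(project: str, bq_dataset: str, test_view_name: str, bq_location: str):
    # Output paths (metrics_path, model_path) are supplied by the system at runtime.
    train_task = train_xgboost_model_op(
        project=project,
        bq_dataset=bq_dataset,
        test_view_name=test_view_name,
        bq_location=bq_location,
    )

kfp.compiler.Compiler().compile(xgb_pipeline, "pipeline.yaml")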
You can also find many examples of real-world components in this repo: https://github.com/Ark-kun/pipeline_components/tree/master/components
including an XGBoost trainer https://github.com/Ark-kun/pipeline_components/blob/d8c4cf5/components/XGBoost/Train/component.py
and a full XGBoost pipeline: https://github.com/Ark-kun/pipeline_components/blob/4f19be6f26eaaf85ba251110d10d103b17e54a17/samples/Google_Cloud_Vertex_AI/Train_tabular_regression_model_using_XGBoost_and_import_to_Vertex_AI/pipeline.py

ValueError when making predictions with LinearRegression

I have started learning ML.
This is my code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Import the dataset
dataset = pd.read_csv('Salary_Data.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 1].values
# Split the data set into Training Set and Test Set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test =\
train_test_split(X, Y, test_size=1/3, random_state=0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# Fitting Simple Linear Regression to Training Set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train , Y_train)
# Predicting the Test set Results
y_pred = regressor.predict(X_test)
I am getting the error:
ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a
minimum of 1 is required.
on the last line. How do I resolve this?
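The error means an array with zero rows reached the estimator, i.e. the split produced an empty set, which usually points at the CSV loading fewer rows than expected rather than at the regression itself. A quick diagnostic sketch (assuming the same file as above):
import pandas as pd

dataset = pd.read_csv('Salary_Data.csv')
print(dataset.shape)                 # should be (n_rows, 2) with n_rows > 1
print(X_train.shape, X_test.shape)   # neither split should have 0 rows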

How to predict the outcome of a new patient using this SVC ML model trained on a dataset

Data description:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
np.random.seed(0)
# Alternatively X and y can be obtained directly from the frame attribute:
X = df.drop('OUTCOME', axis=1)
y = df['OUTCOME']
numeric_features = ['Age', 'PCT', 'CURB 65', 'pO2']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
categorical_features = ['Sex', 'CXR', 'Hospitalisation in last 3 months', 'ICU>72hrs', 'Blood']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])
from sklearn import linear_model
from sklearn.svm import SVC
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', SVC())])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
The accuracy of the model is 1.0.
How do I enter a new patient's parameters to get a prediction for that patient? The new patient is not from this dataset.
I tried to enter the new patient's information as follows:
New_patient = [70,1,0,2,2,10,40,2,20,68,1,2,0,1,1]
clf.predict([New_patient])
output: ValueError: Specifying the columns using strings is only supported for pandas DataFrames
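The error message itself points at the fix: the ColumnTransformer selects columns by name, so predict needs a pandas DataFrame with the same column names as the training data, not a bare list. A minimal sketch (the value-to-column mapping is assumed for illustration):
import pandas as pd

# One-row DataFrame reusing the training columns so the
# ColumnTransformer can select features by name.
new_patient = pd.DataFrame([New_patient], columns=X.columns)
print(clf.predict(new_patient))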

label_binarize Does not fit for sklearn Naive Bayes classifier showing bad input shape

I was trying to create a ROC curve for a multiclass problem using Naive Bayes, but it keeps ending with
ValueError: bad input shape.
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.naive_bayes import BernoulliNB
from scipy import interp
# Import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Binarize the output
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]
# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
random_state=0)
# Learn to predict each class against the other
classifier = BernoulliNB(alpha=1.0, binarize=6, class_prior=None, fit_prior=True)
y_score = classifier.fit(X_train, y_train).predict(X_test)
raise ValueError("bad input shape {0}".format(shape))
ValueError: bad input shape (75, 6)
The error is caused by binarizing the y variable; the estimator can handle the original multiclass labels directly.
Remove the following lines:
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]
You are good to go!
To get the predicted probabilities for roc_curve, use the following:
classifier.fit(X_train, y_train)
y_score = classifier.predict_proba(X_test)
y_score.shape
# (75, 3)
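With those probabilities, the per-class ROC curves the question was after can be computed by binarizing only the test labels, a sketch:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

# Binarize y_test for scoring only; the classifier itself stays
# fit on the original multiclass target.
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
for i in range(y_test_bin.shape[1]):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    print(f"class {i}: AUC = {auc(fpr, tpr):.3f}")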

Random Forest on Categorical Data with low accuracy

I am trying to build a model that given an item, predicts which store it belongs to.
I have a data-set of ~300 records which are supposed to be items in different online stores.
Each record is composed of: Category,Sub Category,Price,Store Identifier(The y variable)
The data seems balanced as every store has around ~10 items.
With the help of @Marcus V. I succeeded in encoding the categorical columns correctly, but I cannot get better than 0.52 accuracy with a RandomForest using 15 estimators and the entropy criterion.
I feel like much more can be done here. What am I missing?
This is the data: https://pastebin.com/z3eZc0vK
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
class Columns(BaseEstimator, TransformerMixin):
    def __init__(self, names=None):
        self.names = names

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X):
        return X.loc[:, self.names]
dataset = pd.read_csv('data.csv', header=None)
dataset.columns = ["cat1", "cat2", "num1", "target"]
# dataset.columns = ["cat1", "cat2", "target"]
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, 3]
labelencoder_X_0 = LabelEncoder()
X.iloc[:, 0] = labelencoder_X_0.fit_transform(X.iloc[:, 0])
labelencoder_X_1 = LabelEncoder()
X.iloc[:, 1] = labelencoder_X_1.fit_transform(X.iloc[:, 1])
numeric = ["num1"]
categorical = ["cat1", "cat2"]
pipe = Pipeline([
    ("features", FeatureUnion([
        ('numeric', make_pipeline(Columns(names=numeric), StandardScaler())),
        ('categorical', make_pipeline(Columns(names=categorical), OneHotEncoder(sparse=False)))
    ])),
])
X = pipe.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
classifier = RandomForestClassifier(n_estimators=15, criterion='entropy', random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = classifier.score(X_test, y_test)
print(accuracy)
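One thing worth checking before tuning further: with only ~10 items per store, a single 80/20 split gives a noisy accuracy estimate. A hedged sketch using cross-validation on the transformed X from the snippet above:
from sklearn.model_selection import cross_val_score

# Average accuracy over 5 folds is a steadier estimate than one split;
# a larger forest is also a cheap thing to try.
scores = cross_val_score(
    RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0),
    X, y, cv=5)
print(scores.mean(), scores.std())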
