Kubeflow Pipeline Training Component Failing | Unknown return type: <class 'inspect._empty'> - kubeflow

I am running an ML pipeline and the training component/step (see code below) continues to fail with the following error: "RuntimeError: Unknown return type: <class 'inspect._empty'>. Must be one of str, int, float, a subclass of Artifact, or a NamedTuple collection of these types."
Any ideas on what might be causing the issue/error and how to resolve it?
Thank you!
RE
@component(
    # This component trains an XGBoost regressor and logs its hold-out metrics.
    packages_to_install=["google-cloud-bigquery", "xgboost", "pandas", "scikit-learn", "joblib", "pyarrow", "db_dtypes"],
    base_image="python:3.9",
    output_component_file="create_xgb_model_xgboost.yaml"
)
def build_xgb_xgboost(project: str,
                      bq_dataset: str,
                      test_view_name: str,
                      bq_location: str,
                      metrics: Output[Metrics],
                      model: Output[Model]
                      ):
    """Train an XGBoost regressor on a BigQuery view and export it.

    Reads every row of ``{project}.{bq_dataset}.{test_view_name}``, encodes the
    categorical columns, fits an ``XGBRegressor`` on a 70/30 split that predicts
    ``overall_rating``, logs RMSE/MAE to ``metrics`` and dumps the fitted model
    with joblib to ``model.path + ".joblib"``.

    Args:
        project: Project used for the BigQuery client and the view URI.
        bq_dataset: Dataset that contains the view.
        test_view_name: Name of the view to read.
        bq_location: Accepted for interface compatibility; not used by the body.
        metrics: Output artifact that receives the logged metrics.
        model: Output artifact whose path prefixes the saved model file.
    """
    # Imports live inside the function because the component body is shipped
    # and executed on its own inside the container image.
    from google.cloud import bigquery
    import numpy as np
    import pandas as pd
    from joblib import dump
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from xgboost import XGBRegressor
    # Presumably required by `to_dataframe()`; importing them here makes a
    # missing package fail early -- TODO confirm they are still needed.
    import pyarrow  # noqa: F401
    import db_dtypes  # noqa: F401

    client = bigquery.Client(project=project)
    view_uri = f"{project}.{bq_dataset}.{test_view_name}"
    query = f"SELECT * FROM `{view_uri}`"
    raw_df = client.query(query).to_dataframe()
    df = raw_df.drop(['int64_field_0'], axis=1)

    def onehot_encode(df, column):
        """Return a copy of ``df`` with ``column`` replaced by dummy columns."""
        df = df.copy()
        dummies = pd.get_dummies(df[column], prefix=column)
        df = pd.concat([df, dummies], axis=1)
        return df.drop(column, axis=1)

    # Binary encoding
    df['preferred_foot'] = df['preferred_foot'].replace({'left': 0, 'right': 1})

    # One-hot encoding
    for column in ['attacking_work_rate', 'defensive_work_rate']:
        df = onehot_encode(df, column=column)

    # Split df into X and y
    y = df['overall_rating']
    X = df.drop('overall_rating', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, shuffle=True, random_state=1)

    # Scale X; the scaler is fitted on the training rows only.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    # Numeric hyperparameters (the original passed them as strings);
    # 'reg:squarederror' replaces the old 'reg:linear' objective name and
    # `reg_alpha` is the scikit-learn wrapper's name for `alpha`.
    bst = XGBRegressor(
        objective='reg:squarederror',
        learning_rate=0.1,
        reg_alpha=0.001,
    )
    bst.fit(X_train, y_train)

    # Evaluate on the hold-out rows; cast to plain floats before logging.
    y_pred = bst.predict(X_test)
    rmse = float(np.sqrt(np.mean((y_test - y_pred) ** 2)))
    mae = float(mean_absolute_error(y_test, y_pred))

    metrics.log_metric("RMSE", rmse)
    metrics.log_metric("framework", "xgboost")
    metrics.log_metric("dataset_size", len(df))
    metrics.log_metric("MAE", mae)

    dump(bst, model.path + ".joblib")

I think this might just be a bug in the version of KFP v2 SDK code you're using.
I mostly use the stable KFPv1 methods to avoid problems.
from kfp.components import InputPath, OutputPath, create_component_from_func


def train_xgboost_model(
    project: str,
    bq_dataset: str,
    test_view_name: str,
    bq_location: str,
    metrics_path: OutputPath(Metrics),
    model_path: OutputPath(Model),
):
    """KFP v1-style variant of the training step (sketch).

    Outputs are declared as ``OutputPath`` parameters: the function receives
    plain file paths and writes the metrics JSON and the model file itself.

    NOTE(review): the data loading / training code that defines ``bst`` and
    ``dump``, and the contents of ``metrics``, are elided in this sketch.
    ``Metrics`` and ``Model`` are also not imported here -- presumably they
    come from the same place as in the question's code; confirm.
    """
    import json
    from pathlib import Path

    # Placeholder: fill in the metric names and values to report.
    metrics = {
        ...
    }
    Path(metrics_path).write_text(json.dumps(metrics))
    dump(bst, model_path)


train_xgboost_model_op = create_component_from_func(
    func=train_xgboost_model,
    # "scikit-learn" is the installable package name; the "sklearn" alias on
    # PyPI is deprecated.
    packages_to_install=["google-cloud-bigquery", "xgboost", "pandas", "scikit-learn", "joblib", "pyarrow", "db_dtypes"],
    base_image="python:3.9",
    output_component_file="create_xgb_model_xgboost.yaml",
)
You can also find many examples of real-world components in this repo: https://github.com/Ark-kun/pipeline_components/tree/master/components
including an XGBoost trainer https://github.com/Ark-kun/pipeline_components/blob/d8c4cf5/components/XGBoost/Train/component.py
and a full XGBoost pipeline: https://github.com/Ark-kun/pipeline_components/blob/4f19be6f26eaaf85ba251110d10d103b17e54a17/samples/Google_Cloud_Vertex_AI/Train_tabular_regression_model_using_XGBoost_and_import_to_Vertex_AI/pipeline.py

Related

Node: 'Cast_1' Cast string to float is not supported [[{{node Cast_1}}]] [Op:__inference_train_function_24202] for Roberta

I am getting this problem called "
Node: 'Cast_1'
Cast string to float is not supported
[[{{node Cast_1}}]] [Op:__inference_train_function_24202]
"
enter image description here
I wrote code for IMDB sentiment analysis on a dataset of 5,000 reviews in Google Colab:
# Script: fine-tune a pretrained sequence classifier on an IMDB review sheet.
# Import the necessary libraries.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# Load the data (columns used below: 'Review' and 'Sentiment').
data = pd.read_excel('/content/drive/MyDrive/499A_Project/Dataset/IMDB5000.xlsx')
# Split the dataset into train (70%) and test (30%).
train_data, test_data = train_test_split(data, test_size = 0.3, random_state = 42)
# Preprocess the data.
# Tokenize the text; the tokenizer is fitted on the full dataset, test rows included.
# NOTE(review): this is the Keras word-level Tokenizer, not the tokenizer that
# belongs to the 'roberta-base' checkpoint loaded below -- the ids it produces
# presumably do not match the model's vocabulary; confirm.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['Review'].values)
# Convert the text into integer sequences.
train_sequences = tokenizer.texts_to_sequences(train_data['Review'].values)
test_sequences = tokenizer.texts_to_sequences(test_data['Review'].values)
# Pad every sequence to the longest review, measured in whitespace-separated words.
max_length = max([len(s.split()) for s in data['Review']])
train_padded = pad_sequences(train_sequences, maxlen = max_length)
test_padded = pad_sequences(test_sequences, maxlen = max_length)
# Prepare the labels.
# NOTE(review): the 'Sentiment' values are passed through unchanged. If that
# column holds strings, it would presumably explain the reported
# "Cast string to float is not supported" error -- confirm the column dtype.
train_labels = train_data['Sentiment'].values
test_labels = test_data['Sentiment'].values
# Import the model class from transformers.
# NOTE(review): the comments say RoBERTa, but the class imported is the BERT
# one, while the checkpoint name passed to it is 'roberta-base'.
from transformers import TFBertForSequenceClassification
# Instantiate the model from the 'roberta-base' checkpoint.
model = TFBertForSequenceClassification.from_pretrained('roberta-base')
# Compile the model.
model.compile(loss = 'sparse_categorical_crossentropy',
optimizer = 'adam',
metrics = ['accuracy'])
# Train the model, validating on the test split.
model.fit(train_padded, train_labels,
batch_size = 32,
epochs = 10,
validation_data = (test_padded, test_labels))
This is the code I wrote for my dataset, but it does not work and shows the error above.

Getting error while trying to fit to GridSearchCV

I am trying to fit a ridge regression model to my data using a pipeline and GridSearchCV.
# Script: grid-search the Ridge `alpha` over a ColumnTransformer + Ridge pipeline.
# NOTE(review): LabelEncoder, StandardScaler, Ridge, GridSearchCV,
# train_test_split, np and transformed_data are not defined in this snippet --
# presumably they are imported/created earlier; confirm.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
# Features are every column except the last; the target is the 'class' column.
X = transformed_data.iloc[:, :-1]
y = transformed_data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)
params = {}
# NOTE(review): the prefix 'ridge' does not match any step name of the Pipeline
# built below (its steps are named 't' and 'm').
params['ridge__alpha'] = np.arange(0, 100, 1).tolist()
# NOTE(review): the third entry is malformed. `('poly')` is just the string
# 'poly', so this list has five elements ('poly', the PolynomialFeatures
# instance and the column list are separate items) instead of three
# (name, transformer, columns) tuples.
t = [('labelenc',LabelEncoder() , [0]), ('stand', StandardScaler(), [1,2,3,4,5,6]), ('poly'),PolynomialFeatures(degree=2),[1,2,3,4,5,6] ]
transformer = ColumnTransformer(transformers=t)
pipe = Pipeline(steps=[('t', transformer), ('m',Ridge())])
#grid_ridge2_r2 = GridSearchCV(pipe, params, cv=10, scoring='r2', n_jobs=-1)
#results_ridge2_r2 = grid_ridge2_r2.fit(X_train,y_train)
# 10-fold grid search scored by negative RMSE, using all available cores.
grid_ridge2_rmse = GridSearchCV(pipe, params, cv=10, scoring='neg_root_mean_squared_error', n_jobs=-1)
results_ridge2_rmse = grid_ridge2_rmse.fit(X_train,y_train)
I keep getting
ValueError: too many values to unpack (expected 3)
in the last line grid_ridge2_rmse.fit(X_train,y_train). My intuition is that there is something wrong with how I am splitting the dataset.
There are a few errors in your pipeline.
First LabelEncoder cannot be used inside a scikit-learn pipeline as it is used to modify y not X. Assuming that you want to encode a categorical value of your feature it should be replaced by OrdinalEncoder.
Then, a grid parameter has to be named following the convention <step>__<hyperparameter>. In your case the step holding Ridge is called m, so the ridge parameter should be set as m__alpha.
The pipeline parameters can be seen using pipe.get_params().
I would do as follows:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures, OrdinalEncoder, StandardScaler
from sklearn.linear_model import Ridge
# train_test_split is used below, so it is imported together with GridSearchCV.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
import numpy as np

# X and y are the feature frame and target defined in the question.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

# Grid keys follow <step>__<hyperparameter>; the Ridge step is named 'm'.
params = {'m__alpha' : np.arange(0, 100, 1).tolist()}

# Every entry is a (name, transformer, columns) tuple.
t = [
    ('labelenc',OrdinalEncoder() , [0]),
    ('stand', StandardScaler(), [1,2,3,4,5,6]),
    ('poly', PolynomialFeatures(degree=2), [1,2,3,4,5,6])
]

transformer = ColumnTransformer(transformers=t)
pipe = Pipeline(steps=[('t', transformer), ('m',Ridge())])

# 10-fold grid search scored by negative RMSE, using all available cores.
grid_ridge2_rmse = GridSearchCV(pipe, params, cv=10, scoring='neg_root_mean_squared_error', n_jobs=-1)
results_ridge2_rmse = grid_ridge2_rmse.fit(X_train,y_train)

Error in Machine Learning model into Flask Web Applications

I have created a machine learning model for heart disease prediction and now I want to deploy it in my web application using Flask. The dataset has been acquired from Kaggle. Whenever I run the application, it fails with:
C:\Users\Surface\Desktop\Flask_app>python app.py File "app.py", line 42
x_data = request.form['x_data']
^
IndentationError: unindent does not match any outer indentation level
Can anyone guide me Thankyou :)
# Flask app that trains a random forest on heart.csv and renders a prediction.
# NOTE(review): the indentation of this listing has been lost (function bodies
# appear flush-left), so the nesting described in the notes below is not
# recoverable from the text alone.
from flask import Flask,render_template,url_for,request
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# NOTE(review): `sklearn.externals.joblib` is presumably unavailable in recent
# scikit-learn releases (standalone `joblib` replaces it) -- confirm the version.
from sklearn.externals import joblib
app = Flask(__name__)
# NOTE(review): the next line is presumably the decorator `@app.route('/')`
# with '@' rendered as '#'; as written it is only a comment.
#app.route('/')
def home():
return render_template('home.html')
# NOTE(review): presumably `@app.route(...)` as well; also note that the
# keyword is spelled `method` here -- verify against Flask's `methods` argument.
#app.route('/predict',method=['POST'])
def predict():
# The CSV is re-read and the model re-trained on every call of predict().
df = pd.read_csv("heart.csv")
df = df.drop(columns = ['cp', 'thal', 'slope'])
#features and labels
y = df.target.values
x_data = df.drop(['target'], axis = 1)
#EXTRACT Features
# Min-max normalisation of the feature frame.
x = (x_data - np.min(x_data)) / (np.max(x_data) - np.min(x_data)).values
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2,random_state=0)
# Random Forest Classification
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 1000, random_state = 1)
# NOTE(review): the training and test matrices are transposed (`.T`) before
# fit/score, which swaps samples and features -- confirm this is intended.
rf.fit(x_train.T, y_train.T)
print("Random Forest Algorithm Accuracy Score : {:.2f}%".format(rf.score(x_test.T,y_test.T)*100))
#persist model in a standard format
from sklearn.externals import joblib
joblib.dump(rf, 'HAP_model.pkl')
# The file handle opened here is never closed in this listing.
HAP_model = open('HAP_model.pkl','rb')
rf = joblib.load(HAP_model)
if request.method=='POST':
# NOTE(review): the submitted form value is stored in `x_data` but not used
# afterwards; `data` below is built from the training frame instead.
x_data = request.form['x_data']
data = [df.drop(['target'], axis = 1)]
# NOTE(review): verify that RandomForestClassifier provides `transform` in the
# installed scikit-learn version.
vect = rf.transform(data).toarray()
my_prediction = rf.predict(vect)
return render_template('result.html',prediction = my_prediction)
if __name__ == '__main__':
app.run(debug=True)

Random Forest on Categorical Data with low accuracy

I am trying to build a model that given an item, predicts which store it belongs to.
I have a data-set of ~300 records which are supposed to be items in different online stores.
Each record is composed of: Category,Sub Category,Price,Store Identifier(The y variable)
The data seems balanced as every store has around ~10 items.
With the help of @Marcus V. I succeeded in encoding the categorical columns correctly, but I cannot get results better than 0.52 from a RandomForest with 15 estimators and the entropy criterion.
I feel like much more can be done here. What am I missing?
This is the data: https://pastebin.com/z3eZc0vK
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
class Columns(BaseEstimator, TransformerMixin):
    """Pipeline step that selects a fixed set of columns by label.

    Args:
        names: Column label(s) handed to ``X.loc[:, names]`` in ``transform``.
    """

    def __init__(self, names=None):
        self.names = names

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return the ``self.names`` columns of ``X`` (label-based ``.loc``)."""
        return X.loc[:, self.names]
# Script: encode the item features, fit a random forest and print its accuracy.
# The CSV has no header row; the four columns are named explicitly.
dataset = pd.read_csv('data.csv', header=None)
dataset.columns = ["cat1", "cat2", "num1", "target"]
# dataset.columns = ["cat1", "cat2", "target"]
# Features are every column except the last; column index 3 is "target".
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, 3]
# Replace the two categorical columns with integer codes before one-hot encoding.
labelencoder_X_0 = LabelEncoder()
X.iloc[:, 0] = labelencoder_X_0.fit_transform(X.iloc[:, 0])
labelencoder_X_1 = LabelEncoder()
X.iloc[:, 1] = labelencoder_X_1.fit_transform(X.iloc[:, 1])
numeric = ["num1"]
categorical = ["cat1", "cat2"]
# Feature pipeline: standardise the numeric column, one-hot encode the
# categorical ones, and concatenate both results.
pipe = Pipeline([
("features", FeatureUnion([
('numeric', make_pipeline(Columns(names=numeric),StandardScaler())),
('categorical', make_pipeline(Columns(names=categorical), OneHotEncoder(sparse=False)))
])),
])
# NOTE(review): the preprocessing is fitted on all rows here, before the
# train/test split below, so the test rows also influence the scaler/encoder.
X = pipe.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
classifier = RandomForestClassifier(n_estimators=15, criterion='entropy', random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# The confusion matrix is computed but not printed; only the accuracy is shown.
cm = confusion_matrix(y_test, y_pred)
accuracy = classifier.score(X_test, y_test)
print(accuracy)

Using an autoencoder to reduce dimensionality

Here is my version of an autoencoder written using PyTorch :
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import metrics
import datetime
from sklearn.preprocessing import MultiLabelBinarizer
import seaborn as sns
sns.set_style("darkgrid")
from ast import literal_eval
import numpy as np
from sklearn.preprocessing import scale
import seaborn as sns
sns.set_style("darkgrid")
import torch
%matplotlib inline
# Toy data set: three 10-dimensional rows drawn from the disjoint uniform
# ranges [0, 10), [10, 20) and [20, 30).
f = []
f.append(np.random.uniform(0, 10, (1, 10)).flatten())
f.append(np.random.uniform(10, 20, (1, 10)).flatten())
f.append(np.random.uniform(20, 30, (1, 10)).flatten())
x_data = torch.FloatTensor(np.array(f))
x_data  # bare expression: displays the tensor when run in a notebook cell

dimensions_input = 10
hidden_layer_nodes = 5
output_dimension = 10


class Model(torch.nn.Module):
    """Single-hidden-layer autoencoder: Linear -> Sigmoid -> Linear.

    Layer sizes come from the module-level ``dimensions_input``,
    ``hidden_layer_nodes`` and ``output_dimension``.
    """

    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(dimensions_input, hidden_layer_nodes)
        self.sigmoid = torch.nn.Sigmoid()
        self.linear2 = torch.nn.Linear(hidden_layer_nodes, output_dimension)

    def encode(self, x):
        """Return the hidden-layer activations for ``x``.

        For a batch of shape ``(rows, dimensions_input)`` the result has shape
        ``(rows, hidden_layer_nodes)``: one reduced-dimension code per row.
        """
        return self.sigmoid(self.linear(x))

    def forward(self, x):
        """Reconstruct ``x`` from its hidden code."""
        return self.linear2(self.encode(x))


model = Model()
# reduction='sum' is the current spelling of the deprecated size_average=False.
criterion = torch.nn.MSELoss(reduction='sum')
optim = torch.optim.SGD(model.parameters(), lr = 0.00001)
def train_model(epochs=150000, log_every=5000):
    """Train the module-level ``model`` to reconstruct ``x_data``.

    The target is a clone of the input, so the network learns an identity
    mapping through its hidden layer. Uses the module-level ``criterion`` and
    ``optim``.

    Args:
        epochs: Number of optimisation steps (defaults to the original 150000).
        log_every: Print the loss every this many steps (originally 5000).
    """
    y_data = x_data.clone()
    for i in range(epochs):
        y_pred = model(x_data)
        loss = criterion(y_pred, y_data)
        if i % log_every == 0:
            print(loss)
        # Clear gradients from the previous step before back-propagating.
        optim.zero_grad()
        loss.backward()
        optim.step()
Using x_data.clone() I train the network to learn a feature representation of the input data.
I'm attempting to generate hidden-layer weights that match the dimensionality of the rows of the input data, so that each vector of x_data has a corresponding encoding. But the hidden layer is a vector of size 5. How can I change this network so that it generates a matrix representing a reduced-dimensionality version of the input data?

Resources