Error in Machine Learning model into Flask Web Applications - machine-learning

I have created my machine learning model for heart disease prediction and now I want to deploy in my web application using FLASK. The Dataset as been acquired from Kaggle. Whenever I run the application I have some issues with my code whenever I execute it, it says:
C:\Users\Surface\Desktop\Flask_app>python app.py File "app.py", line 42
x_data = request.form['x_data']
^
IndentationError: unindent does not match any outer indentation level
Can anyone guide me Thankyou :)
from flask import Flask,render_template,url_for,request
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
app = Flask(__name__)
#app.route('/')
def home():
return render_template('home.html')
#app.route('/predict',method=['POST'])
def predict():
df = pd.read_csv("heart.csv")
df = df.drop(columns = ['cp', 'thal', 'slope'])
#features and labels
y = df.target.values
x_data = df.drop(['target'], axis = 1)
#EXTRACT Features
x = (x_data - np.min(x_data)) / (np.max(x_data) - np.min(x_data)).values
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2,random_state=0)
# Random Forest Classification
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 1000, random_state = 1)
rf.fit(x_train.T, y_train.T)
print("Random Forest Algorithm Accuracy Score : {:.2f}%".format(rf.score(x_test.T,y_test.T)*100))
#persist model in a standard format
from sklearn.externals import joblib
joblib.dump(rf, 'HAP_model.pkl')
HAP_model = open('HAP_model.pkl','rb')
rf = joblib.load(HAP_model)
if request.method=='POST':
x_data = request.form['x_data']
data = [df.drop(['target'], axis = 1)]
vect = rf.transform(data).toarray()
my_prediction = rf.predict(vect)
return render_template('result.html',prediction = my_prediction)
if __name__ == '__main__':
app.run(debug=True)

Related

The number of classes has to be greater than one; got 1 class in SVM

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
x = np.linspace(-5.0, 5.0, 100)
y = np.sqrt(10**2 - x**2)
y=np.hstack([y,-y])
x=np.hstack([x,-x])
x1 = np.linspace(-5.0, 5.0, 100)
y1 = np.sqrt(5**2 - x1**2)
y1=np.hstack([y1,-y1])
x1=np.hstack([x1,-x1])
plt.scatter(y,x)
plt.scatter(y1,x1)
# print(plt.show())
import pandas as pd
df1 =pd.DataFrame(np.vstack([y,x]).T,columns=['X1','X2'])
df1['Y']=0
df2 =pd.DataFrame(np.vstack([y1,x1]).T,columns=['X1','X2'])
df2['Y']=1
df1.merge(df2)
# We need to find components for the Polynomical Kernel
#X1,X2,X1_square,X2_square,X1*X2
df1['X1_Square']= df1['X1']**2
df1['X2_Square']= df1['X2']**2
df1['X1*X2'] = (df1['X1'] *df1['X2'])
# print(df1.head())
### Independent and Dependent features
X = df1[['X1','X2','X1_Square','X2_Square','X1*X2']]
y = df1['Y']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size = 0.25,
random_state = 0)
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
classifier = SVC(kernel="linear")
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
accuracy_score(y_test, y_pred)
ValueError: The number of classes has to be greater than one; got 1 class
I Don't know How to resolve this one error.May be there is error in merge of two data frames or I to append df1 and df2 but i tried it that doesn't work.
The error is because y has only one value of 0 because your code logic is at line y = df1['Y'].
You can replace line df1.merge(df2) code like this:
df1 = pd.concat([df1,df2])

Kubeflow Pipeline Training Component Failing | Unknown return type: <class 'inspect._empty'>

I am running an ML pipeline and the training component/step (see code below) continues to fail with the following error: "RuntimeError: Unknown return type: <class 'inspect._empty'>. Must be one of str, int, float, a subclass of Artifact, or a NamedTuple collection of these types."
Any ideas on what might be causing the issue/error and how to resolve it?
Thank you!
RE
#component(
# this component builds an xgboost classifier with xgboost
packages_to_install=["google-cloud-bigquery", "xgboost", "pandas", "sklearn", "joblib", "pyarrow", "db_dtypes"],
base_image="python:3.9",
output_component_file="create_xgb_model_xgboost.yaml"
)
def build_xgb_xgboost(project: str,
bq_dataset: str,
test_view_name: str,
bq_location: str,
metrics: Output[Metrics],
model: Output[Model]
):
from google.cloud import bigquery
import xgboost as xgb
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error
import joblib
import pyarrow
import db_dtypes
client = bigquery.Client(project=project)
view_uri = f"{project}.{bq_dataset}.{test_view_name}" #replace view_name with test_view_name
build_df_for_xgboost = '''
SELECT * FROM `{view_uri}`
'''.format(view_uri = view_uri)
job_config = bigquery.QueryJobConfig()
df_1 = client.query(build_df_for_xgboost).to_dataframe()
#client.query(build_df_for_xgboost, job_config=job_config).to_dataframe()
df = df_1.drop(['int64_field_0'], axis=1)
def onehot_encode(df, column):
df = df.copy()
dummies = pd.get_dummies(df[column], prefix=column)
df = pd.concat([df, dummies], axis=1)
df = df.drop(column, axis=1)
return df
# Binary encoding
df['preferred_foot'] = df['preferred_foot'].replace({'left': 0, 'right': 1})
# One-hot encoding
for column in ['attacking_work_rate', 'defensive_work_rate']:
df = onehot_encode(df, column=column)
# Split df into X and y
y = df['overall_rating']
X = df.drop('overall_rating', axis=1)
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
# Scale X
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
#specify parameters
#define your model
bst = XGBRegressor(
objective='reg:linear',
learning_rate = '.1',
alpha = '0.001'
)
#fit your model
bst.fit(X_train, y_train)
# Predict the model
y_pred = bst.predict(X_test)
rmse = np.sqrt(np.mean((y_test - y_pred)**2))
mae = mean_absolute_error(y_test, y_pred)
metrics.log_metric("RMSE", rmse)
metrics.log_metric("framework", "xgboost")
metrics.log_metric("dataset_size", len(df))
metrics.log_metric("MAE", mae)
dump(bst, model.path + ".joblib")
I think this might just be a bug in the version of KFP v2 SDK code you're using.
I mostly use the stable KFPv1 methods to avoid problems.
from kfp.components import InputPath, OutputPath, create_component_from_func
def train_xgboost_model(
project: str,
bq_dataset: str,
test_view_name: str,
bq_location: str,
metrics_path: OutputPath(Metrics),
model_path: OutputPath(Model),
):
import json
from pathlib import Path
metrics = {
...
}
Path(metrics_path).write_text(json.dumps(metrics))
dump(bst, model_path)
train_xgboost_model_op = create_component_from_func(
func=train_xgboost_model,
packages_to_install=["google-cloud-bigquery", "xgboost", "pandas", "sklearn", "joblib", "pyarrow", "db_dtypes"],
base_image="python:3.9",
output_component_file="create_xgb_model_xgboost.yaml",
)
You can also find many examples of real-world components in this repo: https://github.com/Ark-kun/pipeline_components/tree/master/components
including an XGBoost trainer https://github.com/Ark-kun/pipeline_components/blob/d8c4cf5/components/XGBoost/Train/component.py
and a full XGBoost pipeline: https://github.com/Ark-kun/pipeline_components/blob/4f19be6f26eaaf85ba251110d10d103b17e54a17/samples/Google_Cloud_Vertex_AI/Train_tabular_regression_model_using_XGBoost_and_import_to_Vertex_AI/pipeline.py

How can I use "if __name__ == '__main__':" in my code below?

After I run my code I get a warning as:
"RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if name == 'main':
freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable."
My question is that how and where I should add this line of code to avoid this warning/error in my code below:
from scipy import stats, optimize
import pymc3 as pm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
#from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from theano import shared
np.random.seed(9)
#Load the Data
dataset = pd.read_csv(‘PV-PCM.csv’)
X=dataset.iloc[:,[0,1,2,3,4]].values
y=dataset.iloc[:,5].values
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size = 0.2, random_state=42)
#Shapes
X.shape, y.shape, X_tr.shape, X_te.shape
#Preprocess data for Modeling
shA_X = shared(X_tr)
#Generate Model
linear_model = pm.Model()
with linear_model:
# Priors for unknown model parameters
alpha = pm.Normal("alpha", mu=y_tr.mean(),sd=10)
betas = pm.Normal("betas", mu=0, sd=1000, shape=X.shape[1])
sigma = pm.HalfNormal("sigma", sd=100) # you could also try with a HalfCauchy that has longer/fatter tails
mu = alpha + pm.math.dot(betas, X_tr.T)
likelihood = pm.Normal("likelihood", mu=mu, sd=sigma, observed=y_tr)
step = pm.NUTS()
trace = pm.sample(1000, step)
chain = trace[100:]
#pm.traceplot(chain);
#Traceplot
pm.traceplot(trace)
ppc = pm.sample_prior_predictive(samples=1000, random_seed=9)
pm.plot_posterior(trace, figsize = (12, 10))
sns.kdeplot(y_tr, alpha=0.5, lw=4, c=‘b’)
for i in range(100):
sns.kdeplot(ppc[‘likelihood’][i], alpha=0.1, c=‘g’)
alpha_pred = chain[‘alpha’].mean()
betas_pred = chain[‘betas’].mean(axis=0)
y_pred = alpha_pred + np.dot(betas_pred, X_tr.T)
Thank you all.

Random Forest on Categorical Data with low accuracy

I am trying to build a model that given an item, predicts which store it belongs to.
I have a data-set of ~300 records which are supposed to be items in different online stores.
Each record is composed of: Category,Sub Category,Price,Store Identifier(The y variable)
The data seems balanced as every store has around ~10 items.
With the help of #Marcus V. I succeeded encoding the categorical columns correctly. But can not produce better results than 0.52 for a RandomForest with 15 estimators and an entropy criterion.
I feel like much more can be done here. What am I missing?
This is the data: https://pastebin.com/z3eZc0vK
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
class Columns(BaseEstimator, TransformerMixin):
def __init__(self, names=None):
self.names = names
def fit(self, X, y=None, **fit_params):
return self
def transform(self, X):
return X.loc[:,self.names]
dataset = pd.read_csv('data.csv', header=None)
dataset.columns = ["cat1", "cat2", "num1", "target"]
# dataset.columns = ["cat1", "cat2", "target"]
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, 3]
labelencoder_X_0 = LabelEncoder()
X.iloc[:, 0] = labelencoder_X_0.fit_transform(X.iloc[:, 0])
labelencoder_X_1 = LabelEncoder()
X.iloc[:, 1] = labelencoder_X_1.fit_transform(X.iloc[:, 1])
numeric = ["num1"]
categorical = ["cat1", "cat2"]
pipe = Pipeline([
("features", FeatureUnion([
('numeric', make_pipeline(Columns(names=numeric),StandardScaler())),
('categorical', make_pipeline(Columns(names=categorical), OneHotEncoder(sparse=False)))
])),
])
X = pipe.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
classifier = RandomForestClassifier(n_estimators=15, criterion='entropy', random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = classifier.score(X_test, y_test)
print(accuracy)

Using an autoencoder to reduce dimensionality

Here is my version of an autoencoder written using PyTorch :
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import metrics
import datetime
from sklearn.preprocessing import MultiLabelBinarizer
import seaborn as sns
sns.set_style("darkgrid")
from ast import literal_eval
import numpy as np
from sklearn.preprocessing import scale
import seaborn as sns
sns.set_style("darkgrid")
import torch
%matplotlib inline
f = []
f.append(np.random.uniform(0,10,(1 , 10)).flatten())
f.append(np.random.uniform(10,20,(1 , 10)).flatten())
f.append(np.random.uniform(20,30,(1 , 10)).flatten())
x_data = torch.FloatTensor(np.array(f))
x_data
dimensions_input = 10
hidden_layer_nodes = 5
output_dimension = 10
class Model(torch.nn.Module):
def __init__(self):
super(Model, self).__init__()
self.linear = torch.nn.Linear(dimensions_input,hidden_layer_nodes)
self.sigmoid = torch.nn.Sigmoid()
self.linear2 = torch.nn.Linear(hidden_layer_nodes,output_dimension)
def forward(self, x):
l_out1 = self.linear(x)
l_out2 = self.sigmoid(l_out1)
y_pred = self.linear2(l_out2)
return y_pred
model = Model()
criterion = torch.nn.MSELoss(size_average = False)
optim = torch.optim.SGD(model.parameters(), lr = 0.00001)
def train_model():
y_data = x_data.clone()
for i in range(150000):
y_pred = model(x_data)
loss = criterion(y_pred, y_data)
if i % 5000 == 0:
print(loss)
optim.zero_grad()
loss.backward()
optim.step()
Using x_data.clone() I train the network to learn a feature representation of the input data.
I'm attempting to generate hidden layer weights that match the dimensionality of rows of the input data so that each vector of x_data has a corresponding encoding. But the hidden later is of is a vector of size 5. How to change this network so that a matrix is generated that represents a reduced dimensionality of the input data ?

Resources