How to implement LIME with a BERT model? - machine-learning

I am new to machine learning. I noticed that similar questions have been asked before but did not receive a proper solution. Below is my code for semantic similarity, and I want to implement LIME on top of it. Please help me out.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Two lists of sentences
sentences1 = ['The cat sits outside',
              'A man is playing guitar',
              'The new movie is awesome']
sentences2 = ['The cat sits outside',
              'A woman watches TV',
              'The new movie is so great']

# Compute embeddings for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Compute cosine similarities
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

# Output the pairs with their score
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))

I don't know what BERT is, but try this sample code and see if it helps you.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer
df = pd.read_csv('C:\\Users\\ryans\\OneDrive\\Desktop\\Briefcase\\PDFs\\1-ALL PYTHON & R CODE SAMPLES\\A - GITHUB\\Natural Language Processing - Amazon Reviews\\Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv')
# let's experiment with some sentiment analysis concepts
# first we need to clean up the text in the independent fields of the DF we are working with
df.replace('\'', '', regex=True, inplace=True)
df['review_title'] = df['reviews.title'].astype(str)
df['review_text'] = df['reviews.text'].astype(str)
# get rid of numbers (note: each step now chains on the cleaned column
# instead of re-reading the raw column and discarding earlier cleaning)
df['review_title'] = df['review_title'].str.replace(r'\d+', '', regex=True)
df['review_text'] = df['review_text'].str.replace(r'\d+', '', regex=True)
# get rid of special characters
df['review_title'] = df['review_title'].str.replace(r'[^\w\s]+', '', regex=True)
df['review_text'] = df['review_text'].str.replace(r'[^\w\s]+', '', regex=True)
# collapse repeated whitespace into a single space
df['review_title'] = df['review_title'].str.replace(r'\s{2,}', ' ', regex=True)
df['review_text'] = df['review_text'].str.replace(r'\s{2,}', ' ', regex=True)
# convert all text to lower case
df['review_title'] = df['review_title'].str.lower()
df['review_text'] = df['review_text'].str.lower()
list_corpus = df["review_text"].tolist()
list_labels = df["reviews.rating"].tolist()
X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, test_size=0.2, random_state=40)
vectorizer = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}', ngram_range=(1, 3), stop_words = 'english', binary=True)
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
logreg = LogisticRegression(n_jobs=1, C=1e5, max_iter=1000)  # raise max_iter so the solver converges
logreg.fit(train_vectors, y_train)
pred = logreg.predict(test_vectors)
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, average='weighted')
recall = recall_score(y_test, pred, average='weighted')
f1 = f1_score(y_test, pred, average='weighted')
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
print(list_corpus[3])
c = make_pipeline(vectorizer, logreg)
# class names must line up with the classifier's classes (the rating values),
# not the review titles
class_names = [str(cls) for cls in logreg.classes_]
explainer = LimeTextExplainer(class_names=class_names)
idx = 3
exp = explainer.explain_instance(X_test[idx], c.predict_proba, num_features=6, labels=[1])
print('Document id: %d' % idx)
print('Predicted class =', logreg.predict(test_vectors[idx])[0])
print('True class: %s' % y_test[idx])
print('Explanation for class %s' % class_names[1])
print('\n'.join(map(str, exp.as_list(label=1))))
exp = explainer.explain_instance(X_test[idx], c.predict_proba, num_features=6, top_labels=2)
print(exp.available_labels())
exp.show_in_notebook(text=False)
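
The sample above explains a plain bag-of-words classifier rather than the BERT model from the question. A minimal sketch of how LIME could wrap the SentenceTransformer similarity scorer directly; the reference sentence, the [-1, 1] to [0, 1] rescaling, and the 'dissimilar'/'similar' class names here are my assumptions, not part of the original code:

import numpy as np
from sentence_transformers import SentenceTransformer, util
from lime.lime_text import LimeTextExplainer

model = SentenceTransformer('paraphrase-distilroberta-base-v1')
reference = 'The new movie is awesome'   # the fixed sentence we compare against

def similarity_proba(texts):
    # LIME passes a list of perturbed strings; return an (n, 2) array of
    # pseudo-probabilities [dissimilar, similar] derived from cosine similarity
    emb_ref = model.encode([reference], convert_to_tensor=True)
    emb = model.encode(list(texts), convert_to_tensor=True)
    sims = util.pytorch_cos_sim(emb, emb_ref).cpu().numpy().ravel()
    sims = np.clip((sims + 1) / 2, 0, 1)   # map [-1, 1] -> [0, 1]
    return np.column_stack([1 - sims, sims])

explainer = LimeTextExplainer(class_names=['dissimilar', 'similar'])
exp = explainer.explain_instance('The new movie is so great',
                                 similarity_proba, num_features=5)
print(exp.as_list())   # words that push the pair toward/away from "similar"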
More reading:
https://towardsdatascience.com/explain-nlp-models-with-lime-shap-5c5a9f84d59b
https://marcotcr.github.io/lime/tutorials/Lime%20-%20multiclass.html
https://towardsdatascience.com/understanding-model-predictions-with-lime-a582fdff3a3b

Related

Node: 'Cast_1' Cast string to float is not supported [[{{node Cast_1}}]] [Op:__inference_train_function_24202] for Roberta

I am getting this error:
Node: 'Cast_1'
Cast string to float is not supported
[[{{node Cast_1}}]] [Op:__inference_train_function_24202]
I wrote this code for IMDB sentiment analysis on 5,000 records in Google Colab:
#importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
#importing the data
data = pd.read_excel('/content/drive/MyDrive/499A_Project/Dataset/IMDB5000.xlsx')
#spliting the dataset into train and test
train_data, test_data = train_test_split(data, test_size = 0.3, random_state = 42)
#preprocessing the data
#tokenizng the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['Review'].values)
#converting the text into sequences
train_sequences = tokenizer.texts_to_sequences(train_data['Review'].values)
test_sequences = tokenizer.texts_to_sequences(test_data['Review'].values)
#padding the sequences
max_length = max([len(s.split()) for s in data['Review']])
train_padded = pad_sequences(train_sequences, maxlen = max_length)
test_padded = pad_sequences(test_sequences, maxlen = max_length)
#preparing the labels
train_labels = train_data['Sentiment'].values
test_labels = test_data['Sentiment'].values
#importing Roberta model from transformers
from transformers import TFBertForSequenceClassification
#instantiating the Roberta model
model = TFBertForSequenceClassification.from_pretrained('roberta-base')
#compiling the model
model.compile(loss = 'sparse_categorical_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])
#training the model
model.fit(train_padded, train_labels,
          batch_size = 32,
          epochs = 10,
          validation_data = (test_padded, test_labels))
This is the code I wrote for my dataset, but it is not working and shows the error above.
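
A hedged guess at the cause, not from the original post: "Cast string to float is not supported" usually means string data is reaching the loss function, here most likely the Sentiment labels; the code also loads 'roberta-base' with the BERT class and feeds it Keras-tokenizer sequences instead of HuggingFace tokenizer output. A minimal sketch of a consistent setup, assuming Sentiment holds 'positive'/'negative' strings:

import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification

# assumption: Sentiment is 'positive'/'negative'; map the strings to integer labels
train_labels = (train_data['Sentiment'] == 'positive').astype(int).values

# tokenize with the matching HuggingFace tokenizer, not the Keras Tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_encodings = tokenizer(list(train_data['Review'].astype(str)),
                            truncation=True, padding=True, return_tensors='tf')

# load RoBERTa with the RoBERTa class so architecture and weights match
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.fit(dict(train_encodings), train_labels, batch_size=32, epochs=3)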

Kubeflow Pipeline Training Component Failing | Unknown return type: <class 'inspect._empty'>

I am running an ML pipeline and the training component/step (see code below) continues to fail with the following error: "RuntimeError: Unknown return type: <class 'inspect._empty'>. Must be one of str, int, float, a subclass of Artifact, or a NamedTuple collection of these types."
Any ideas on what might be causing the issue/error and how to resolve it?
Thank you!
RE
@component(
    # this component builds an XGBoost regressor with xgboost
    packages_to_install=["google-cloud-bigquery", "xgboost", "pandas", "sklearn", "joblib", "pyarrow", "db_dtypes"],
    base_image="python:3.9",
    output_component_file="create_xgb_model_xgboost.yaml"
)
def build_xgb_xgboost(project: str,
                      bq_dataset: str,
                      test_view_name: str,
                      bq_location: str,
                      metrics: Output[Metrics],
                      model: Output[Model]
                      ):
    from google.cloud import bigquery
    import numpy as np   # needed for np.sqrt below
    import xgboost as xgb
    import pandas as pd
    from xgboost import XGBRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import mean_squared_error as MSE
    from sklearn.metrics import mean_absolute_error
    import joblib
    import pyarrow
    import db_dtypes

    client = bigquery.Client(project=project)
    view_uri = f"{project}.{bq_dataset}.{test_view_name}"  # replace view_name with test_view_name
    build_df_for_xgboost = '''
    SELECT * FROM `{view_uri}`
    '''.format(view_uri=view_uri)
    job_config = bigquery.QueryJobConfig()
    df_1 = client.query(build_df_for_xgboost).to_dataframe()
    # client.query(build_df_for_xgboost, job_config=job_config).to_dataframe()
    df = df_1.drop(['int64_field_0'], axis=1)

    def onehot_encode(df, column):
        df = df.copy()
        dummies = pd.get_dummies(df[column], prefix=column)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
        return df

    # Binary encoding
    df['preferred_foot'] = df['preferred_foot'].replace({'left': 0, 'right': 1})
    # One-hot encoding
    for column in ['attacking_work_rate', 'defensive_work_rate']:
        df = onehot_encode(df, column=column)
    # Split df into X and y
    y = df['overall_rating']
    X = df.drop('overall_rating', axis=1)
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    # define your model (learning_rate and alpha must be numbers, not strings)
    bst = XGBRegressor(
        objective='reg:squarederror',   # 'reg:linear' is deprecated
        learning_rate=0.1,
        alpha=0.001
    )
    # fit your model
    bst.fit(X_train, y_train)
    # predict on the held-out set
    y_pred = bst.predict(X_test)
    rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
    mae = mean_absolute_error(y_test, y_pred)
    metrics.log_metric("RMSE", rmse)
    metrics.log_metric("framework", "xgboost")
    metrics.log_metric("dataset_size", len(df))
    metrics.log_metric("MAE", mae)
    joblib.dump(bst, model.path + ".joblib")   # dump comes from joblib
I think this might just be a bug in the version of the KFP v2 SDK you're using.
I mostly use the stable KFP v1 methods to avoid problems.
from kfp.components import InputPath, OutputPath, create_component_from_func

def train_xgboost_model(
    project: str,
    bq_dataset: str,
    test_view_name: str,
    bq_location: str,
    metrics_path: OutputPath(Metrics),
    model_path: OutputPath(Model),
):
    import json
    from pathlib import Path

    metrics = {
        ...
    }
    Path(metrics_path).write_text(json.dumps(metrics))
    dump(bst, model_path)

train_xgboost_model_op = create_component_from_func(
    func=train_xgboost_model,
    packages_to_install=["google-cloud-bigquery", "xgboost", "pandas", "sklearn", "joblib", "pyarrow", "db_dtypes"],
    base_image="python:3.9",
    output_component_file="create_xgb_model_xgboost.yaml",
)
You can also find many examples of real-world components in this repo: https://github.com/Ark-kun/pipeline_components/tree/master/components
including an XGBoost trainer https://github.com/Ark-kun/pipeline_components/blob/d8c4cf5/components/XGBoost/Train/component.py
and a full XGBoost pipeline: https://github.com/Ark-kun/pipeline_components/blob/4f19be6f26eaaf85ba251110d10d103b17e54a17/samples/Google_Cloud_Vertex_AI/Train_tabular_regression_model_using_XGBoost_and_import_to_Vertex_AI/pipeline.py
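
Since the answer pins this on the SDK version, it is worth confirming which kfp is actually installed before refactoring; the create_component_from_func API shown above assumes a 1.x SDK (a trivial check, not from the original answer):

import kfp
print(kfp.__version__)   # the v1-style create_component_from_func expects a 1.x SDK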

Error in Machine Learning model into Flask Web Applications

I have created a machine learning model for heart disease prediction and now I want to deploy it in my web application using Flask. The dataset was acquired from Kaggle. Whenever I run the application, it fails with:
C:\Users\Surface\Desktop\Flask_app>python app.py
  File "app.py", line 42
    x_data = request.form['x_data']
    ^
IndentationError: unindent does not match any outer indentation level
Can anyone guide me? Thank you :)
from flask import Flask, render_template, url_for, request
import numpy as np
import pandas as pd
import joblib   # sklearn.externals.joblib was removed; use joblib directly
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

app = Flask(__name__)

@app.route('/')
def home():
    return render_template('home.html')

@app.route('/predict', methods=['POST'])   # note: methods, not method
def predict():
    df = pd.read_csv("heart.csv")
    df = df.drop(columns=['cp', 'thal', 'slope'])
    # features and labels
    y = df.target.values
    x_data = df.drop(['target'], axis=1)
    # min-max scale the features
    x = (x_data - np.min(x_data)) / (np.max(x_data) - np.min(x_data)).values
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
    # Random Forest classification
    rf = RandomForestClassifier(n_estimators=1000, random_state=1)
    rf.fit(x_train, y_train)
    print("Random Forest Algorithm Accuracy Score : {:.2f}%".format(rf.score(x_test, y_test) * 100))
    # persist the model in a standard format
    joblib.dump(rf, 'HAP_model.pkl')
    rf = joblib.load('HAP_model.pkl')
    if request.method == 'POST':
        x_data = request.form['x_data']
        data = [df.drop(['target'], axis=1)]
        vect = rf.transform(data).toarray()   # note: RandomForestClassifier has no transform; this line is from the original post
        my_prediction = rf.predict(vect)
    return render_template('result.html', prediction=my_prediction)

if __name__ == '__main__':
    app.run(debug=True)
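
A hedged note, not from the original post: this particular IndentationError almost always comes from mixing tabs and spaces. A quick way to find offending lines in app.py:

# print every line of app.py that contains a tab character,
# since mixing tabs with spaces triggers this IndentationError
with open('app.py') as f:
    for lineno, line in enumerate(f, start=1):
        if '\t' in line:
            print(lineno, repr(line))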

Random Forest on Categorical Data with low accuracy

I am trying to build a model that given an item, predicts which store it belongs to.
I have a data-set of ~300 records which are supposed to be items in different online stores.
Each record is composed of: Category, Sub Category, Price, Store Identifier (the y variable).
The data seems balanced as every store has around ~10 items.
With the help of @Marcus V. I succeeded in encoding the categorical columns correctly, but I cannot get better than 0.52 accuracy with a RandomForest using 15 estimators and the entropy criterion.
I feel like much more can be done here. What am I missing?
This is the data: https://pastebin.com/z3eZc0vK
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split   # sklearn.cross_validation was removed
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

class Columns(BaseEstimator, TransformerMixin):
    def __init__(self, names=None):
        self.names = names

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X):
        return X.loc[:, self.names]
dataset = pd.read_csv('data.csv', header=None)
dataset.columns = ["cat1", "cat2", "num1", "target"]
# dataset.columns = ["cat1", "cat2", "target"]
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, 3]
labelencoder_X_0 = LabelEncoder()
X.iloc[:, 0] = labelencoder_X_0.fit_transform(X.iloc[:, 0])
labelencoder_X_1 = LabelEncoder()
X.iloc[:, 1] = labelencoder_X_1.fit_transform(X.iloc[:, 1])
numeric = ["num1"]
categorical = ["cat1", "cat2"]
pipe = Pipeline([
    ("features", FeatureUnion([
        ('numeric', make_pipeline(Columns(names=numeric), StandardScaler())),
        ('categorical', make_pipeline(Columns(names=categorical), OneHotEncoder(sparse=False)))
    ])),
])
X = pipe.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
classifier = RandomForestClassifier(n_estimators=15, criterion='entropy', random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = classifier.score(X_test, y_test)
print(accuracy)
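
Not part of the original post, but one thing worth checking: with ~300 rows spread over ~30 stores, a single 80/20 split leaves only about 2 test items per class, so the 0.52 score is very noisy. A sketch using 5-fold cross-validation and a larger forest (the hyperparameter values here are assumptions):

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# assumed settings: more trees stabilise the forest; sqrt features is the usual default
clf = RandomForestClassifier(n_estimators=500, max_features='sqrt', random_state=0)

# X and y are the encoded features and labels prepared above
scores = cross_val_score(clf, X, y, cv=5)
print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))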

Using an autoencoder to reduce dimensionality

Here is my version of an autoencoder, written using PyTorch:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import torch

f = []
f.append(np.random.uniform(0, 10, (1, 10)).flatten())
f.append(np.random.uniform(10, 20, (1, 10)).flatten())
f.append(np.random.uniform(20, 30, (1, 10)).flatten())
x_data = torch.FloatTensor(np.array(f))
x_data
dimensions_input = 10
hidden_layer_nodes = 5
output_dimension = 10

class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.linear = torch.nn.Linear(dimensions_input, hidden_layer_nodes)
        self.sigmoid = torch.nn.Sigmoid()
        self.linear2 = torch.nn.Linear(hidden_layer_nodes, output_dimension)

    def forward(self, x):
        l_out1 = self.linear(x)
        l_out2 = self.sigmoid(l_out1)
        y_pred = self.linear2(l_out2)
        return y_pred

model = Model()
criterion = torch.nn.MSELoss(size_average=False)
optim = torch.optim.SGD(model.parameters(), lr=0.00001)

def train_model():
    y_data = x_data.clone()
    for i in range(150000):
        y_pred = model(x_data)
        loss = criterion(y_pred, y_data)
        if i % 5000 == 0:
            print(loss)
        optim.zero_grad()
        loss.backward()
        optim.step()

train_model()
Using x_data.clone() I train the network to learn a feature representation of the input data.
I am trying to generate hidden-layer encodings that match the dimensionality of the rows of the input data, so that each vector of x_data has a corresponding encoding. But the hidden layer's output is a vector of size 5. How do I change this network so that it produces a matrix representing a reduced-dimensionality version of the input data?
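
Not from the original post, but a minimal sketch of one reading of the question: the 5-unit hidden activation already is the reduced representation, and evaluating it for every row of x_data yields a (3, 5) matrix, one 5-dimensional code per input vector:

# reuse the trained layers of `model` above to expose the bottleneck activations
def encode(x):
    return model.sigmoid(model.linear(x))

with torch.no_grad():            # no gradients needed for inference
    codes = encode(x_data)       # shape: (3, 5) -- one 5-dim code per input row
print(codes.shape)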
