How can I make sentiment analysis with new sentence on trained model? - machine-learning

I trained a model using Naive Bayes and I get high accuracy. Now I want to give it a new sentence and see its sentiment. Here is my code:
# data Analysis
import pandas as pd
# data Preprocessing and Feature Engineering
from textblob import TextBlob
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
# Model Selection and Validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import joblib
import warnings
import mlflow
warnings.filterwarnings("ignore")
train_tweets = pd.read_csv('data/train.csv')
tweets = train_tweets.tweet.values
labels = train_tweets.label.values
processed_features = []
for sentence in range(0, len(tweets)):
# Remove all the special characters
processed_feature = re.sub(r'\W', ' ', str(tweets[sentence]))
# remove all single characters
processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
# Remove single characters from the start
processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature)
# Substituting multiple spaces with single space
processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)
# Removing prefixed 'b'
processed_feature = re.sub(r'^b\s+', '', processed_feature)
# Converting to Lowercase
processed_feature = processed_feature.lower()
processed_features.append(processed_feature)
vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
processed_features = vectorizer.fit_transform(processed_features).toarray()
X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.2, random_state=0)
text_classifier = MultinomialNB()
text_classifier.fit(X_train, y_train)
predictions = text_classifier.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))
joblib.dump(text_classifier, 'model.pkl')
As you can see, I'm saving my model. Now I want to give it an input like this:
new_sentence = "I am very happy today"
model.predict(new_sentence)
And I want to see something like this as output:
sentence = "I am very happy today"
sentiment = Positive
How can I do that?

First, put the preprocessing in a function:
def preproc(tweets):
    """Normalise raw tweets into lowercase, whitespace-collapsed strings.

    Each element of ``tweets`` is converted with ``str()`` and cleaned with
    the same steps that were applied to the training data, so the fitted
    vectorizer sees text in the same form at prediction time.

    Args:
        tweets: Iterable of tweets; every element is passed through ``str``.

    Returns:
        list[str]: one cleaned string per input element, in input order.
    """
    processed_features = []
    for tweet in tweets:
        # Remove all the special characters
        processed_feature = re.sub(r'\W', ' ', str(tweet))
        # remove all single characters
        processed_feature = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
        # Remove single characters from the start. The pattern must use the
        # ``^`` anchor; the escaped form ``\^`` only matches a literal caret,
        # which the first substitution has already turned into a space.
        processed_feature = re.sub(r'^[a-zA-Z]\s+', ' ', processed_feature)
        # Substituting multiple spaces with single space
        processed_feature = re.sub(r'\s+', ' ', processed_feature)
        # Removing prefixed 'b'
        processed_feature = re.sub(r'^b\s+', '', processed_feature)
        # Converting to Lowercase
        processed_features.append(processed_feature.lower())
    return processed_features
processed_features = preproc(tweets)
vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
processed_features = vectorizer.fit_transform(processed_features).toarray()
Then use it to preprocess the test string and feed it to the classifier using transform:
# feeding two 1-sentence tweets as a flat list of strings (the same shape as
# the training tweets), rather than relying on the str() form of nested lists:
test = preproc(["I hate this book.", "I love this movie."])
# transform (not fit_transform) re-uses the vocabulary learned during training
predictions = text_classifier.predict(vectorizer.transform(test).toarray())
print(predictions)
Now, depending on what labels you have in the dataset and how train_tweets.label.values is coded, you will get different output that you can parse into a string. For example, if the labels in the dataset are coded as 1=positive and 0=negative, you might get [0,1].

Related

Sequential Model incompatible with layer

I've recently updated my project to include more intents for my NLU chatbot. I retrained the model. However, when I make an input into the program I receive an error message saying
File "C:\Users\jiann\ChatBot - Copy\chatbot.py", line 39, in predict_clas
s
res = model.predict(np.array([bow]))[0]
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-pack
ages\keras\utils\traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-pack
ages\tensorflow\python\framework\func_graph.py", line 1147, in autograph_ha
ndler
raise e.ag_error_metadata.to_exception(e)
ValueError: in user code:
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-pa
ckages\keras\engine\training.py", line 1801, in predict_function *
return step_function(self, iterator)
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-pa
ckages\keras\engine\training.py", line 1790, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-pa
ckages\keras\engine\training.py", line 1783, in run_step **
outputs = model.predict_step(data)
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-pa
ckages\keras\engine\training.py", line 1751, in predict_step
return self(x, training=False)
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-pa
ckages\keras\utils\traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-pa
ckages\keras\engine\input_spec.py", line 264, in assert_input_compatibility
raise ValueError(f'Input {input_index} of layer "{layer_name}" is '
ValueError: Input 0 of layer "sequential" is incompatible with the laye
r: expected shape=(None, 9), found shape=(None, 40)
This error only pops up when I include more than one intent. Below I've included the relevant code for the Sequential model and the intents:
Intents.json:
{"intents": [
{"tag": "greeting",
"patterns": ["Hi", "How are you", "Is anyone there?", "Hello", "Good day", "Whats up", "Hey", "greetings"],
"responses": ["Hello!", "Good to see you again!", "Hi there, how can I help?"],
"context_set": ""
},
{"tag": "goodbye",
"patterns": ["cya", "See you later", "Goodbye", "I am Leaving", "Have a Good day", "bye", "cao", "see ya"],
"responses": ["Sad to see you go :(", "Talk to you later", "Goodbye!"],
"context_set": ""
},
{"tag": "stocks",
"patterns": ["what stocks do I own?", "how are my shares?", "what companies am I investing in?", "what am I doing in the markets?"],
"responses": ["You own the following shares: ABBV, AAPL, FB, NVDA and an ETF of the S&P 500 Index!"],
"context_set": ""
}
]
}
training.py:
import random
import json
import pickle
import numpy as np
import nltk
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizer_v2.gradient_descent import SGD
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Lemmatizer uses stem of a word instead of conjugate (performance purposes)
from nltk.stem import WordNetLemmatizer
from tensorflow import keras
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers i
# mport Dense, Activation, Dropout
# from tensorflow.keras.optimizers import SGD
lemmatizer = WordNetLemmatizer()
# Reading json file, pass to load function, get json object dictionary
intents = json.loads(open('intents.json').read())
words = []
classes = []
documents = []
# Characters that you won't pay attention to
ignore_letters = ['?', '!', '.', ',']
# Splits each pattern entry into individual words
for intent in intents['intents']:
for pattern in intent['patterns']:
word_list = nltk.word_tokenize(pattern)
words.extend(word_list)
#Wordlist belongs to specific tag
documents.append((word_list, intent['tag']))
if intent['tag'] not in classes:
classes.append(intent['tag'])
print(documents)
#lemmatizes word inf word list if it is not ignored
words = [lemmatizer.lemmatize(word) for word in words if word not in ignore_letters]
#Set Eliminates duplicate words
words = sorted(set(words))
classes = sorted(set(classes))
#Save the words in file
pickle.dump(words,open('words.pkl','wb'))
#Save classes in file
pickle.dump(classes,open('classes.pkl','wb'))
# CREATING THE TRAINING DATA
# Every document becomes a pair: a bag-of-words vector over ``words`` (1 if the
# vocabulary word occurs in the pattern, else 0) and a one-hot row over
# ``classes`` marking the document's tag.
training = []
output_empty = [0] * len(classes)
for word_patterns, tag in documents:
    pattern_words = [lemmatizer.lemmatize(word.lower()) for word in word_patterns]
    # NOTE(review): ``words`` is built without lowercasing, so capitalised
    # vocabulary entries can never match these lowercased tokens - confirm
    # whether the vocabulary should be lowercased as well.
    bag = [1 if word in pattern_words else 0 for word in words]
    output_row = list(output_empty)
    output_row[classes.index(tag)] = 1
    training.append((bag, output_row))
# shuffle the data
random.shuffle(training)
# Split into x and y values (features & labels) directly. Going through
# np.array(training) would build a ragged array, because a bag has len(words)
# entries and a label row has len(classes); NumPy deprecated that and recent
# versions raise an error.
train_x = [bag for bag, _ in training]
train_y = [output_row for _, output_row in training]
# Start building Neural Network Model: two ReLU hidden layers with dropout and
# a softmax output with one unit per class. The input width is the length of a
# bag-of-words vector, i.e. the vocabulary size at training time.
model = Sequential()
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation='softmax'))
# ``learning_rate`` is the supported keyword; ``lr`` is a deprecated alias.
sgd = SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
hist = model.fit(np.array(train_x), np.array(train_y), epochs=200, batch_size=5, verbose=1)
# The second positional parameter of ``Model.save`` is ``overwrite``, so the
# History object returned by ``fit`` must not be passed there.
model.save('chatbotmodel.h5')
print('done')
chatbot.py:
import random
import pickle
import numpy as np
import nltk
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
from nltk.stem import WordNetLemmatizer
from keras.models import load_model
lemmatizer = WordNetLemmatizer()
# Vocabulary and class labels pickled by training.py; ``with`` closes the files.
with open('words.pkl', 'rb') as words_file:
    words = pickle.load(words_file)
with open('classes.pkl', 'rb') as classes_file:
    classes = pickle.load(classes_file)
# Load the file that training.py actually writes ('chatbotmodel.h5'). Loading a
# different, older model file while words.pkl has been regenerated produces
# exactly the reported error: the model expects the old vocabulary size
# (shape=(None, 9)) but receives a bag built from the new one (shape=(None, 40)).
model = load_model('chatbotmodel.h5')
print(classes)
def clean_up_sentence(sentence):
    """Tokenize ``sentence`` and return its lowercased, lemmatized words.

    training.py lowercases every pattern word before lemmatizing it when it
    builds the training bags, so the same normalisation is applied here;
    otherwise a capitalised input would be encoded differently from how the
    training patterns were encoded.

    Args:
        sentence: Raw user input string.

    Returns:
        list[str]: the normalised tokens, in order.
    """
    sentence_words = nltk.word_tokenize(sentence)
    return [lemmatizer.lemmatize(word.lower()) for word in sentence_words]
def bag_of_words(sentence):
    """Encode ``sentence`` as a 0/1 vector over the module-level ``words`` list.

    Position ``i`` is 1 when ``words[i]`` equals one of the tokens returned by
    ``clean_up_sentence(sentence)``, and 0 otherwise.

    Returns:
        numpy.ndarray: one entry per vocabulary word.
    """
    tokens = clean_up_sentence(sentence)
    flags = [1 if vocab_word in tokens else 0 for vocab_word in words]
    return np.array(flags)
def predict_class(sentence):
    """Return the intents predicted for ``sentence``, most probable first.

    The sentence is encoded with ``bag_of_words`` and passed to the loaded
    model as a batch of one. Classes whose score does not exceed the
    threshold are left out, so the result can be an empty list.

    Returns:
        list[dict]: items of the form ``{'intent': <class name>,
        'probability': <score as str>}``, sorted by descending score.
    """
    # Scores at or below this value are treated as too uncertain to report.
    ERROR_THRESHOLD = 0.25
    scores = model.predict(np.array([bag_of_words(sentence)]))[0]
    ranked = sorted(
        ([index, score] for index, score in enumerate(scores) if score > ERROR_THRESHOLD),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [
        {'intent': classes[index], 'probability': str(score)}
        for index, score in ranked
    ]
def get_response(intents_list, intents_json):
    """Pick a random response for the top-ranked predicted intent.

    Args:
        intents_list: Predicted intents, best first; each item is a dict with
            an ``'intent'`` key (the shape ``predict_class`` returns).
        intents_json: Parsed intents file: a dict whose ``'intents'`` list
            holds dicts with ``'tag'`` and ``'responses'`` keys.

    Returns:
        One element of the matching intent's ``'responses'`` list.

    Raises:
        IndexError: If ``intents_list`` is empty (no class passed the
            prediction threshold).
        KeyError: If no entry in ``intents_json`` carries the predicted tag.
    """
    if not intents_list:
        raise IndexError('intents_list is empty: no intent was predicted')
    tag = intents_list[0]['intent']
    for intent in intents_json['intents']:
        if intent['tag'] == tag:
            return random.choice(intent['responses'])
    # Previously this fell through to ``return result`` with ``result`` never
    # assigned, which surfaced as an unrelated UnboundLocalError.
    raise KeyError(tag)
print("Go! Bot is running!")
If I had to take a guess, it would be something wrong with the shape. I'm just not sure how to fix this.
There seems to be a mismatch between the input_shape of your model and the training sample(s) you are providing. I believe the issue stems from these two lines:
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
and,
res = model.predict(np.array([bow]))[0]
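Depending on what value is returned by len(train_x[0]), calling model.predict() on np.array([bow]) will not work if np.array([bow]) does not match the input shape the model was built with. In your traceback the model expects shape=(None, 9) but receives shape=(None, 40): the loaded model was trained on a 9-word vocabulary, while words.pkl now contains 40 words. Note that training.py saves the model as 'chatbotmodel.h5' but chatbot.py loads 'chatbot_model.model', so chatbot.py is most likely loading an old model; load the file that the retrained script actually writes. Check out this answer for an in-depth explanation of how the various Keras inputs work.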

How to implement LIME in a Bert model?

I am new to machine learning. I noticed that such questions have been asked before as well but did not receive a proper solution. Below is the code for semantic similarity and I want to implement LIME as a base. Please, help me out.
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('paraphrase-distilroberta-base-v1')
# Two lists of sentences
sentences1 = ['The cat sits outside',
'A man is playing guitar',
'The new movie is awesome']
sentences2 = ['The cat sits outside',
'A woman watches TV',
'The new movie is so great']
#Compute embedding for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)
#Compute cosine-similarits
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)
#Output the pairs with their score
for i in range(len(sentences1)):
print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))
I don't know what Bert is, but try this sample code and see if it helps you.
import pandas as pd
import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.metrics
from sklearn.utils import shuffle
from io import StringIO
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline
df = pd.read_csv('C:\\Users\\ryans\\OneDrive\\Desktop\\Briefcase\\PDFs\\1-ALL PYTHON & R CODE SAMPLES\\A - GITHUB\\Natural Language Processing - Amazon Reviews\\Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv')
# let's experiment with some sentiment analysis concepts
# first we need to clean up the text columns of the DF we are working with
df.replace('\'', '', regex=True, inplace=True)
# Each step works on the result of the previous one. Re-reading the raw
# 'reviews.*' column at every step would discard all earlier cleaning and
# keep only the final lowercasing.
for source_column, clean_column in (('reviews.title', 'review_title'),
                                    ('reviews.text', 'review_text')):
    cleaned = df[source_column].astype(str)
    # get rid of digits (regex=True is explicit: newer pandas versions treat
    # the pattern as a literal string by default)
    cleaned = cleaned.str.replace(r'\d+', '', regex=True)
    # get rid of special characters
    cleaned = cleaned.str.replace(r'[^\w\s]+', '', regex=True)
    # get rid of double spaces
    cleaned = cleaned.str.replace(r'\s{2,}', ' ', regex=True)
    # convert all case to lower
    df[clean_column] = cleaned.str.lower()
list_corpus = df["review_text"].tolist()
list_labels = df["reviews.rating"].tolist()
X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, test_size=0.2, random_state=40)
vectorizer = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}', ngram_range=(1, 3), stop_words = 'english', binary=True)
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(train_vectors, y_train)
pred = logreg.predict(test_vectors)
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, average='weighted')
recall = recall_score(y_test, pred, average='weighted')
f1 = f1_score(y_test, pred, average='weighted')
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
list_corpus[3]
c = make_pipeline(vectorizer, logreg)
class_names=list(df.review_title.unique())
explainer = LimeTextExplainer(class_names=class_names)
idx = 3
exp = explainer.explain_instance(X_test[idx], c.predict_proba, num_features=6, labels=[1, 1])
print('Document id: %d' % idx)
print('Predicted class =', class_names[logreg.predict(test_vectors[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % class_names[y_test[idx]])
print ('Explanation for class %s' % class_names[1])
print ('\n'.join(map(str, exp.as_list(label=1))))
exp = explainer.explain_instance(X_test[idx], c.predict_proba, num_features=6, top_labels=2)
print(exp.available_labels())
exp.show_in_notebook(text=False)
https://towardsdatascience.com/explain-nlp-models-with-lime-shap-5c5a9f84d59b
https://marcotcr.github.io/lime/tutorials/Lime%20-%20multiclass.html
https://towardsdatascience.com/understanding-model-predictions-with-lime-a582fdff3a3b

Trying to predict running time of algorithms through regression

I'm following this paper:
http://robotics.stanford.edu/users/shoham/www%20papers/Empirical%20Hardness.pdf
and I try to predict the running time for the Traveling salesman problem on a blackbox solver.
I get some weird results during regression that I'd love to consult about:
I find it hard to believe that in XGBoost, or in any regressor, the number of cities is irrelevant as a feature, as the XGBoost feature-importance image suggests.
In the Ridge and linear regression result graphs you can see that for some problem instances the predicted value is negative (even though we are predicting a run time). I saw in another question here that this is because "Linear regression does not respect the bounds of 0" and that I should apply a natural log, but I don't know exactly where. So I'd love help with that as well.
I'd also love recommendations for other regression models that may fit my problem.
Thanks a lot!
Here is my code pieces (google colab), followed by the results I got:
1
# Import the standard libraries of pandas.
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings. filterwarnings("ignore")
sns.set_style('whitegrid')
from google.colab import files
2
# Install the solver and import its libraries, in addition import all the
# libraries with which we will prepare the features.
!pip3 install ortools
!pip install python-igraph
from ortools.constraint_solver import routing_enums_pb2
from ortools.constraint_solver import pywrapcp
import numpy as np
import time
import random
from random import randrange
from scipy import stats
from scipy.stats import skew
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.sparse.csgraph import depth_first_tree
from igraph import Graph, mean
import igraph
import itertools
import math
3
# Simple travelling salesman problem between cities - solver OR Tools By Google.
def create_data_model():
    """Build a random symmetric TSP instance for the routing solver.

    Returns:
        dict: ``'distance_matrix'`` (square list of lists, symmetric, zero
        diagonal, off-diagonal weights drawn from [1, 1000)),
        ``'num_vehicles'`` (1) and ``'depot'`` (0).
    """
    # Number of vertices/cities, drawn uniformly from [10, 350).
    n_cities = np.random.randint(10, 350)
    distances = [[0] * n_cities for _ in range(n_cities)]
    for row in range(n_cities):
        for col in range(n_cities):
            if row == col:
                continue  # the diagonal stays 0
            # Both (row, col) and (col, row) are visited, so every pair is
            # drawn twice and the later draw is the one that remains.
            weight = np.random.randint(1, 1000)
            distances[row][col] = weight
            distances[col][row] = weight
    return {'distance_matrix': distances, 'num_vehicles': 1, 'depot': 0}
def main():
    """Generate one random TSP instance, solve it and return its feature row.

    The instance comes from ``create_data_model`` and is solved with the
    OR-Tools routing solver using the PATH_CHEAPEST_ARC first-solution
    strategy. Features are then computed from the distance matrix, from its
    minimum spanning tree (MST), from the MST node degrees and from a
    depth-first tree of the distance graph rooted at node 0.

    Returns:
        dict: feature name -> value. ``'Solution_time'`` is the number of
        seconds spent building the routing model and solving it; ``'Dmft'``,
        ``'MST_ft'`` and ``'DDFT_ft'`` are the seconds spent computing each
        feature group. ``'Td'`` is -1 when the solver returns no solution.
    """
    # Instantiate the data problem BEFORE starting the clock: generating the
    # random matrix is a pure-Python O(n^2) loop and is not part of the
    # solver's running time that 'Solution_time' is meant to capture.
    data = create_data_model()
    # Start measuring solution time.
    start_time = time.time()
    # Create the routing index manager.
    manager = pywrapcp.RoutingIndexManager(len(data['distance_matrix']),
                                           data['num_vehicles'], data['depot'])
    # Create Routing Model.
    routing = pywrapcp.RoutingModel(manager)

    def distance_callback(from_index, to_index):
        # Returns the distance between the two nodes.
        # Convert from routing variable Index to distance matrix NodeIndex.
        from_node = manager.IndexToNode(from_index)
        to_node = manager.IndexToNode(to_index)
        return data['distance_matrix'][from_node][to_node]

    transit_callback_index = routing.RegisterTransitCallback(distance_callback)
    # Define cost of each arc.
    routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)
    # Setting first solution heuristic.
    search_parameters = pywrapcp.DefaultRoutingSearchParameters()
    search_parameters.first_solution_strategy = (
        routing_enums_pb2.FirstSolutionStrategy.PATH_CHEAPEST_ARC)
    # Solve the problem.
    solution = routing.SolveWithParameters(search_parameters)
    solution_time = time.time() - start_time

    # Features of the distance matrix of the problem:
    # * Mean - Average weight of the distance matrix.
    # * Std - Standard deviation of the distance matrix.
    # * Skewness - Skewness of the weights in the distance matrix.
    # * Noc - Number of cities (the matrix dimension).
    # * Td - Total distance (objective value) of the solution route.
    # * Dmft - Time it took to compute these features.
    dmt_start_time = time.time()
    mat = np.array(data['distance_matrix'])
    mean = mat.mean()
    std = mat.std()
    merged = list(itertools.chain(*mat))
    skewness = skew(merged)
    noc = len(data['distance_matrix'])
    td = solution.ObjectiveValue() if solution else -1
    dmft = time.time() - dmt_start_time

    # Features of the minimum spanning tree built from the distance matrix:
    # * MST_Mean - Average weight of the MST matrix.
    # * MST_Std - Standard deviation of the MST matrix.
    # * MST_Skewness - Skewness of the weights in the MST matrix.
    # * MST_ft - Time it took to compute the MST and these features.
    spt_start_time = time.time()
    X = csr_matrix(mat)
    Tcsr = minimum_spanning_tree(X)
    mat_st = np.array(Tcsr.toarray().astype(int))
    mst_mean = mat_st.mean()
    mst_std = mat_st.std()
    merged_st = list(itertools.chain(*mat_st))
    mst_skewness = skew(merged_st)
    mst_ft = time.time() - spt_start_time

    # Degree and depth-first-tree features:
    # * D_Mean - Average node degree of the MST graph.
    # * D_Std - Standard deviation of the MST node degrees.
    # * D_Skewness - Skewness of the MST node degrees.
    # * DFT_Mean - Average weight of the depth-first tree matrix.
    # * DFT_Std - Standard deviation of the depth-first tree matrix.
    # * DFT_Max - Heaviest edge of the depth-first tree.
    # * DDFT_ft - Time it took to compute these features.
    # NOTE(review): the depth-first tree is built from X (the full distance
    # matrix) starting at node 0, not from the MST - confirm this is intended.
    dstt_start_time = time.time()
    g = Graph.Weighted_Adjacency(mat_st.tolist())
    d_mean = igraph.statistics.mean(g.degree())
    d_std = igraph.statistics.sd(g.degree())
    d_skewness = skew(g.degree())
    d_t = depth_first_tree(X, 0, directed=False)
    mat_dt = np.array(d_t.toarray().astype(int))
    dft_mean = mat_dt.mean()
    dft_std = mat_dt.std()
    dft_max = np.amax(mat_dt)
    ddft_ft = time.time() - dstt_start_time

    # In this map we will hold all the features and their results.
    features_map = {'Mean': mean, 'Std': std, 'Skewness': skewness, 'Noc': noc, 'Td': td, 'Dmft': dmft,
                    'MST_Mean': mst_mean, 'MST_Std': mst_std, 'MST_Skewness': mst_skewness, 'MST_ft': mst_ft,
                    'D_Mean': d_mean, 'D_Std': d_std, 'D_Skewness': d_skewness, 'DFT_Mean': dft_mean, 'DFT_Std': dft_std,
                    'DFT_Max': dft_max, 'DDFT_ft': ddft_ft, 'Solution_time': solution_time}
    return features_map
# Main
# Collect one feature row per generated problem and build the DataFrame once.
# Calling DataFrame.append inside the loop copies the whole frame on every
# iteration, and the method no longer exists in pandas 2.0+.
rows = [main() for _ in range(10000)]
data_TSP = pd.DataFrame(rows)
# Show data frame.
data_TSP.head()
data_TSP.to_csv('data_10000.csv')
files.download('data_10000.csv')
Regression models:
# Import the standard libraries of pandas.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings. filterwarnings("ignore")
sns.set_style('whitegrid')
2
# Neaded for opening data file in drive.
from google.colab import files
uploaded = files.upload()
import io
df = pd.read_csv(io.BytesIO(uploaded['data_10000_clean.csv']))
try:
df.drop(['Unnamed: 0'], axis=1, inplace=True)
except:
pass
df.head()
from sklearn.model_selection import train_test_split
# Split the data to training set and test set (70%, 30%)
features = list(df.drop('Solution_time', axis = 1, inplace = False))
y = df['Solution_time']
X = df[features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
# Import the models we use to predict the solution time,
# and the scoring methods used to evaluate these models.
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
!pip3 install xgboost
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
!pip install scikit-plot
import scikitplot as skplt
import matplotlib as mpl
########################################## Several functions for different regression models. ##########################################
class Score:
    """Evaluation metrics recorded for one model; both default to 0.0."""

    # How closely the predictions match the real data (R^2).
    r2 = 0.0
    # How well the model is expected to predict new data (cross-validation).
    cross_vali_score = 0.0


class Regressor:
    """Holder for a named model, its scores, predictions and fitted estimator."""

    def __init__(self, name):
        """Create an empty holder called ``name`` with a fresh ``Score``.

        ``y_pred`` and ``reg`` start as ``None`` and are assigned by the code
        that fits the model.
        """
        self.name = name
        self.score = Score()
        self.y_pred = None
        self.reg = None
# Map between the name of a model and the model itself.
models_map = {'Random Forest': RandomForestRegressor(), 'Xgboost': XGBRegressor(), 'Ridge': Ridge(),
'Kneighbors': KNeighborsRegressor(), 'Linear Regressor': linear_model.LinearRegression()}
# This function return a map that maps between each model and its Regressor class.
def get_models():
    """Cross-validate, fit and score every estimator in ``models_map``.

    For each entry, 5-fold cross-validation is run on the training split and
    its mean stored; the estimator is then fitted on the training split, used
    to predict the test split, and its R^2 on the test split recorded.

    Returns:
        dict: model name -> populated ``Regressor``.
    """
    fitted = {}
    for model_name, estimator in models_map.items():
        entry = Regressor(model_name)
        cv_scores = cross_val_score(estimator, X_train, y_train, cv=5)
        entry.score.cross_vali_score = np.mean(cv_scores)
        entry.reg = estimator.fit(X_train, y_train)
        entry.y_pred = estimator.predict(X_test)
        entry.score.r2 = r2_score(y_test, entry.y_pred)
        fitted[model_name] = entry
    return fitted
# This function print a graph for models of the features that influenced their decision making.
def print_influence_graph(map):
    """Show a feature-importance bar chart for the tree-based models.

    Only the entries keyed 'Random Forest' and 'Xgboost' are plotted, since
    the chart reads ``feature_importances_`` from the fitted estimator.

    Args:
        map: dict of model name -> ``Regressor`` with a fitted ``reg``.
    """
    tree_models = ('Random Forest', 'Xgboost')
    for model_name, entry in map.items():
        if model_name not in tree_models:
            continue
        # Most influential features first.
        importances = pd.Series(entry.reg.feature_importances_, index=features)
        ranked = importances.sort_values(ascending=False)
        sns.barplot(x=ranked, y=ranked.index)
        plt.xlabel('Feature Importance Score')
        plt.ylabel('Features')
        plt.title(entry.name.upper() + " - Visualizing Important Features")
        plt.show()
# This function print a graph for models that show the real results against the model predictions.
def show_predicted_vs_actual(map):
    """Scatter each model's predictions against the actual test values.

    One figure is drawn per model: actual values (``y_test``) on the x axis,
    predictions (``y_pred``) on the y axis, plus a dashed red diagonal on
    which a perfect prediction would lie.

    Args:
        map: dict of model name -> ``Regressor`` with ``y_pred`` filled in.
    """
    for val in map.values():
        fig, ax = plt.subplots()
        ax.scatter(y_test, val.y_pred, edgecolors=(0, 0, 1))
        ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=3)
        # x carries y_test and y carries the predictions, so the labels must
        # read Actual / Predicted (they were swapped before).
        ax.set_xlabel('Actual')
        ax.set_ylabel('Predicted')
        ax.title.set_text(val.name.upper() + " - Predicted time vs actual time")
        plt.show()
# This function print numerical scores for the models.
def print_scores(map):
    """Print the R2 and cross-validation scores of every model in ``map``.

    Args:
        map: dict of model name -> object with ``name`` and ``score``
            (``r2`` and ``cross_vali_score``) attributes.
    """
    separator = '------------------------------------------\n'
    for entry in map.values():
        header = entry.name.upper() + ' SCORE: '
        print(header)
        print('R2score' + ' = ', entry.score.r2)
        print('Cross_val_score' + ' = ', entry.score.cross_vali_score)
        print(separator)
# This function print a graph showing the differences between the scores of the models.
def show_models_differences_graph(map):
    """Draw a grouped bar chart comparing the scores of all models.

    Args:
        map: dict of model name -> ``Regressor`` whose ``score`` holds
            ``r2`` and ``cross_vali_score``.
    """
    columns = ['Method', 'R2 Score', 'Cross val score']
    # Build all rows first and create the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = [
        {'Method': name,
         'R2 Score': entry.score.r2,
         'Cross val score': entry.score.cross_vali_score}
        for name, entry in map.items()
    ]
    comp_df = pd.DataFrame(rows, columns=columns)
    ax = comp_df.plot.bar(x='Method', rot=30, figsize=(12, 6))
    ax.set_title('Comparison graph')
#########################################################################################################################################
models = get_models()
print_influence_graph(models)
show_predicted_vs_actual(models)
print_scores(models)
show_models_differences_graph(models)
And here are the results:

how to view tf-idf score against each word

I am trying to find the tf-idf score of each word in my document. However, the code only returns values in a sparse matrix, and I would like to see the tf-idf scores listed against each word.
I have preprocessed the text and the code works; I only want to change the way the result is presented:
code:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
bow_transformer = CountVectorizer(analyzer=text_process).fit(df["comments"].head())
print(len(bow_transformer.vocabulary_))
tfidf_transformer = CountVectorizer(analyzer=text_process).fit(messages['message'])
bow_transformer.vocabulary_transformer().fit(message_bow)
message_tfidf = tfidf_transformer.transform(message_bow)
I get the results like this (39028,01),(1393,1672). However, I expect the results to be like
features tfidf
fruit 0.00344
excellent 0.00289
You can achieve the above result by using following code:
def extract_topn_from_vector(feature_names, sorted_items, topn=5):
    """
    Return the first ``topn`` entries of ``sorted_items`` as
    ``(feature name, tf-idf score)`` tuples, highest score first.

    ``sorted_items`` holds ``(column index, score)`` pairs; scores are
    rounded to 3 decimal places. If a feature name occurs more than once
    among the kept entries, only its last score is retained.
    """
    # feature name -> rounded score, for the leading ``topn`` pairs only
    scores_by_name = {
        feature_names[col]: round(weight, 3)
        for col, weight in sorted_items[:topn]
    }
    ranked = list(scores_by_name.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
feature_names = count_vect.get_feature_names()
coo_matrix = message_tfidf.tocoo()
tuples = zip(coo_matrix.col, coo_matrix.data)
sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
# extract only the top n elements.
# Here, n is 10.
word_tfidf = extract_topn_from_vector(feature_names, sorted_items, 10)
print("{} {}".format("features", "tfidf"))
for k in word_tfidf:
print("{} - {}".format(k[0], k[1]))
Check out the full code below to get a better idea of above code snippet.
The below code is self-explanatory.
Full Code:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import string
import nltk
import pandas as pd
data = pd.read_csv('yourfile.csv')
stops = set(stopwords.words("english"))
wl = nltk.WordNetLemmatizer()
def clean_text(text):
    """
    Turn a raw document into a list of cleaned tokens.

    - Remove Punctuations
    - Tokenization
    - Remove Stopwords
    - stemming/lemmatizing

    Returns the lemmatized tokens of ``text`` that are not in ``stops``.
    """
    import re  # local import: this snippet's import list does not include re

    # Strip punctuation first, then tokenize the stripped text. Before, the
    # stripped string was computed but the raw text was the one tokenized.
    text_nopunct = "".join(char for char in text if char not in string.punctuation)
    # re.split yields empty strings at the edges; drop them so '' never
    # becomes a vocabulary entry.
    tokens = [token for token in re.split(r"\W+", text_nopunct) if token]
    return [wl.lemmatize(word) for word in tokens if word not in stops]
def extract_topn_from_vector(feature_names, sorted_items, topn=5):
    """
    Look up the feature name for each of the first ``topn``
    ``(index, score)`` pairs and return ``(name, score)`` tuples ordered by
    descending score, with scores rounded to 3 decimals.
    """
    leading = sorted_items[:topn]
    # A repeated feature name keeps its first position and its last score.
    name_to_score = {feature_names[i]: round(value, 3) for i, value in leading}
    pairs = list(name_to_score.items())
    pairs.sort(key=lambda item: item[1], reverse=True)
    return pairs
count_vect = CountVectorizer(analyzer=clean_text, tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000)
freq_term_matrix = count_vect.fit_transform(data['text_body'])
tfidf = TfidfTransformer(norm="l2")
tfidf.fit(freq_term_matrix)
feature_names = count_vect.get_feature_names()
# sample document
doc = 'watched horrid thing TV. Needless say one movies watch see much worse get.'
tf_idf_vector = tfidf.transform(count_vect.transform([doc]))
coo_matrix = tf_idf_vector.tocoo()
tuples = zip(coo_matrix.col, coo_matrix.data)
sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
# extract only the top n elements.
# Here, n is 10.
word_tfidf = extract_topn_from_vector(feature_names,sorted_items,10)
print("{} {}".format("features", "tfidf"))
for k in word_tfidf:
print("{} - {}".format(k[0], k[1]))
Sample output:
features tfidf
Needless - 0.515
horrid - 0.501
worse - 0.312
watched - 0.275
TV - 0.272
say - 0.202
watch - 0.199
thing - 0.189
much - 0.177
see - 0.164
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
# Fit tf-idf on the "comments" column and show it as a documents x terms table.
vect = TfidfVectorizer()
tfidf_matrix = vect.fit_transform(documents["comments"])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
get_names = getattr(vect, "get_feature_names_out", None) or vect.get_feature_names
df = pd.DataFrame(tfidf_matrix.toarray(), columns=get_names())
print(df)
sklearn : TFIDF Transformer : How to get tf-idf values of given words in document

Custom feature extraction class in scikit-learn

I am a beginner with scikit-learn. I am working on a classification problem for which I have to build a custom feature extraction class or method to compute the features for the training data.
I have written my custom feature extraction class as explained in this link. When I run my code, it shows me this error:
Traceback (most recent call last):
File "test.py", line 248, in <module>
pred = pipe.predict(X_test)
File "/usr/local/lib/python2.7/dist-packages/sklearn/utils/metaestimators.py", line 54, in <lambda>
out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/sklearn/pipeline.py", line 327, in predict
return self.steps[-1][-1].predict(Xt)
File "/usr/local/lib/python2.7/dist-packages/sklearn/linear_model/base.py", line 336, in predict
scores = self.decision_function(X)
File "/usr/local/lib/python2.7/dist-packages/sklearn/linear_model/base.py", line 317, in decision_function
% (X.shape[1], n_features))
ValueError: X has 174 features per sample; expecting 443
Below is my code snippet; I have also included my full code. Please tell me what I am doing wrong and why, along with suggestions so that my code runs without any error.
Code snippet :-
Here "y" is the list of category labels, one per sample. "corpus" is the list of all documents (data), where each document is represented as a string. "tfidf" and "lda" are the two functions from which I generate my feature vectors.
# doc_info_with_label holds (label, text) pairs; split them into two
# parallel lists with one entry per sample.
y = [label for label, _ in doc_info_with_label]
corpus = [text for _, text in doc_info_with_label]
class feature_extractor(TransformerMixin):
    """
    Transformer that runs several featurizer callables over the same input
    and joins their outputs column-wise.

    Each featurizer is called as f(X); its result is converted with
    np.array() and the arrays are concatenated along axis 1.

    NOTE(review): fit() learns nothing and transform() calls every
    featurizer afresh on whatever X it receives, so the number of output
    columns is whatever the featurizers return for that particular X.
    """

    def __init__(self, *featurizers):
        self.featurizers = featurizers

    def fit(self, X, y=None):
        """Do nothing and return self."""
        return self

    def transform(self, X):
        """Return the column-wise concatenation of every featurizer's output for X."""
        raw_outputs = [featurize(X) for featurize in self.featurizers]
        feature_vect = np.array(raw_outputs[0])
        # Glue each remaining featurizer's columns onto the right-hand side.
        for extra in raw_outputs[1:]:
            feature_vect = np.concatenate((feature_vect, np.array(extra)), axis=1)
        return feature_vect
# Combine the tf-idf and LDA featurizers into one transformer.
my_featurizer = feature_extractor(tfidf, lda)
# NOTE(review): X is not used by the split or the pipeline below, which
# work on the raw corpus.
X = my_featurizer.transform(corpus)
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, random_state=42)
pipe = make_pipeline(my_featurizer, svm.LinearSVC())
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
# Single-argument print(...) produces the same output on Python 2 and 3.
print("Expected output\n")
print(y_test)
print("\n")
print("Output\n")
print(pred)
print("\n")
score = pipe.score(X_test, y_test)
print(score)
print("\n")
# confusion_matrix takes (y_true, y_pred): rows are true labels, columns
# are predictions.
print(metrics.confusion_matrix(y_test, pred))
full code :-
# -*- coding: utf-8 -*-
#! /usr/bin/env python3
from gensim import corpora, models
import gensim
from operator import itemgetter
import numpy as np
import sys
import os
import re
import codecs
import io
import math
from scipy import sparse
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn import svm
from sklearn import metrics
from sklearn.pipeline import make_pipeline , Pipeline
# Python 2 idiom (reload is a builtin only there): make utf8 the default
# codec for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding('utf8')
# Always print arrays in full. The threshold must be numeric; sys.maxsize
# is the value the NumPy documentation gives for "never summarize".
np.set_printoptions(threshold=sys.maxsize)
# Devanagari suffixes grouped by their length in characters (the dict key).
# hi_stem() looks a group up by length and strips that many characters from
# a word that ends with one of the group's entries.
suffixes = {
1: ["ो", "े", "ू", "ु", "ी", "ि", "ा"],
2: ["कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें"],
3: ["ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं"],
4: ["ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"],
5: ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"],
}
# Class labels; each one is also the name of a training sub-directory.
categories = ['A', 'C', 'D', 'E']
# Numeric id attached to each class label.
mappings = {'A': 1, 'C': 3, 'D': 4, 'E': 5}
path = '/home/priyansh/Downloads/ltrc/1055/'
train_data_path = '/home/priyansh/Downloads/ltrc/extractor/clustering/four_class_devanagari/'
# One directory per class, in the same order as `categories`.
path1, path2, path3, path4 = (train_data_path + label + "/" for label in categories)
# Filled by store_data(): "<filename>+<class id>" keys of the files read.
documents = []
# Filled by store_data(): (class label, document text) tuples.
doc_info_with_label = []
def hi_stem(word):
    """
    Strip one suffix from `word` and return the stem.

    Suffix lengths are tried from 5 down to 1. A length is considered only
    when the word has more than length + 1 characters; the first length
    whose `suffixes` group contains an ending of the word wins. The word is
    returned unchanged when nothing matches.
    """
    for size in (5, 4, 3, 2, 1):
        if len(word) <= size + 1:
            continue
        if any(word.endswith(ending) for ending in suffixes[size]):
            return word[:-size]
    return word
def store_data(dir_path_list):
    """
    Read every file in the given class directories into the module-level
    `documents` and `doc_info_with_label` lists.

    dir_path_list -- iterable of directory paths; the last path component
                     of each is the class label and must be a key of
                     `mappings`

    Each file read appends "<filename>+<class id>" to `documents` and a
    (class label, text) tuple to `doc_info_with_label`, where text is the
    file's whitespace-separated words, each followed by one space. A file
    whose key is already in `documents` is skipped.

    Raises KeyError when a directory containing files is named after a
    label missing from `mappings`.
    """
    for dir_path in dir_path_list:
        # normpath drops a trailing "/", so basename yields the directory
        # name however deep the path is.
        class_name = os.path.basename(os.path.normpath(dir_path))
        for filename in os.listdir(dir_path):
            doc_key = filename + "+" + str(mappings[class_name])
            # `documents` stores keys of this form, so duplicates must be
            # detected on the key rather than on the bare file name.
            if doc_key in documents:
                continue
            documents.append(doc_key)
            infilename = os.path.join(dir_path, filename)
            words = []
            with codecs.open(infilename, 'r', 'utf-8') as fl:
                for line in fl:
                    words.extend(line.split())
            # Every word is followed by one space (trailing space included).
            text = "".join(word + " " for word in words)
            doc_info_with_label.append((class_name, text))
# The four class directories to ingest.
path_list = [path1, path2, path3, path4]
store_data(path_list)
# store_data() appended (label, text) pairs; split them into two parallel
# lists with one entry per sample.
y = [label for label, _ in doc_info_with_label]
corpus = [text for _, text in doc_info_with_label]
class feature_extractor(TransformerMixin):
    """
    Transformer that runs several featurizer callables over the same input
    and joins their outputs column-wise.

    Each featurizer is called as f(X); its result is converted with
    np.array() and the arrays are concatenated along axis 1.

    NOTE(review): fit() learns nothing and transform() calls every
    featurizer afresh on whatever X it receives, so the number of output
    columns is whatever the featurizers return for that particular X.
    """

    def __init__(self, *featurizers):
        self.featurizers = featurizers

    def fit(self, X, y=None):
        """Do nothing and return self."""
        return self

    def transform(self, X):
        """Return the column-wise concatenation of every featurizer's output for X."""
        raw_outputs = [featurize(X) for featurize in self.featurizers]
        feature_vect = np.array(raw_outputs[0])
        # Glue each remaining featurizer's columns onto the right-hand side.
        for extra in raw_outputs[1:]:
            feature_vect = np.concatenate((feature_vect, np.array(extra)), axis=1)
        return feature_vect
def tfidf_score(word, document_no, corpus_data, stopwords=None):
    """
    Return the tf-idf weight of `word` in one document of the corpus.

    word        -- the term to score
    document_no -- index into `corpus_data` of the document to score
    corpus_data -- list of documents, each a whitespace-separated string
    stopwords   -- optional iterable of stop words; when omitted they are
                   read from the stop word file, one or more per line

    tf  = occurrences of `word` / number of non-stopword tokens in the document
    idf = log(number of documents / (1 + number of documents containing `word`))

    Returns 0.0 when the document has no non-stopword tokens.
    """
    if stopwords is None:
        stopwords_path = '/home/priyansh/Downloads/ltrc/extractor/'
        stop_words_filename = 'stopwords.txt'
        stopwords = []
        with codecs.open(stopwords_path + stop_words_filename, 'r', 'utf-8') as fl:
            for line in fl:
                # A distinct loop name is not needed here: extend() never
                # rebinds `word`, which must survive for the idf step.
                stopwords.extend(line.split())
    stopwords = set(stopwords)

    content_tokens = [
        token for token in corpus_data[document_no].split()
        if token not in stopwords
    ]
    if not content_tokens:
        # Nothing left to normalise by; avoid dividing by zero.
        return 0.0
    tf = float(content_tokens.count(word)) / len(content_tokens)

    docs_with_word = sum(1 for doc in corpus_data if word in doc.split())
    # True division: floor division could turn the ratio into 0 and make
    # log() raise.
    idf = math.log(float(len(corpus_data)) / (1 + docs_with_word))
    return tf * idf
def tfidf(corpus_data):
    """
    Build a dense tf-idf matrix for `corpus_data`.

    corpus_data -- list of documents, each a whitespace-separated string

    Returns a list with one row (a list) per document and one column per
    distinct non-stopword in the corpus, columns numbered in order of first
    appearance. An entry is tfidf_score(word, row, corpus_data) for every
    word the document contains and 0 otherwise.

    NOTE: the vocabulary is rebuilt from `corpus_data` on every call, so
    the row width depends on the corpus that is passed in.
    """
    stopwords_path = '/home/priyansh/Downloads/ltrc/extractor/'
    stop_words_filename = 'stopwords.txt'
    stopwords = set()
    with codecs.open(stopwords_path + stop_words_filename, 'r', 'utf-8') as fl:
        for line in fl:
            stopwords.update(line.split())

    tokenized = [data.split() for data in corpus_data]

    # Column index of every non-stopword, assigned on first sight.
    word_id_mapping = {}
    for tokens in tokenized:
        for word in tokens:
            if word not in stopwords and word not in word_id_mapping:
                word_id_mapping[word] = len(word_id_mapping)

    feature_vect = []
    for doc_no, tokens in enumerate(tokenized):
        score_vect = [0] * len(word_id_mapping)
        # Score each distinct word of *this* document, not only the words
        # that made their first appearance in it.
        for word in set(tokens):
            if word in word_id_mapping:
                score_vect[word_id_mapping[word]] = tfidf_score(word, doc_no, corpus_data)
        feature_vect.append(score_vect)
    return feature_vect
def lda(corpus_data, num_topics=5, passes=10):
    """
    Describe every document by its single most probable LDA topic.

    corpus_data -- list of documents, each a whitespace-separated string
    num_topics  -- number of topics to fit (default 5)
    passes      -- training passes handed to the LDA model (default 10)

    Documents are split on whitespace, stop words are removed and the
    remaining tokens are stemmed with hi_stem(). A new LDA model is trained
    on `corpus_data` on every call.

    Returns a list with one row per document; each row has `num_topics`
    entries, all 0 except the slot of the top topic, which holds that
    topic's probability. A document for which the model reports no topic
    gets an all-zero row.
    """
    stopwords_path = '/home/priyansh/Downloads/ltrc/extractor/'
    stop_words_filename = 'stopwords.txt'
    stopwords = set()
    with codecs.open(stopwords_path + stop_words_filename, 'r', 'utf-8') as fl:
        for line in fl:
            stopwords.update(line.split())

    texts = [
        [hi_stem(token) for token in data.split() if token not in stopwords]
        for data in corpus_data
    ]
    dictionary = corpora.Dictionary(texts)
    bows = [dictionary.doc2bow(text) for text in texts]
    ldamodel = gensim.models.ldamodel.LdaModel(bows, num_topics=num_topics, id2word=dictionary, passes=passes)

    feature_vect = []
    for bow in bows:
        prob_vect = [0] * num_topics
        # (topic id, probability) pairs for this document.
        doc_topics = ldamodel[bow]
        if doc_topics:
            topic_num, topic_prob = max(doc_topics, key=itemgetter(1))
            prob_vect[topic_num] = topic_prob
        feature_vect.append(prob_vect)
    return feature_vect
# Combine the tf-idf and LDA featurizers into one transformer.
my_featurizer = feature_extractor(tfidf, lda)
# NOTE(review): X is not used by the split or the pipeline below, which
# work on the raw corpus.
X = my_featurizer.transform(corpus)
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, random_state=42)
pipe = make_pipeline(my_featurizer, svm.LinearSVC())
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
# Single-argument print(...) produces the same output on Python 2 and 3.
print("Expected output\n")
print(y_test)
print("\n")
print("Output\n")
print(pred)
print("\n")
score = pipe.score(X_test, y_test)
print(score)
print("\n")
# confusion_matrix takes (y_true, y_pred): rows are true labels, columns
# are predictions.
print(metrics.confusion_matrix(y_test, pred))

Resources