Custom feature extraction class in scikit-learn - machine-learning

I am a complete beginner with scikit-learn. I am working on a classification problem for which I have to build a custom feature extraction class or method to compute the features for the training data.
I made my custom feature extraction class as explained in this link. When I run my code it shows me this error:
Traceback (most recent call last):
  File "test.py", line 248, in <module>
    pred = pipe.predict(X_test)
  File "/usr/local/lib/python2.7/dist-packages/sklearn/utils/metaestimators.py", line 54, in <lambda>
    out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/sklearn/pipeline.py", line 327, in predict
    return self.steps[-1][-1].predict(Xt)
  File "/usr/local/lib/python2.7/dist-packages/sklearn/linear_model/base.py", line 336, in predict
    scores = self.decision_function(X)
  File "/usr/local/lib/python2.7/dist-packages/sklearn/linear_model/base.py", line 317, in decision_function
    % (X.shape[1], n_features))
ValueError: X has 174 features per sample; expecting 443
Below is my code snippet, and my full code is given after it. Please tell me where I am going wrong and why, along with suggestions so that my code runs without errors.
Code snippet:
Here "y" is a list of all categories (labelled groups). "corpus" is the list of all documents (data), where each document is represented as a string. "tfidf" and "lda" are my two functions from which I generate my feature vector.
y = [d[0] for d in doc_info_with_label]  #length is no:of samples
corpus = [d[1] for d in doc_info_with_label]

class feature_extractor(TransformerMixin):
    def __init__(self, *featurizers):
        self.featurizers = featurizers

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        collection_features = []
        for f in self.featurizers:
            collection_features.append(f(X))
        feature_vect = np.array(collection_features[0])
        if len(collection_features) > 1:
            for i in range(1, len(collection_features)):
                feature_vect = np.concatenate((feature_vect, np.array(collection_features[i])), axis=1)
        #print feature_vect.shape
        return feature_vect

my_featurizer = feature_extractor(tfidf, lda)
X = my_featurizer.transform(corpus)
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, random_state=42)
pipe = make_pipeline(my_featurizer, svm.LinearSVC())
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
print "Expected output\n"
print y_test
print "\n"
print "Output\n"
print pred
print "\n"
score = pipe.score(X_test, y_test)
print score
print "\n"
print metrics.confusion_matrix(pred, y_test)
Full code:
# -*- coding: utf-8 -*-
#! /usr/bin/env python3
from gensim import corpora, models
import gensim
from operator import itemgetter
import numpy as np
import sys
import os
import re
import codecs
import io
import math
from scipy import sparse
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn import svm
from sklearn import metrics
from sklearn.pipeline import make_pipeline, Pipeline

reload(sys)
sys.setdefaultencoding('utf8')
np.set_printoptions(threshold='nan')

suffixes = {
    1: ["ो", "े", "ू", "ु", "ी", "ि", "ा"],
    2: ["कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें"],
    3: ["ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं"],
    4: ["ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"],
    5: ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"],
}

categories = ['A', 'C', 'D', 'E']
mappings = {}
mappings['A'] = 1
mappings['C'] = 3
mappings['D'] = 4
mappings['E'] = 5

path = '/home/priyansh/Downloads/ltrc/1055/'
train_data_path = '/home/priyansh/Downloads/ltrc/extractor/clustering/four_class_devanagari/'
path1 = train_data_path + "A/"
path2 = train_data_path + "C/"
path3 = train_data_path + "D/"
path4 = train_data_path + "E/"

documents = []  #contains all doc filenames along with class labels
doc_info_with_label = []  #two-tuple storage of doc info along with their respective labels

def hi_stem(word):
    for L in 5, 4, 3, 2, 1:
        if len(word) > L + 1:
            for suf in suffixes[L]:
                if word.endswith(suf):
                    return word[:-L]
    return word

def store_data(dir_path_list):
    for dir_path in dir_path_list:
        class_name = dir_path.split("/")[8]
        for filename in os.listdir(dir_path):
            if filename not in documents:
                documents.append(filename + "+" + str(mappings[class_name]))
            infilename = os.path.join(dir_path, filename)
            with codecs.open(infilename, 'r', 'utf-8') as fl:
                string = ''
                for line in fl:
                    for word in line.split():
                        if word != " " or word != "\n":
                            string += word + " "
            fl.close()
            temp = []
            temp.append(class_name)
            temp.append(string)
            doc_info_with_label.append(tuple(temp))

path_list = []
path_list.append(path1)
path_list.append(path2)
path_list.append(path3)
path_list.append(path4)
store_data(path_list)

y = [d[0] for d in doc_info_with_label]  #length is no:of samples
corpus = [d[1] for d in doc_info_with_label]

class feature_extractor(TransformerMixin):
    def __init__(self, *featurizers):
        self.featurizers = featurizers

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        collection_features = []
        for f in self.featurizers:
            collection_features.append(f(X))
        feature_vect = np.array(collection_features[0])
        if len(collection_features) > 1:
            for i in range(1, len(collection_features)):
                feature_vect = np.concatenate((feature_vect, np.array(collection_features[i])), axis=1)
        #print feature_vect.shape
        return feature_vect

def tfidf_score(word, document_no, corpus_data):
    #print word
    my_word = word
    stopwords_path = '/home/priyansh/Downloads/ltrc/extractor/'
    stop_words_filename = 'stopwords.txt'
    stopwords = []  #contains all stopwords
    with codecs.open(stopwords_path + stop_words_filename, 'r', 'utf-8') as fl:
        for line in fl:
            for word in line.split():
                stopwords.append(word)
    fl.close()
    document = corpus_data[document_no]
    #print document
    wordcount = 0
    total = 0
    temp = document.split()
    for i in temp:
        #print i
        if i not in stopwords:
            total += 1
            if i == my_word:
                #print my_word
                #print word
                wordcount += 1
    #print wordcount
    #print total
    tf = float(wordcount) / total
    #print tf
    #return tf(word, document) * idf(word, corpus_data)
    total_docs = len(corpus_data)
    count = 0
    for doc in corpus_data:
        temp = []
        temp = doc.split()
        for i in temp:
            if i == word:
                count += 1
                break
    total_docs_which_contains_the_words = count
    idf = math.log(total_docs / (1 + total_docs_which_contains_the_words))
    return tf * idf

def tfidf(corpus_data):
    word_id_mapping = {}
    cnt = 0
    stopwords_path = '/home/priyansh/Downloads/ltrc/extractor/'
    stop_words_filename = 'stopwords.txt'
    stopwords = []  #contains all stopwords
    with codecs.open(stopwords_path + stop_words_filename, 'r', 'utf-8') as fl:
        for line in fl:
            for word in line.split():
                stopwords.append(word)
    fl.close()
    unique_words_in_corpus = {}
    count = 0
    for data in corpus_data:
        corpus_id = count
        temp = []
        temp = data.split()
        for word in temp:
            if word not in unique_words_in_corpus:
                unique_words_in_corpus[word] = corpus_id
        count += 1
    stopped_unique_words_in_corpus = {}
    for word in unique_words_in_corpus:
        if word not in stopwords:
            stopped_unique_words_in_corpus[word] = unique_words_in_corpus[word]
            word_id_mapping[word] = cnt
            cnt += 1
    #print unique_words_in_corpus
    #print stopped_unique_words_in_corpus
    #print word_id_mapping
    feature_vect = [None] * len(corpus_data)
    #score_vect = [None] * cnt
    for i in range(0, len(corpus_data)):
        score_vect = [0] * cnt
        for word in stopped_unique_words_in_corpus:
            if i == stopped_unique_words_in_corpus[word]:
                #print word
                score = tfidf_score(word, i, corpus_data)
                #print score
                score_vect[word_id_mapping[word]] = score
        feature_vect[i] = score_vect
    return feature_vect

def lda(corpus_data):
    stopwords_path = '/home/priyansh/Downloads/ltrc/extractor/'
    stop_words_filename = 'stopwords.txt'
    stopwords = []  #contains all stopwords
    with codecs.open(stopwords_path + stop_words_filename, 'r', 'utf-8') as fl:
        for line in fl:
            for word in line.split():
                stopwords.append(word)
    fl.close()
    texts = []
    for data in corpus_data:
        #print data
        tokens = []
        temp = []
        stopped_tokens = []
        temp = data.split()
        for word in temp:
            tokens.append(word)
        #print tokens
        for i in tokens:
            if i not in stopwords:
                stopped_tokens.append(i)
        stemmed_tokens = []
        for token in stopped_tokens:
            stemmed_token = hi_stem(token)
            stemmed_tokens.append(stemmed_token)
        texts.append(stemmed_tokens)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    num_topics = 5
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
    doc_topics = []
    for doc_vector in corpus:
        doc_topics.append(ldamodel[doc_vector])
    for i in range(0, len(doc_topics)):
        doc_topics[i] = sorted(doc_topics[i], key=itemgetter(1), reverse=True)
    feature_vect = []
    for i in doc_topics:
        prob_vect = [0] * num_topics
        #print i
        topic_num = i[0][0]
        topic_prob = i[0][1]
        prob_vect[topic_num] = topic_prob
        feature_vect.append(prob_vect)
        #print i
    #print feature_vect
    return feature_vect

my_featurizer = feature_extractor(tfidf, lda)
X = my_featurizer.transform(corpus)
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, random_state=42)
pipe = make_pipeline(my_featurizer, svm.LinearSVC())
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
print "Expected output\n"
print y_test
print "\n"
print "Output\n"
print pred
print "\n"
score = pipe.score(X_test, y_test)
print score
print "\n"
print metrics.confusion_matrix(pred, y_test)

Related

Sequential Model incompatible with layer

I've recently updated my project to include more intents for my NLU chatbot. I retrained the model. However, when I enter an input into the program I receive an error message saying:
File "C:\Users\jiann\ChatBot - Copy\chatbot.py", line 39, in predict_class
    res = model.predict(np.array([bow]))[0]
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
    raise e.with_traceback(filtered_tb) from None
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-packages\tensorflow\python\framework\func_graph.py", line 1147, in autograph_handler
    raise e.ag_error_metadata.to_exception(e)
ValueError: in user code:
    File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-packages\keras\engine\training.py", line 1801, in predict_function *
        return step_function(self, iterator)
    File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-packages\keras\engine\training.py", line 1790, in step_function **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-packages\keras\engine\training.py", line 1783, in run_step **
        outputs = model.predict_step(data)
    File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-packages\keras\engine\training.py", line 1751, in predict_step
        return self(x, training=False)
    File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-packages\keras\engine\input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '
    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 9), found shape=(None, 40)
This error only pops up when I include more than one intent. Below I've included the relevant code for the Sequential model and the intents:
Intents.json:
{"intents": [
    {"tag": "greeting",
     "patterns": ["Hi", "How are you", "Is anyone there?", "Hello", "Good day", "Whats up", "Hey", "greetings"],
     "responses": ["Hello!", "Good to see you again!", "Hi there, how can I help?"],
     "context_set": ""
    },
    {"tag": "goodbye",
     "patterns": ["cya", "See you later", "Goodbye", "I am Leaving", "Have a Good day", "bye", "cao", "see ya"],
     "responses": ["Sad to see you go :(", "Talk to you later", "Goodbye!"],
     "context_set": ""
    },
    {"tag": "stocks",
     "patterns": ["what stocks do I own?", "how are my shares?", "what companies am I investing in?", "what am I doing in the markets?"],
     "responses": ["You own the following shares: ABBV, AAPL, FB, NVDA and an ETF of the S&P 500 Index!"],
     "context_set": ""
    }
]
}
training.py:
import random
import json
import pickle
import numpy as np
import nltk
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizer_v2.gradient_descent import SGD

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# The lemmatizer reduces words to their stem instead of conjugated forms (performance purposes)
from nltk.stem import WordNetLemmatizer
from tensorflow import keras
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Activation, Dropout
# from tensorflow.keras.optimizers import SGD

lemmatizer = WordNetLemmatizer()

# Read the json file, pass it to the load function, get a json object (dictionary)
intents = json.loads(open('intents.json').read())

words = []
classes = []
documents = []
# Characters that you won't pay attention to
ignore_letters = ['?', '!', '.', ',']

# Split each pattern entry into individual words
for intent in intents['intents']:
    for pattern in intent['patterns']:
        word_list = nltk.word_tokenize(pattern)
        words.extend(word_list)
        # The word list belongs to a specific tag
        documents.append((word_list, intent['tag']))
        if intent['tag'] not in classes:
            classes.append(intent['tag'])
print(documents)

# Lemmatize each word in the word list if it is not ignored
words = [lemmatizer.lemmatize(word) for word in words if word not in ignore_letters]
# set() eliminates duplicate words
words = sorted(set(words))
classes = sorted(set(classes))

# Save the words in a file
pickle.dump(words, open('words.pkl', 'wb'))
# Save the classes in a file
pickle.dump(classes, open('classes.pkl', 'wb'))

# CREATING THE TRAINING DATA
# Set individual word values to 0 or 1 depending on whether the word occurs
training = []
output_empty = [0] * len(classes)
for document in documents:
    bag = []
    word_patterns = document[0]
    word_patterns = [lemmatizer.lemmatize(word.lower()) for word in word_patterns]
    for word in words:  # check whether the word is in the pattern
        bag.append(1) if word in word_patterns else bag.append(0)
    output_row = list(output_empty)
    # Mark this document's class with a 1 at its index in the output row
    output_row[classes.index(document[1])] = 1
    training.append([bag, output_row])

# Shuffle the data
random.shuffle(training)
# Turn it into a numpy array
training = np.array(training)
# Split into x and y values: features & labels
train_x = list(training[:, 0])
train_y = list(training[:, 1])

# Start building the neural network model
model = Sequential()
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
hist = model.fit(np.array(train_x), np.array(train_y), epochs=200, batch_size=5, verbose=1)
model.save('chatbotmodel.h5', hist)
print('done')
chatbot.py:
import random
import pickle
import numpy as np
import nltk
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
from nltk.stem import WordNetLemmatizer
from keras.models import load_model

lemmatizer = WordNetLemmatizer()
words = pickle.load(open('words.pkl', 'rb'))
classes = pickle.load(open('classes.pkl', 'rb'))
model = load_model('chatbot_model.model')
print(classes)

def clean_up_sentence(sentence):
    sentence_words = nltk.word_tokenize(sentence)
    sentence_words = [lemmatizer.lemmatize(word) for word in sentence_words]
    return sentence_words

def bag_of_words(sentence):
    sentence_words = clean_up_sentence(sentence)
    bag = [0] * len(words)
    for w in sentence_words:
        for i, word in enumerate(words):
            if word == w:
                bag[i] = 1
    return np.array(bag)

def predict_class(sentence):
    bow = bag_of_words(sentence)
    res = model.predict(np.array([bow]))[0]
    # Allow for a certain amount of uncertainty;
    # predictions below the threshold are not taken into account
    ERROR_THRESHOLD = 0.25
    results = [[i, r] for i, r in enumerate(res) if r > ERROR_THRESHOLD]
    results.sort(key=lambda x: x[1], reverse=True)
    return_list = []
    for r in results:
        return_list.append({'intent': classes[r[0]], 'probability': str(r[1])})
    return return_list

def get_response(intents_list, intents_json):
    tag = intents_list[0]['intent']
    list_of_intents = intents_json['intents']
    for i in list_of_intents:
        if i['tag'] == tag:
            result = random.choice(i['responses'])
            break
    return result

print("Go! Bot is running!")
If I had to take a guess, it would be something wrong with the shape. I'm just not sure how to fix this.
There seems to be a mismatch between the input_shape of your model and the training sample(s) you are providing. I believe the issue stems from these two lines:
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
and,
res = model.predict(np.array([bow]))[0]
Depending on the value returned by len(train_x[0]), calling model.predict() on np.array([bow]) may not work if np.array([bow]) does not match the specified input shape. Check out this answer for an in-depth explanation of how the various Keras inputs work.
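A quick way to see the mismatch is to print both shapes side by side. This is only a hedged sanity check using the names from the question's chatbot.py (so model and bow are assumed to be in scope); the example values are taken from the traceback:

import numpy as np

print(model.input_shape)       # what the loaded model expects, e.g. (None, 9)
print(np.array([bow]).shape)   # what bag_of_words() produces, e.g. (1, 40)

If they disagree like this, one plausible cause (an assumption, not something the post confirms) is that words.pkl and classes.pkl were regenerated after adding intents while an older saved model is still being loaded; note that training.py saves 'chatbotmodel.h5' while chatbot.py loads 'chatbot_model.model'. Retraining and loading the freshly saved model should make the bag-of-words length match the first layer again.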

How to implement LIME in a Bert model?

I am new to machine learning. I noticed that such questions have been asked before but did not receive a proper solution. Below is my code for semantic similarity, and I want to implement LIME on top of it as a base. Please help me out.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Two lists of sentences
sentences1 = ['The cat sits outside',
              'A man is playing guitar',
              'The new movie is awesome']
sentences2 = ['The cat sits outside',
              'A woman watches TV',
              'The new movie is so great']

# Compute embeddings for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Compute cosine similarities
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

# Output the pairs with their score
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))
I don't know what Bert is, but try this sample code and see if it helps you.
import pandas as pd
import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.metrics
from sklearn.utils import shuffle
from io import StringIO
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline
df = pd.read_csv('C:\\Users\\ryans\\OneDrive\\Desktop\\Briefcase\\PDFs\\1-ALL PYTHON & R CODE SAMPLES\\A - GITHUB\\Natural Language Processing - Amazon Reviews\\Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv')
# let's experiment with some sentiment analysis concepts
# first we need to clean up the stuff in the independent field of the DF we are workign with
df.replace('\'','', regex=True, inplace=True)
df['review_title'] = df[['reviews.title']].astype(str)
df['review_text'] = df[['reviews.text']].astype(str)
df['review_title'] = df['reviews.title'].str.replace('\d+', '')
df['review_text'] = df['reviews.text'].str.replace('\d+', '')
# get rid of special characters
df['review_title'] = df['reviews.title'].str.replace(r'[^\w\s]+', '')
df['review_text'] = df['reviews.text'].str.replace(r'[^\w\s]+', '')
# get rid of double spaces
df['review_title'] = df['reviews.title'].str.replace(r'\^[a-zA-Z]\s+', '')
df['review_text'] = df['reviews.text'].str.replace(r'\^[a-zA-Z]\s+', '')
# convert all case to lower
df['review_title'] = df['reviews.title'].str.lower()
df['review_text'] = df['reviews.text'].str.lower()
list_corpus = df["review_text"].tolist()
list_labels = df["reviews.rating"].tolist()
X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, test_size=0.2, random_state=40)
vectorizer = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}', ngram_range=(1, 3), stop_words = 'english', binary=True)
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(train_vectors, y_train)
pred = logreg.predict(test_vectors)
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, average='weighted')
recall = recall_score(y_test, pred, average='weighted')
f1 = f1_score(y_test, pred, average='weighted')
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
list_corpus[3]
c = make_pipeline(vectorizer, logreg)
class_names=list(df.review_title.unique())
explainer = LimeTextExplainer(class_names=class_names)
idx = 3
exp = explainer.explain_instance(X_test[idx], c.predict_proba, num_features=6, labels=[1, 1])
print('Document id: %d' % idx)
print('Predicted class =', class_names[logreg.predict(test_vectors[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % class_names[y_test[idx]])
print ('Explanation for class %s' % class_names[1])
print ('\n'.join(map(str, exp.as_list(label=1))))
exp = explainer.explain_instance(X_test[idx], c.predict_proba, num_features=6, top_labels=2)
print(exp.available_labels())
exp.show_in_notebook(text=False)
https://towardsdatascience.com/explain-nlp-models-with-lime-shap-5c5a9f84d59b
https://marcotcr.github.io/lime/tutorials/Lime%20-%20multiclass.html
https://towardsdatascience.com/understanding-model-predictions-with-lime-a582fdff3a3b

How to save self-trained word2vec to a txt file with format like 'word2vec-google-news' or 'glove.6b.50d'

I wonder how I can save a self-trained word2vec to a txt file in a format like 'word2vec-google-news' or 'glove.6b.50d', which has each token followed by its matching vector.
I exported my self-trained vectors to a txt file, but it contains only the vectors, with no tokens in front of them.
My code for training my own word2vec:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import random
import numpy as np
from six.moves import xrange
import zipfile
import tensorflow as tf
import pandas as pd

filename = ('data/data.zip')

# Step 1: Read the data into a list of strings.
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

words = read_data(filename)
#print('Data size', len(words))

# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000

def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    #print("count", len(count))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
#del words  # Hint to reduce memory.
#print('Most common words (+UNK)', count[:5])
#print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

data_index = 0

# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
#for i in range(8):
#    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

# Step 4: Build and train a skip-gram model.
batch_size = 128
embedding_size = 128
skip_window = 2
num_skips = 2
valid_size = 9
valid_window = 100
num_sampled = 64  # Number of negative examples to sample.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

graph = tf.Graph()
with graph.as_default():
    # Input data.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        # Construct the variables for the NCE loss
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]), dtype=tf.float32)
    # Compute the average NCE loss for the batch.
    # tf.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights, biases=nce_biases, inputs=embed, labels=train_labels,
                       num_sampled=num_sampled, num_classes=vocabulary_size))
    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    # Compute the cosine similarity between minibatch examples and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    # Add variable initializer.
    init = tf.global_variables_initializer()

# Step 5: Begin training.
num_steps = 20000
with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    init.run()
    #print("Initialized")
    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run())
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        #if step % 2000 == 0:
        #    if step > 0:
        #        average_loss /= 2000
        #    # The average loss is an estimate of the loss over the last 2000 batches.
        #    print("Average loss at step ", step, ": ", average_loss)
        #    average_loss = 0
    final_embeddings = normalized_embeddings.eval()
    np.savetxt('data/w2v.txt', final_embeddings)
You may want to look at the implementation of _save_word2vec_format() in gensim for an example of Python code which writes that format:
https://github.com/RaRe-Technologies/gensim/blob/e859c11f6f57bf3c883a718a9ab7067ac0c2d4cf/gensim/models/utils_any2vec.py#L104
def _save_word2vec_format(fname, vocab, vectors, fvocab=None, binary=False, total_vec=None):
    """Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.

    Parameters
    ----------
    fname : str
        The file path used to save the vectors in.
    vocab : dict
        The vocabulary of words.
    vectors : numpy.array
        The vectors to be stored.
    fvocab : str, optional
        File path used to save the vocabulary.
    binary : bool, optional
        If True, the data will be saved in binary word2vec format, else it will be saved in plain text.
    total_vec : int, optional
        Explicitly specify total number of vectors
        (in case word vectors are appended with document vectors afterwards).

    """
    if not (vocab or vectors):
        raise RuntimeError("no input")
    if total_vec is None:
        total_vec = len(vocab)
    vector_size = vectors.shape[1]
    if fvocab is not None:
        logger.info("storing vocabulary in %s", fvocab)
        with utils.open(fvocab, 'wb') as vout:
            for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab_.count)))
    logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
    assert (len(vocab), vector_size) == vectors.shape
    with utils.open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
            row = vectors[vocab_.index]
            if binary:
                row = row.astype(REAL)
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row))))
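Applied to the question's own training script, here is a minimal sketch of that idea (my assumption: reverse_dictionary from Step 2 maps row index i of final_embeddings to its token) that writes the token in front of each vector instead of the plain np.savetxt call:

import io

# Hypothetical replacement for np.savetxt('data/w2v.txt', final_embeddings):
# each line is "<token> <v1> <v2> ...", after a "<vocab_size> <vector_size>" header
with io.open('data/w2v.txt', 'w', encoding='utf-8') as fout:
    fout.write(u"%d %d\n" % final_embeddings.shape)
    for idx, row in enumerate(final_embeddings):
        fout.write(u"%s %s\n" % (reverse_dictionary[idx], ' '.join(repr(v) for v in row)))

A file written this way should then load with gensim's KeyedVectors.load_word2vec_format(..., binary=False), like the 'word2vec-google-news' vectors.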

Size of weights extracted from a NN model becomes higher than the model

I tried extracting the weights from a .pb TensorFlow model and stored them in a text file. The size of the text file itself is larger than the model. Why is this happening?
Thanks in advance
Code to extract weights:
import tensorflow as tf
from tensorflow.python.platform import gfile
from tensorflow.python.framework import tensor_util
import operator
from functools import reduce
import matplotlib.pyplot as plt
import zlib
import pickle

PB_PATH = 'quantized_graph_resnet.pb'
with tf.Session() as sess:
    with gfile.FastGFile(PB_PATH, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        sess.graph.as_default()
        tf.import_graph_def(graph_def, name='')

graph_nodes = [n for n in graph_def.node]
wts = [n for n in graph_nodes if n.op == 'Const']

def check(l):
    for item in l:
        if(type(item) is list):
            return True
    return False

weightsFreq = {}
f = open('weights.txt', 'w')
for n in wts:
    print("Name of the node - %s" % n.name)
    if(True):
        l = (tensor_util.MakeNdarray(n.attr['value'].tensor)).tolist()
        if(isinstance(l, int)):
            f.write('%d' % l)
            f.write(' ')
            if l in weightsFreq:
                weightsFreq[l] += 1
            else:
                weightsFreq[l] = 1
            continue
        if(isinstance(l, float)):
            continue
        while(check(l)):
            l = reduce(operator.concat, l)
        for item in l:
            f.write('%d' % item)
            f.write(' ')
            # print(item)
            if item in weightsFreq:
                weightsFreq[item] += 1
            else:
                weightsFreq[item] = 1
# print("Value - ", tensor_util.MakeNdarray(n.attr['value'].tensor), type(tensor_util.MakeNdarray(n.attr['value'].tensor)), "\n")
Text files are a very inefficient way to store large quantities of decimal numbers: they use one byte for each digit of each number, whereas a binary file uses a fixed-size representation (4 bytes per number for a single-precision floating point number).
That is why the text file is much bigger than the binary one.
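As a rough illustration of the gap, here is a hypothetical float32 matrix (not the question's actual weights) saved both ways:

import numpy as np

w = np.random.randn(1000, 1000).astype(np.float32)
np.savetxt('weights.txt', w)   # text: roughly 25 bytes per number (digits, sign, exponent, separators)
w.tofile('weights.bin')        # binary: exactly 4 bytes per float32, 4,000,000 bytes in total

np.savetxt's default format is '%.18e', so every weight costs about six times as many bytes as its binary representation, before even counting the spaces and newlines.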

Tensorflow Shape Error (Latest Version) Multivariate Linear Regression

Keep in mind I am extremely new to tf.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston

np.set_printoptions(suppress=True)

boston = load_boston()
m = boston.data.shape[0] - 1

bt_unfixed = np.transpose(boston.data)
bt = np.insert(bt_unfixed, 0, 1)

Y = tf.placeholder(tf.float64)
X = tf.placeholder(tf.float64, [None, 13])
print X.shape
W = tf.Variable(np.transpose(np.array([0.00,0,0,0,0,0,0,0,0,0,0,0,0]).flatten()), name='weights')
b = tf.Variable(0.5, name='bias')

hypothesis = tf.add(tf.matmul(X, W), tf.cast(b, tf.float64))
loss = tf.reduce_sum(tf.square(hypothesis - Y)) / (2 * m)

optimizer = tf.train.GradientDescentOptimizer(0.01)
train_op = optimizer.minimize(loss)

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    for i in range(0, 1200):
        for (x, y) in zip(boston.data, boston.target.data):
            sess.run(train_op, feed_dict={X: x, Y: y})
    print "Done!\n"
    print "Running test...\n"
    t = sess.run(cost, feed_dict={X: boston.data[504], Y: boston.target.data[504]})
    print "loss =" + str(t) + " Real value " + str(boston.target.data[504]) + " Pred " + str(sess.run(hypothesis, feed_dict={X: boston.data[504]}))
Please feel free to correct any other errors in my code!
The error is thrown when I define hypothesis; I need to transpose my vector so that I can multiply the two together, but it is not working.
Traceback (most recent call last):
  File "multihouse.py", line 20, in <module>
    hypothesis = tf.add(tf.matmul(X, W), tf.cast(b, tf.float64))
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/math_ops.py", line 1398, in matmul
    name=name)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_math_ops.py", line 1348, in _mat_mul
    transpose_b=transpose_b, name=name)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 749, in apply_op
    op_def=op_def)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2382, in create_op
    set_shapes_for_outputs(ret)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1783, in set_shapes_for_outputs
    shapes = shape_func(op)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/common_shapes.py", line 596, in call_cpp_shape_fn
    raise ValueError(err.message)
ValueError: Shape must be rank 2 but is rank 1
The problem is that there is no such thing as the transposition of a 1-D vector in numpy. You have to define a matrix with one trivial dimension to work with such things, for example:
np.array([0.00,0,0,0,0,0,0,0,0,0,0,0,0]) # this cannot be transposed
np.array([[0.00,0,0,0,0,0,0,0,0,0,0,0,0]]) # this can
just like in this example
>>> import numpy as np
>>> np.transpose(np.array([1,1,1]))
array([1, 1, 1])
>>> np.transpose(np.array([[1,1,1]]))
array([[1],
       [1],
       [1]])
Once you fix this, your hypothesis will be correctly defined (right now you are trying to multiply a matrix by a 1-D vector, which is not defined in TF).
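A minimal sketch of the corresponding fix in the question's code (assumed shapes; the all-zero initialization is kept from the original):

# W as an explicit 13x1 matrix (rank 2), so tf.matmul(X, W) is defined:
# (None, 13) x (13, 1) -> (None, 1)
W = tf.Variable(np.zeros((13, 1)), name='weights')  # float64 by default, matching X
hypothesis = tf.add(tf.matmul(X, W), tf.cast(b, tf.float64))

Note that the rows you feed to X then also need a leading batch dimension, e.g. feed_dict={X: x.reshape(1, 13), Y: y}, since the placeholder is declared as [None, 13].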
