Sequential Model incompatible with layer - machine-learning

I've recently updated my project to include more intents for my NLU chatbot. I retrained the model. However, when I make an input into the program I receive an error message saying
File "C:\Users\jiann\ChatBot - Copy\chatbot.py", line 39, in predict_clas
s
res = model.predict(np.array([bow]))[0]
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-pack
ages\keras\utils\traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-pack
ages\tensorflow\python\framework\func_graph.py", line 1147, in autograph_ha
ndler
raise e.ag_error_metadata.to_exception(e)
ValueError: in user code:
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-pa
ckages\keras\engine\training.py", line 1801, in predict_function *
return step_function(self, iterator)
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-pa
ckages\keras\engine\training.py", line 1790, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-pa
ckages\keras\engine\training.py", line 1783, in run_step **
outputs = model.predict_step(data)
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-pa
ckages\keras\engine\training.py", line 1751, in predict_step
return self(x, training=False)
File "c:\users\jiann\appdata\local\programs\python\python39\lib\site-pa
ckages\keras\utils\traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
ckages\keras\engine\input_spec.py", line 264, in assert_input_compatibilityckages\keras\engine\input_spec.py", line 264, in assert_input_compatibilityckage
raise ValueError(f'Input {input_index} of layer "{layer_name}" is ' raise ValueError(f'Input {input_index} of layer "{layer_name}" is '
ValueError: Input 0 of layer "sequential" is incompatible with the laye
r: expected shape=(None, 9), found shape=(None, 40)
This error only pops up when I include more than one Intent. Below I've include the relevant code for the Sequential model and the Intents:
Intents.json:
{"intents": [
{"tag": "greeting",
"patterns": ["Hi", "How are you", "Is anyone there?", "Hello", "Good day", "Whats up", "Hey", "greetings"],
"responses": ["Hello!", "Good to see you again!", "Hi there, how can I help?"],
"context_set": ""
},
{"tag": "goodbye",
"patterns": ["cya", "See you later", "Goodbye", "I am Leaving", "Have a Good day", "bye", "cao", "see ya"],
"responses": ["Sad to see you go :(", "Talk to you later", "Goodbye!"],
"context_set": ""
},
{"tag": "stocks",
"patterns": ["what stocks do I own?", "how are my shares?", "what companies am I investing in?", "what am I doing in the markets?"],
"responses": ["You own the following shares: ABBV, AAPL, FB, NVDA and an ETF of the S&P 500 Index!"],
"context_set": ""
}
]
}
training.py:
import random
import json
import pickle
import numpy as np
import nltk
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizer_v2.gradient_descent import SGD
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Lemmatizer uses stem of a word instead of conjugate (performance purposes)
from nltk.stem import WordNetLemmatizer
from tensorflow import keras
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers i
# mport Dense, Activation, Dropout
# from tensorflow.keras.optimizers import SGD
lemmatizer = WordNetLemmatizer()
# Reading json file, pass to load function, get json object dictionary
intents = json.loads(open('intents.json').read())
words = []
classes = []
documents = []
# Characters that you won't pay attention to
ignore_letters = ['?', '!', '.', ',']
# Splits each pattern entry into individual words
for intent in intents['intents']:
for pattern in intent['patterns']:
word_list = nltk.word_tokenize(pattern)
words.extend(word_list)
#Wordlist belongs to specific tag
documents.append((word_list, intent['tag']))
if intent['tag'] not in classes:
classes.append(intent['tag'])
print(documents)
#lemmatizes word inf word list if it is not ignored
words = [lemmatizer.lemmatize(word) for word in words if word not in ignore_letters]
#Set Eliminates duplicate words
words = sorted(set(words))
classes = sorted(set(classes))
#Save the words in file
pickle.dump(words,open('words.pkl','wb'))
#Save classes in file
pickle.dump(classes,open('classes.pkl','wb'))
#CREATING THE TRAINING DATA
#Set individual word values to 0 or 1 depending on whether it occurs
training = []
output_empty = [0] * len(classes)
for document in documents:
bag = []
word_patterns = document[0]
word_patterns = [lemmatizer.lemmatize(word.lower()) for word in word_patterns]
for word in words:#checks to see if word is in pattern
bag.append(1) if word in word_patterns else bag.append(0)
output_row = list(output_empty)
#want to know class at index 1, want to know index,
# add class to oupt_row to 1
output_row[classes.index(document[1])] = 1
training.append([bag, output_row])
#shuffle the data
random.shuffle(training)
#turn into numpy array
training = np.array(training)
#split into x and y values, Features & Labels
train_x =list(training[:,0])
train_y = list(training[:,1])
#Start building Neural Network Model
model = Sequential()
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]),activation='softmax'))
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
hist = model.fit(np.array(train_x), np.array(train_y), epochs=200, batch_size=5, verbose=1)
model.save('chatbotmodel.h5',hist)
print('done')
chatbot.py:
import random
import pickle
import numpy as np
import nltk
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
from nltk.stem import WordNetLemmatizer
from keras.models import load_model
lemmatizer = WordNetLemmatizer()
words = pickle.load(open('words.pkl', 'rb'))
classes = pickle.load(open('classes.pkl', 'rb'))
model = load_model('chatbot_model.model')
print(classes)
def clean_up_sentence(sentence):
sentence_words = nltk.word_tokenize(sentence)
sentence_words = [lemmatizer.lemmatize(word) for word in sentence_words]
return sentence_words
def bag_of_words(sentence):
sentence_words = clean_up_sentence(sentence)
bag = [0] * len(words)
for w in sentence_words:
for i, word in enumerate(words):
if word == w:
bag[i] = 1
return np.array(bag)
def predict_class(sentence):
bow = bag_of_words(sentence)
res = model.predict(np.array([bow]))[0]
# allows for certain uncertainty.
# If Uncertainty is too high it won't allow to be taken into account
ERROR_THRESHOLD = 0.25
results = [[i, r] for i, r in enumerate(res) if r > ERROR_THRESHOLD]
results.sort(key=lambda x: x[1], reverse=True)
return_list = []
for r in results:
return_list.append({'intent': classes[r[0]], 'probability': str(r[1])})
return return_list
def get_response(intents_list, intents_json):
tag = intents_list[0]['intent']
list_of_intents = intents_json['intents']
for i in list_of_intents:
if i['tag'] == tag:
result = random.choice(i['responses'])
break
return result
print("Go! Bot is running!")
If I had to take a guess, it would be something wrong with the shape. I'm just not sure how to fix this.

There seems to be a mismatch between the input_shape of your model and the training sample(s) you are providing. I believe the issue stems from these two lines:
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
and,
res = model.predict(np.array([bow]))[0]
Depending on what value is returned by len(train_x[0]), calling model.predict() on np.array[bow] may not work if np.array[bow] does not match the input shape specified. Check out this answer for an in-depth explanation of how the various Keras inputs work.

Related

ValueError: 'logits' and 'labels' must have the same shape for NLP sentiment multi-class classifier

I am trying to make a NLP multi-class sentiment classifier where it takes in sentences as input and classifies them into three classes (negative, neutral and positive). However, when training the model, I run into the error where my logits (None, 3) are not the same size as my labels (None, 1) and the model can't begin training.
My model is a multi-class classifier and not a multi-label classifier since it is only predicting one label per object. I made sure that my last layer had an output of 3 and had the activation = 'softmax'. This should be correct from what I have searched online so I think that the problem lies with my labels.
Currently, my labels have a dimension of (None, 1) since I mapped each class to a unique integer and passed this as my test and train y values (which are in the form of one dimensional numpy array.
Right now I am confused if I have change the dimensions of this array to match the output dimensions and how to go about doing it.
import os
import sys
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.optimizers import SGD
device_name = tf.test.gpu_device_name()
if len(device_name) > 0:
print("Found GPU at: {}".format(device_name))
else:
device_name = "/device:CPU:0"
print("No GPU, using {}.".format(device_name))
# Load dataset into a dataframe
train_data_path = "/content/drive/MyDrive/ML Datasets/tweet_sentiment_analysis/train.csv"
test_data_path = "/content/drive/MyDrive/ML Datasets/tweet_sentiment_analysis/test.csv"
train_df = pd.read_csv(train_data_path, encoding='unicode_escape')
test_df = pd.read_csv(test_data_path, encoding='unicode_escape').dropna()
sentiment_types = ('neutral', 'negative', 'positive')
train_df['sentiment'] = train_df['sentiment'].astype('category')
test_df['sentiment'] = test_df['sentiment'].astype('category')
train_df['sentiment_cat'] = train_df['sentiment'].cat.codes
test_df['sentiment_cat'] = test_df['sentiment'].cat.codes
train_y = np.array(train_df['sentiment_cat'])
test_y = np.array(test_df['sentiment_cat'])
# Function to convert df into a list of strings
def convert_to_list(df, x):
selected_text_list = []
labels = []
for index, row in df.iterrows():
selected_text_list.append(str(row[x]))
labels.append(str(row['sentiment']))
return np.array(selected_text_list), np.array(labels)
train_sentences, train_labels = convert_to_list(train_df, 'selected_text')
test_sentences, test_labels = convert_to_list(test_df, 'text')
# Instantiate tokenizer and create word_index
tokenizer = Tokenizer(num_words=1000, oov_token='<oov>')
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# Convert sentences into a sequence
train_sequence = tokenizer.texts_to_sequences(train_sentences)
test_sequence = tokenizer.texts_to_sequences(test_sentences)
# Padding sequences
pad_test_seq = pad_sequences(test_sequence, padding='post')
max_len = pad_test_seq[0].size
pad_train_seq = pad_sequences(train_sequence, padding='post', maxlen=max_len)
model = tf.keras.Sequential([
tf.keras.layers.Embedding(10000, 64, input_length=max_len),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
tf.keras.layers.GlobalAveragePooling1D(),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(3, activation='softmax')
])
with tf.device(device_name):
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
num_epochs = 10
with tf.device(device_name):
history = model.fit(pad_train_seq, train_y, epochs=num_epochs, validation_data=(pad_test_seq, test_y), verbose=2)
Here is the error:
ValueError Traceback (most recent call last)
<ipython-input-28-62f3c6445887> in <module>
2
3 with tf.device(device_name):
----> 4 history = model.fit(pad_train_seq, train_y, epochs=num_epochs, validation_data=(pad_test_seq, test_y), verbose=2)
1 frames
/usr/local/lib/python3.8/dist-packages/keras/engine/training.py in tf__train_function(iterator)
13 try:
14 do_return = True
---> 15 retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
16 except:
17 do_return = False
ValueError: in user code:
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1051, in train_function *
return step_function(self, iterator)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1040, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1030, in run_step **
outputs = model.train_step(data)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 890, in train_step
loss = self.compute_loss(x, y, y_pred, sample_weight)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 948, in compute_loss
return self.compiled_loss(
File "/usr/local/lib/python3.8/dist-packages/keras/engine/compile_utils.py", line 201, in __call__
loss_value = loss_obj(y_t, y_p, sample_weight=sw)
File "/usr/local/lib/python3.8/dist-packages/keras/losses.py", line 139, in __call__
losses = call_fn(y_true, y_pred)
File "/usr/local/lib/python3.8/dist-packages/keras/losses.py", line 243, in call **
return ag_fn(y_true, y_pred, **self._fn_kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/losses.py", line 1930, in binary_crossentropy
backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits),
File "/usr/local/lib/python3.8/dist-packages/keras/backend.py", line 5283, in binary_crossentropy
return tf.nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)
ValueError: `logits` and `labels` must have the same shape, received ((None, 3) vs (None, 1)).
my logits (None, 3) are not the same size as my labels (None, 1)
I made sure that my last layer had an output of 3 and had the activation = 'softmax'
my labels have a dimension of (None, 1) since I mapped each class to a unique integer
The key concept you are missing is that you need to one-hot encode your labels (after assigning integers to them - see below).
So your model, after the softmax, is spitting out three values: how probable each of your labels is. E.g. it might say A is 0.6, B is 0.1, and C is 0.3. If the correct answer is C, then it needs to see that correct answer as 0, 0, 1. It can then say that its prediction for A is 0.6 - 0 = +0.6 wrong, B is 0.1 - 0 = +0.1 wrong, and C is 0.3 - 1 = -0.7 wrong.
Theoretically you can go from a string label directly to a one-hot encoding. But it seems Tensorflow needs the labels to first be encoded as integers, and then that is one-hot encoded.
https://www.tensorflow.org/api_docs/python/tf/keras/layers/CategoryEncoding#examples says to use:
tf.keras.layers.CategoryEncoding(num_tokens=3, output_mode="one_hot")
Also see https://stackoverflow.com/a/69791457/841830 (the higher-voted answer there is from 2019, so applies to TensorFlow v1 I think). And searching for "tensorflow one-hot encoding" will bring up plenty of tutorials and examples.
The issue here was indeed due to the shape of my labels not being the same as logits. Logits were of shape (3) since they contained a float for the probability of each of the three classes that I wanted to predict. Labels were originally of shape (1) since it only contained one int.
To solve this, I used one-hot encoding which turned all labels into a shape of (3) and this solved the problem. Used the keras.utils.to_categorical() function to do so.
sentiment_types = ('negative', 'neutral', 'positive')
train_df['sentiment'] = train_df['sentiment'].astype('category')
test_df['sentiment'] = test_df['sentiment'].astype('category')
# Turning labels from strings to int
train_sentiment_cat = train_df['sentiment'].cat.codes
test_sentiment_cat = test_df['sentiment'].cat.codes
# One-hot encoding
train_y = to_categorical(train_sentiment_cat)
test_y = to_categorical(test_sentiment_cat)

How to properly feed specific tensor to keras model

To allow using Keras model as part of standard tensorflow operations, I create a model using specific placeholder for the input.
However, when trying to do model.predict, I get an error:
InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape [100,84,84,4]
[[Node: Placeholder = Placeholder[dtype=DT_FLOAT, shape=[100,84,84,4], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
My code is given below:
from keras.layers import Convolution2D, Dense, Input
from keras.models import Model
from keras.optimizers import Nadam
from keras.losses import mean_absolute_error
from keras.activations import relu
import tensorflow as tf
import numpy as np
import gym
state_size = [100, 84, 84, 4]
input_tensor = tf.placeholder(dtype=tf.float32, shape=state_size)
inputL = Input(tensor=input_tensor)
h1 = Convolution2D(filters=32, kernel_size=(5,5), strides=(4,4), activation=relu) (inputL)
h2 = Convolution2D(filters=64, kernel_size=(3,3), strides=(2,2), activation=relu) (h1)
h3 = Convolution2D(filters=64, kernel_size=(3,3), activation=relu) (h2)
h4 = Dense(512, activation=relu) (h3)
out = Dense(18) (h4)
model = Model(inputL, out)
opt = Nadam()
disc_rate=0.99
sess = tf.Session()
dummy_input = np.ones(shape=state_size)
model.compile(opt, mean_absolute_error)
writer = tf.summary.FileWriter('./my_graph', sess.graph)
writer.close()
print(out)
print(model.predict({input_tensor: dummy_input}))
I have also trying feeding the input directly(no dictionary, just the value) - same exception. I can, however, get the model to work like:
print(sess.run( model.output, {input_tensor: dummy_input }))
Is there a way for me to still use normal Keras .predict method?
The following works (we need to initialize global variables):
sess.run(tf.global_variables_initializer()) # initialize
print(sess.run([model.output], feed_dict={input_tensor: dummy_input}))

Custom feature extraction class in scikit-learn

I am very beginner to the scikit-learn .I am working on some classification problem for which I have to build some custom feature extraction class or method to find the features for the training data.
I have made my custom feature extraction class as explain in this link. When i run my code it shows me this error :-
Traceback (most recent call last):
File "test.py", line 248, in <module>
pred = pipe.predict(X_test)
File "/usr/local/lib/python2.7/dist-packages/sklearn/utils/metaestimators.py", line 54, in <lambda>
out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/sklearn/pipeline.py", line 327, in predict
return self.steps[-1][-1].predict(Xt)
File "/usr/local/lib/python2.7/dist-packages/sklearn/linear_model/base.py", line 336, in predict
scores = self.decision_function(X)
File "/usr/local/lib/python2.7/dist-packages/sklearn/linear_model/base.py", line 317, in decision_function
% (X.shape[1], n_features))
ValueError: X has 174 features per sample; expecting 443
Below is my code snippet , also i have given my full code. Please tell me where i am doing wrong and why , along with the suggestions so that my code will run without any error.
Code snippet :-
Here "y" is a list of all categories or labelled group ."corpus" is the list of all documents (data) , where each doc. is represented like a string."tfidf" and "lda" are my two functions from which i am generating my feature vector
y = [d[0] for d in doc_info_with_label] #length is no:ofsamples
corpus = [d[1] for d in doc_info_with_label]
class feature_extractor(TransformerMixin):
def __init__(self,*featurizers):
self.featurizers = featurizers
def fit(self,X,y=None):
return self
def transform(self,X):
collection_features=[]
for f in self.featurizers:
collection_features.append(f(X))
feature_vect=np.array(collection_features[0])
if len(collection_features)>1:
for i in range(1,len(collection_features)):
feature_vect=np.concatenate((feature_vect,np.array(collection_features[i])),axis=1)
#print feature_vect.shape
return feature_vect
my_featurizer = feature_extractor(tfidf,lda)
X = my_featurizer.transform(corpus)
X_train , X_test , y_train , y_test = train_test_split(corpus,y,test_size=0.2,random_state=42)
pipe = make_pipeline(my_featurizer,svm.LinearSVC())
pipe.fit(X_train,y_train)
pred = pipe.predict(X_test)
print "Expected output\n"
print y_test
print "\n"
print "Output\n"
print pred
print "\n"
score = pipe.score(X_test,y_test)
print score
print "\n"
print metrics.confusion_matrix(pred,y_test)
full code :-
# -*- coding: utf-8 -*-
#! /usr/bin/env python3
from gensim import corpora, models
import gensim
from operator import itemgetter
import numpy as np
import sys
import os
import re
import codecs
import io
import math
from scipy import sparse
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn import svm
from sklearn import metrics
from sklearn.pipeline import make_pipeline , Pipeline
reload(sys)
sys.setdefaultencoding('utf8')
np.set_printoptions(threshold='nan')
suffixes = {
1: ["ो", "े", "ू", "ु", "ी", "ि", "ा"],
2: ["कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें"],
3: ["ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं"],
4: ["ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"],
5: ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"],
}
categories=['A','C','D','E']
mappings={}
mappings['A']=1
mappings['C']=3
mappings['D']=4
mappings['E']=5
path='/home/priyansh/Downloads/ltrc/1055/'
train_data_path='/home/priyansh/Downloads/ltrc/extractor/clustering/four_class_devanagari/'
path1=train_data_path+"A/"
path2=train_data_path+"C/"
path3=train_data_path+"D/"
path4=train_data_path+"E/"
documents=[] #contains all doc filenames along with class labels
doc_info_with_label=[] #two tuple storage of doc info along with their respective labels
def hi_stem(word):
for L in 5, 4, 3, 2, 1:
if len(word) > L + 1:
for suf in suffixes[L]:
if word.endswith(suf):
return word[:-L]
return word
def store_data(dir_path_list):
for dir_path in dir_path_list:
class_name = dir_path.split("/")[8]
for filename in os.listdir(dir_path):
if filename not in documents:
documents.append(filename+"+"+str(mappings[class_name]))
infilename=os.path.join(dir_path,filename)
with codecs.open(infilename,'r','utf-8') as fl:
string=''
for line in fl:
for word in line.split():
if word!=" " or word!="\n":
string+=word+" "
fl.close()
temp=[]
temp.append(class_name)
temp.append(string)
doc_info_with_label.append(tuple(temp))
path_list=[]
path_list.append(path1)
path_list.append(path2)
path_list.append(path3)
path_list.append(path4)
store_data(path_list)
y = [d[0] for d in doc_info_with_label] #length is no:ofsamples
corpus = [d[1] for d in doc_info_with_label]
class feature_extractor(TransformerMixin):
def __init__(self,*featurizers):
self.featurizers = featurizers
def fit(self,X,y=None):
return self
def transform(self,X):
collection_features=[]
for f in self.featurizers:
collection_features.append(f(X))
feature_vect=np.array(collection_features[0])
if len(collection_features)>1:
for i in range(1,len(collection_features)):
feature_vect=np.concatenate((feature_vect,np.array(collection_features[i])),axis=1)
#print feature_vect.shape
return feature_vect
def tfidf_score(word,document_no,corpus_data):
#print word
my_word=word
stopwords_path='/home/priyansh/Downloads/ltrc/extractor/'
stop_words_filename='stopwords.txt'
stopwords=[] #contain all stopwords
with codecs.open(stopwords_path+stop_words_filename,'r','utf-8') as fl:
for line in fl:
for word in line.split():
stopwords.append(word)
fl.close()
document=corpus_data[document_no]
#print document
wordcount=0
total=0
temp = document.split()
for i in temp:
#print i
if i not in stopwords:
total+=1
if i==my_word:
#print my_word
#print word
wordcount+=1
#print wordcount
#print total
tf = float(wordcount)/total
#print tf
#return tf(word,document)*idf(word,corpus_data)
total_docs = len(corpus_data)
count=0
for doc in corpus_data:
temp=[]
temp = doc.split()
for i in temp:
if i==word:
count+=1
break
total_docs_which_contains_the_words=count
idf = math.log(total_docs/(1+total_docs_which_contains_the_words))
return tf*idf
def tfidf(corpus_data):
word_id_mapping={}
cnt=0
stopwords_path='/home/priyansh/Downloads/ltrc/extractor/'
stop_words_filename='stopwords.txt'
stopwords=[] #contain all stopwords
with codecs.open(stopwords_path+stop_words_filename,'r','utf-8') as fl:
for line in fl:
for word in line.split():
stopwords.append(word)
fl.close()
unique_words_in_corpus={}
count=0
for data in corpus_data:
corpus_id=count
temp=[]
temp=data.split()
for word in temp:
if word not in unique_words_in_corpus:
unique_words_in_corpus[word]=corpus_id
count+=1
stopped_unique_words_in_corpus={}
for word in unique_words_in_corpus:
if word not in stopwords:
stopped_unique_words_in_corpus[word]=unique_words_in_corpus[word]
word_id_mapping[word]=cnt
cnt+=1
#print unique_words_in_corpus
#print stopped_unique_words_in_corpus
#print word_id_mapping
feature_vect=[None]*len(corpus_data)
#score_vect=[None]*cnt
for i in range(0,len(corpus_data)):
score_vect=[0]*cnt
for word in stopped_unique_words_in_corpus:
if i==stopped_unique_words_in_corpus[word]:
#print word
score=tfidf_score(word,i,corpus_data)
#print score
score_vect[word_id_mapping[word]]=score
feature_vect[i]=score_vect
return feature_vect
def lda(corpus_data):
stopwords_path='/home/priyansh/Downloads/ltrc/extractor/'
stop_words_filename='stopwords.txt'
stopwords=[] #contain all stopwords
with codecs.open(stopwords_path+stop_words_filename,'r','utf-8') as fl:
for line in fl:
for word in line.split():
stopwords.append(word)
fl.close()
texts=[]
for data in corpus_data:
#print data
tokens=[]
temp=[]
stopped_tokens=[]
temp = data.split()
for word in temp:
tokens.append(word)
#print tokens
for i in tokens:
if i not in stopwords:
stopped_tokens.append(i)
stemmed_tokens=[]
for token in stopped_tokens:
stemmed_token = hi_stem(token)
stemmed_tokens.append(stemmed_token)
texts.append(stemmed_tokens)
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
num_topics=5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word = dictionary, passes=10)
doc_topics=[]
for doc_vector in corpus:
doc_topics.append(ldamodel[doc_vector])
for i in range(0,len(doc_topics)):
doc_topics[i] = sorted(doc_topics[i],key=itemgetter(1),reverse=True)
feature_vect=[]
for i in doc_topics:
prob_vect=[0]*num_topics
#print i
topic_num = i[0][0]
topic_prob = i[0][1]
prob_vect[topic_num]=topic_prob
feature_vect.append(prob_vect)
#print i
#print feature_vect
return feature_vect
my_featurizer = feature_extractor(tfidf,lda)
X = my_featurizer.transform(corpus)
X_train , X_test , y_train , y_test = train_test_split(corpus,y,test_size=0.2,random_state=42)
pipe = make_pipeline(my_featurizer,svm.LinearSVC())
pipe.fit(X_train,y_train)
pred = pipe.predict(X_test)
print "Expected output\n"
print y_test
print "\n"
print "Output\n"
print pred
print "\n"
score = pipe.score(X_test,y_test)
print score
print "\n"
print metrics.confusion_matrix(pred,y_test)

How to encode categorical data for use with Semi-supervised algorithm LabelPropagation

I am attempting to use the anneal.arff dataset with Python scikit-learn's semisupervised algorithm LabelPropagation. The anneal dataset is categorical data, so I preprocessed it so that the output class for each item of instance
looks like [0. 0. 1. 0. 0.]. This is a numeric list that encodes the output class
as 5 possible values with 0's everywhere, and 1. in the position of the corresponding class. This is what I would expect.
For semi-supervised learning, most of the training data must be unlabeled, so
I modified the training set so that the unlabeled data has output [-1, -1, -1, -1, -1]. I previously tried just using -1, but the code emits the same error as shown below.
I train the classifier as follows, Y_train includes labeled and "unlabeled" data:
lp_model = LabelSpreading(gamma=0.25, max_iter=5)
lp_model.fit(X, Y_train)
I receive the error shown below after calling the fit method:
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\semi_supervised\label_propagation.py", line 221, in fit
X, y = check_X_y(X, y)
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 526, in check_X_y
y = column_or_1d(y, warn=True)
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 562, in column_or_1d
raise ValueError("bad input shape {0}".format(shape))
ValueError: bad input shape (538, 5)
This suggests that something is wrong with the shape of my Y_train list,
but this is the correct shape. What am I doing wrong?
Can LabelPropagation take as training data in this form, or does it only
accept unlabeled data as a scalar -1?
--- edit ---
Here is the code that generates the error. I'm sorry about the confusion over algorithms--I want to use both LabelSpreading and LabelPropagation, and choosing one or the other doesn't fix this error.
from scipy.io import arff
import pandas as pd
import numpy as np
import math
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from copy import deepcopy
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
f = "../../Documents/UCI/anneal.arff"
dataAsRecArray, meta = arff.loadarff(f)
dataset_raw = pd.DataFrame.from_records(dataAsRecArray)
dataset = pd.get_dummies(dataset_raw)
class_names = [col for col in dataset.columns if 'class_' in col]
print (dataset.shape)
number_of_output_columns = len(class_names)
print (number_of_output_columns)
def run(name, model, dataset, percent):
# Split-out validation dataset
array = dataset.values
X = array[:, 0:-number_of_output_columns]
Y = array[:, -number_of_output_columns:]
validation_size = 0.40
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
num_samples = len(Y_train)
num_labeled_points = math.floor(percent*num_samples)
indices = np.arange(num_samples)
unlabeled_set = indices[num_labeled_points:]
Y_train[unlabeled_set] = [-1, -1, -1, -1, -1]
lp_model = LabelSpreading(gamma=0.25, max_iter=5)
lp_model.fit(X_train, Y_train)
"""
predicted_labels = lp_model.transduction_[unlabeled_set]
print(predicted_labels[:10])
"""
if __name__ == "__main__":
#percentages = [0.1, 0.2, 0.3, 0.4]
percentages = [0.1]
models = []
models.append(('LS', LabelSpreading()))
#models.append(('CART', DecisionTreeClassifier()))
#models.append(('NB', GaussianNB()))
#models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
for percent in percentages:
run(name, model, dataset, percent)
print ("bye")
Your Y_train has shape (538, 5) but should be 1d. LabelPropagation doesn't support multi-label or multi-output multi-class right now.
The error message could be more informative, though :-/

How do you make TensorFlow + Keras fast with a TFRecord dataset?

What is an example of how to use a TensorFlow TFRecord with a Keras Model and tf.session.run() while keeping the dataset in tensors w/ queue runners?
Below is a snippet that works but it needs the following improvements:
Use the Model API
specify an Input()
Load a dataset from a TFRecord
Run through a dataset in parallel (such as with a queuerunner)
Here is the snippet, there are several TODO lines indicating what is needed:
from keras.models import Model
import tensorflow as tf
from keras import backend as K
from keras.layers import Dense, Input
from keras.objectives import categorical_crossentropy
from tensorflow.examples.tutorials.mnist import input_data
sess = tf.Session()
K.set_session(sess)
# Can this be done more efficiently than placeholders w/ TFRecords?
img = tf.placeholder(tf.float32, shape=(None, 784))
labels = tf.placeholder(tf.float32, shape=(None, 10))
# TODO: Use Input()
x = Dense(128, activation='relu')(img)
x = Dense(128, activation='relu')(x)
preds = Dense(10, activation='softmax')(x)
# TODO: Construct model = Model(input=inputs, output=preds)
loss = tf.reduce_mean(categorical_crossentropy(labels, preds))
# TODO: handle TFRecord data, is it the same?
mnist_data = input_data.read_data_sets('MNIST_data', one_hot=True)
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
sess.run(tf.global_variables_initializer())
# TODO remove default, add queuerunner
with sess.as_default():
for i in range(1000):
batch = mnist_data.train.next_batch(50)
train_step.run(feed_dict={img: batch[0],
labels: batch[1]})
print(loss.eval(feed_dict={img: mnist_data.test.images,
labels: mnist_data.test.labels}))
Why is this question relevant?
For high performance training without going back to python
no TFRecord to numpy to tensor conversions
Keras will soon be part of tensorflow
Demonstrate how Keras Model() classes can accept tensors for input data correctly.
Here is some starter information for a semantic segmentation problem example:
example unet Keras model unet.py, happens to be for semantic segmentation.
Keras + Tensorflow Blog Post
An attempt at running the unet model a tf session with TFRecords and a Keras model (not working)
Code to create the TFRecords: tf_records.py
An attempt at running the unet model a tf session with TFRecords and a Keras model is in densenet_fcn.py (not working)
I don't use tfrecord dataset format so won't argue on the pros and cons, but I got interested in extending Keras to support the same.
github.com/indraforyou/keras_tfrecord is the repository. Will briefly explain the main changes.
Dataset creation and loading
data_to_tfrecord and read_and_decode here takes care of creating tfrecord dataset and loading the same. Special care must be to implement the read_and_decode otherwise you will face cryptic errors during training.
Initialization and Keras model
Now both tf.train.shuffle_batch and Keras Input layer returns tensor. But the one returned by tf.train.shuffle_batch don't have metadata needed by Keras internally. As it turns out, any tensor can be easily turned into a tensor with keras metadata by calling Input layer with tensor param.
So this takes care of initialization:
x_train_, y_train_ = ktfr.read_and_decode('train.mnist.tfrecord', one_hot=True, n_class=nb_classes, is_train=True)
x_train_batch, y_train_batch = K.tf.train.shuffle_batch([x_train_, y_train_],
batch_size=batch_size,
capacity=2000,
min_after_dequeue=1000,
num_threads=32) # set the number of threads here
x_train_inp = Input(tensor=x_train_batch)
Now with x_train_inp any keras model can be developed.
Training (simple)
Lets say train_out is the output tensor of your keras model. You can easily write a custom training loop on the lines of:
loss = tf.reduce_mean(categorical_crossentropy(y_train_batch, train_out))
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
# sess.run(tf.global_variables_initializer())
sess.run(tf.initialize_all_variables())
with sess.as_default():
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
try:
step = 0
while not coord.should_stop():
start_time = time.time()
_, loss_value = sess.run([train_op, loss], feed_dict={K.learning_phase(): 0})
duration = time.time() - start_time
if step % 100 == 0:
print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
duration))
step += 1
except tf.errors.OutOfRangeError:
print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
finally:
coord.request_stop()
coord.join(threads)
sess.close()
Training (keras style)
One of the features of keras that makes it so lucrative is its generalized training mechanism with the callback functions.
But to support tfrecords type training there are several changes that are need in the fit function
running the queue threads
no feeding in batch data through feed_dict
supporting validation becomes tricky as the validation data will also be coming in through another tensor an different model needs to be internally created with shared upper layers and validation tensor fed in by other tfrecord reader.
But all this can be easily supported by another flag parameter. What makes things messing are the keras features sample_weight and class_weight they are used to weigh each sample and weigh each class. For this in compile() keras creates placeholders (here) and placeholders are also implicitly created for the targets (here) which is not needed in our case the labels are already fed in by tfrecord readers. These placeholders needs to be fed in during session run which is unnecessary in our cae.
So taking into account these changes, compile_tfrecord(here) and fit_tfrecord(here) are the extension of compile and fit and shares say 95% of the code.
They can be used in the following way:
import keras_tfrecord as ktfr
train_model = Model(input=x_train_inp, output=train_out)
ktfr.compile_tfrecord(train_model, optimizer='rmsprop', loss='categorical_crossentropy', out_tensor_lst=[y_train_batch], metrics=['accuracy'])
train_model.summary()
ktfr.fit_tfrecord(train_model, X_train.shape[0], batch_size, nb_epoch=3)
train_model.save_weights('saved_wt.h5')
You are welcome to improve on the code and pull requests.
Update 2018-08-29 this is now directly supported in keras, see the following example:
https://github.com/keras-team/keras/blob/master/examples/mnist_tfrecord.py
Original Answer:
TFRecords are supported by using an external loss. Here are the key lines constructing an external loss:
# tf yield ops that supply dataset images and labels
x_train_batch, y_train_batch = read_and_decode_recordinput(...)
# create a basic cnn
x_train_input = Input(tensor=x_train_batch)
x_train_out = cnn_layers(x_train_input)
model = Model(inputs=x_train_input, outputs=x_train_out)
loss = keras.losses.categorical_crossentropy(y_train_batch, x_train_out)
model.add_loss(loss)
model.compile(optimizer='rmsprop', loss=None)
Here is an example for Keras 2. It works after applying the small patch #7060:
'''MNIST dataset with TensorFlow TFRecords.
Gets to 99.25% test accuracy after 12 epochs
(there is still a lot of margin for parameter tuning).
'''
import os
import copy
import time
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import data_flow_ops
from keras import backend as K
from keras.models import Model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers import Input
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.callbacks import EarlyStopping
from keras.callbacks import TensorBoard
from keras.objectives import categorical_crossentropy
from keras.utils import np_utils
from keras.utils.generic_utils import Progbar
from keras import callbacks as cbks
from keras import optimizers, objectives
from keras import metrics as metrics_module
from keras.datasets import mnist
if K.backend() != 'tensorflow':
raise RuntimeError('This example can only run with the '
'TensorFlow backend for the time being, '
'because it requires TFRecords, which '
'are not supported on other platforms.')
def images_to_tfrecord(images, labels, filename):
def _int64_feature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
def _bytes_feature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
""" Save data into TFRecord """
if not os.path.isfile(filename):
num_examples = images.shape[0]
rows = images.shape[1]
cols = images.shape[2]
depth = images.shape[3]
print('Writing', filename)
writer = tf.python_io.TFRecordWriter(filename)
for index in range(num_examples):
image_raw = images[index].tostring()
example = tf.train.Example(features=tf.train.Features(feature={
'height': _int64_feature(rows),
'width': _int64_feature(cols),
'depth': _int64_feature(depth),
'label': _int64_feature(int(labels[index])),
'image_raw': _bytes_feature(image_raw)}))
writer.write(example.SerializeToString())
writer.close()
else:
print('tfrecord %s already exists' % filename)
def read_and_decode_recordinput(tf_glob, one_hot=True, classes=None, is_train=None,
batch_shape=[1000, 28, 28, 1], parallelism=1):
""" Return tensor to read from TFRecord """
print 'Creating graph for loading %s TFRecords...' % tf_glob
with tf.variable_scope("TFRecords"):
record_input = data_flow_ops.RecordInput(
tf_glob, batch_size=batch_shape[0], parallelism=parallelism)
records_op = record_input.get_yield_op()
records_op = tf.split(records_op, batch_shape[0], 0)
records_op = [tf.reshape(record, []) for record in records_op]
progbar = Progbar(len(records_op))
images = []
labels = []
for i, serialized_example in enumerate(records_op):
progbar.update(i)
with tf.variable_scope("parse_images", reuse=True):
features = tf.parse_single_example(
serialized_example,
features={
'label': tf.FixedLenFeature([], tf.int64),
'image_raw': tf.FixedLenFeature([], tf.string),
})
img = tf.decode_raw(features['image_raw'], tf.uint8)
img.set_shape(batch_shape[1] * batch_shape[2])
img = tf.reshape(img, [1] + batch_shape[1:])
img = tf.cast(img, tf.float32) * (1. / 255) - 0.5
label = tf.cast(features['label'], tf.int32)
if one_hot and classes:
label = tf.one_hot(label, classes)
images.append(img)
labels.append(label)
images = tf.parallel_stack(images, 0)
labels = tf.parallel_stack(labels, 0)
images = tf.cast(images, tf.float32)
images = tf.reshape(images, shape=batch_shape)
# StagingArea will store tensors
# across multiple steps to
# speed up execution
images_shape = images.get_shape()
labels_shape = labels.get_shape()
copy_stage = data_flow_ops.StagingArea(
[tf.float32, tf.float32],
shapes=[images_shape, labels_shape])
copy_stage_op = copy_stage.put(
[images, labels])
staged_images, staged_labels = copy_stage.get()
return images, labels
def save_mnist_as_tfrecord():
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]
images_to_tfrecord(images=X_train, labels=y_train, filename='train.mnist.tfrecord')
images_to_tfrecord(images=X_test, labels=y_test, filename='test.mnist.tfrecord')
def cnn_layers(x_train_input):
x = Conv2D(32, (3, 3), activation='relu', padding='valid')(x_train_input)
x = Conv2D(64, (3, 3), activation='relu')(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Dropout(0.25)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
x_train_out = Dense(classes,
activation='softmax',
name='x_train_out')(x)
return x_train_out
sess = tf.Session()
K.set_session(sess)
save_mnist_as_tfrecord()
batch_size = 100
batch_shape = [batch_size, 28, 28, 1]
epochs = 3000
classes = 10
parallelism = 10
x_train_batch, y_train_batch = read_and_decode_recordinput(
'train.mnist.tfrecord',
one_hot=True,
classes=classes,
is_train=True,
batch_shape=batch_shape,
parallelism=parallelism)
x_test_batch, y_test_batch = read_and_decode_recordinput(
'test.mnist.tfrecord',
one_hot=True,
classes=classes,
is_train=True,
batch_shape=batch_shape,
parallelism=parallelism)
x_batch_shape = x_train_batch.get_shape().as_list()
y_batch_shape = y_train_batch.get_shape().as_list()
x_train_input = Input(tensor=x_train_batch, batch_shape=x_batch_shape)
x_train_out = cnn_layers(x_train_input)
y_train_in_out = Input(tensor=y_train_batch, batch_shape=y_batch_shape, name='y_labels')
cce = categorical_crossentropy(y_train_batch, x_train_out)
train_model = Model(inputs=[x_train_input], outputs=[x_train_out])
train_model.add_loss(cce)
train_model.compile(optimizer='rmsprop',
loss=None,
metrics=['accuracy'])
train_model.summary()
tensorboard = TensorBoard()
# tensorboard disabled due to Keras bug
train_model.fit(batch_size=batch_size,
epochs=epochs) # callbacks=[tensorboard])
train_model.save_weights('saved_wt.h5')
K.clear_session()
# Second Session, pure Keras
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]
x_test_inp = Input(batch_shape=(None,) + (X_test.shape[1:]))
test_out = cnn_layers(x_test_inp)
test_model = Model(inputs=x_test_inp, outputs=test_out)
test_model.load_weights('saved_wt.h5')
test_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
test_model.summary()
loss, acc = test_model.evaluate(X_test, np_utils.to_categorical(y_test), classes)
print('\nTest accuracy: {0}'.format(acc))
I've also been working to improve the support for TFRecords in the following issue and pull request:
#6928 Yield Op support: High Performance Large Datasets via TFRecords, and RecordInput
#7102 Keras Input Tensor API Design Proposal
Finally, it is possible to use tf.contrib.learn.Experiment to train Keras models in TensorFlow.

Resources