I have been working on legal data to recognize custom entities within legal documents. I am training a blank spaCy model on my cleaned training dataset (JSON format, as described in the spaCy documentation). I have removed punctuation, special characters, brackets, etc. from the training dataset. I have labelled the new entity 'dictkey' and recorded its start and end indices in the JSON in order to create the training dataset.
Below are the links to the training dataset and the code I am using. Could you please have a look at the attached training dataset and tell me whether there is any issue or further cleaning required?
https://techmailer.online/TRAIN_DATA3json.txt
https://techmailer.online/train_ner%20-%20Copy.py
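For reference, each record in the JSON file should pair the raw document text with the character offsets of the entity, roughly like this (the text and the offsets below are made up purely for illustration):

[
  ["Some clause text from the agreement between Party A and Party B.",
   {"entities": [[26, 35, "dictkey"]]}]
]

json.load turns these arrays into Python lists, which spaCy's training loop accepts just like the tuple form shown in the commented-out example in the script below.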
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
import re
import json
#import createtrainingdataset_updated_v2 as traindataset
# training data
#TRAIN_DATA = [
# ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
# ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
#]
with open('/Users/modis1/Desktop/24-01/TRAIN_DATA3json.txt', 'r', encoding='utf-8') as openfile:
    # parse the JSON training file into a list of [text, annotations] records
    TRAIN_DATA = json.load(openfile)
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir='/Users/A-GUPTA50/Desktop/ShubhamModi/NewSavedModel/', n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        # text = re.sub(r'\\u\d{4,}','', text.rstrip())
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

if __name__ == "__main__":
    plac.call(main)
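One thing worth checking on the dataset itself: every (start, end, 'dictkey') offset has to line up with spaCy's token boundaries, otherwise the span cannot be used as a training entity. A minimal sketch of such a check, assuming the file parses into the [text, {"entities": [...]}] records shown earlier (the path is the one from the script above):

import json
import spacy

nlp = spacy.blank("en")
with open('/Users/modis1/Desktop/24-01/TRAIN_DATA3json.txt', 'r', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    for start, end, label in annotations.get("entities", []):
        span = doc.char_span(start, end, label=label)
        if span is None:
            # the offsets do not match token boundaries - this entry needs cleaning
            print("Misaligned entity:", repr(text[start:end]), (start, end, label))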
I'm trying to use this model from Hugging Face for QA. The code for it is below:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
model_name = "deepset/roberta-base-squad2"
# a) Get predictions
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
QA_input = {
    'question': 'Why is model conversion important?',
    'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
}
res = nlp(QA_input)
# b) Load model & tokenizer
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(res)
>>>
{'score': 0.2117144614458084,
'start': 59,
'end': 84,
'answer': 'gives freedom to the user'}
However, I can't figure out how to get a loss so I can fine-tune this model. I was looking at the Hugging Face tutorial but didn't see anything other than using the Trainer class, or this other training method in the tutorial (which is not for QA):
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
# This is new
batch["labels"] = torch.tensor([1, 1])
optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()
Say that the true answer is "freedom to the user" instead of "gives freedom to the user".
You do not have to compute the loss yourself. There is the Trainer class in Hugging Face Transformers, and you can use it to train your model. It is also optimized for Hugging Face models and implements many deep-learning best practices that you might be interested in. See here: https://huggingface.co/docs/transformers/main_classes/trainer
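That said, if you do want a raw loss to backpropagate yourself, AutoModelForQuestionAnswering computes one when you pass the gold start and end token positions. Below is a minimal sketch for the example above, using the tokenizer's offset mapping to turn the character span of "freedom to the user" into token positions (this assumes a fast tokenizer, which AutoTokenizer returns here by default):

import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

question = "Why is model conversion important?"
context = ("The option to convert models between FARM and transformers gives freedom "
           "to the user and let people easily switch between frameworks.")

# character span of the gold answer inside the context
answer = "freedom to the user"
answer_start_char = context.find(answer)
answer_end_char = answer_start_char + len(answer)

inputs = tokenizer(question, context, return_tensors="pt", return_offsets_mapping=True)
offset_mapping = inputs.pop("offset_mapping")[0].tolist()
sequence_ids = inputs.sequence_ids(0)  # None = special token, 0 = question, 1 = context

# map the character span to token indices, looking only at context tokens
start_token = end_token = 0
for i, (seq_id, (start, end)) in enumerate(zip(sequence_ids, offset_mapping)):
    if seq_id != 1:
        continue
    if start <= answer_start_char < end:
        start_token = i
    if start < answer_end_char <= end:
        end_token = i

outputs = model(
    **inputs,
    start_positions=torch.tensor([start_token]),
    end_positions=torch.tensor([end_token]),
)
loss = outputs.loss  # cross-entropy over the start and end positions
loss.backward()      # from here you can step an optimizer, as in the classification example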
I was trying to deploy my custom DecisionTreeRegressor for house price prediction to Google Cloud Vertex AI. The tutorial I followed was the MPG dataset tutorial.
However, when I tried to build and test the container locally using commands:
docker build ./ -t $IMAGE_URI
docker run $IMAGE_URI
The error message came out:
AttributeError: 'DecisionTreeRegressor' object has no attribute 'save'
The code I run as train.py:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit
# Load the Boston housing dataset
data = pd.read_csv('trainer/housing.csv')
prices = data['MEDV']
features = data.drop('MEDV', axis = 1)
# Import 'train_test_split'
from sklearn.model_selection import train_test_split
# Shuffle and split the data into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(features, prices, test_size=0.2, random_state = 42)
#Defining model fitting and tuning functions
# Import 'make_scorer', 'DecisionTreeRegressor', and 'GridSearchCV'
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score # Import 'r2_score'
from sklearn.metrics import accuracy_score
# TODO: replace `your-gcs-bucket` with the name of the Storage bucket you created earlier
BUCKET = 'gs://gardena-dps-bucket'
def performance_metric(y_true, y_predict):
    """ Calculates and returns the performance score between
        true (y_true) and predicted (y_predict) values based on the metric chosen. """
    score = r2_score(y_true, y_predict)
    # Return the score
    return score

def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a
        decision tree regressor trained on the input data [X, y]. """
    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)
    # Create a decision tree regressor object
    regressor = DecisionTreeRegressor()
    # Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
    # Transform 'performance_metric' into a scoring function using 'make_scorer'
    scoring_fnc = make_scorer(performance_metric)
    # Create the grid search cv object --> GridSearchCV()
    # Make sure to include the right parameters in the object:
    # (estimator, param_grid, scoring, cv) which have values 'regressor', 'params', 'scoring_fnc', and 'cv_sets' respectively.
    grid = GridSearchCV(estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)
    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)
    # Return the optimal model after fitting the data
    return grid.best_estimator_
# Fit the training data to the model using grid search
reg = fit_model(X_train, y_train)
# Produce a matrix for client data
client_data = [[12, 26.3, 16.99885]] # Client data in 2D array
# Show predictions
reprice = reg.predict(client_data).astype(int)
reprice
# Export model and save to GCS
reg.save(BUCKET + '/housing/model')
Scikit-learn estimators do not provide a save() method. According to the Google documentation, the best way to store a fitted model in GCS is to serialize the model locally with joblib and then upload it to GCS, as follows:
import datetime

from google.cloud import storage
import joblib  # on older scikit-learn versions this was: from sklearn.externals import joblib

# Export the model to a file
model = 'model.joblib'
joblib.dump(pipeline, model)  # 'pipeline' is your fitted estimator

# Upload the model to GCS
bucket = storage.Client().bucket(BUCKET_NAME)  # bucket name without the gs:// prefix
blob = bucket.blob('{}/{}'.format(
    datetime.datetime.now().strftime('model_%Y%m%d_%H%M%S'),
    model))
blob.upload_from_filename(model)
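Adapted to the variables in your question (reg is the fitted regressor and BUCKET already includes the gs:// prefix), that would look roughly like this; it is a sketch, not something tested against your project:

import joblib
from google.cloud import storage

# serialize the fitted DecisionTreeRegressor locally
joblib.dump(reg, 'model.joblib')

# upload it under the housing/ prefix of the bucket
bucket_name = BUCKET.replace('gs://', '')  # the storage client expects the bare bucket name
bucket = storage.Client().bucket(bucket_name)
blob = bucket.blob('housing/model/model.joblib')
blob.upload_from_filename('model.joblib')

If you ever switch from your custom container to Vertex AI's pre-built scikit-learn serving container, note that it expects the uploaded artifact to be named model.joblib.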
I wonder what is the right way to use torchtext for inference.
Let's assume I've trained the model and dumped all the Fields with their built vocabularies. It seems the next step is to use torchtext.data.Example to load one single example. Somehow I should numericalize it using the loaded Fields and create an Iterator.
I would appreciate any simple examples of using torchtext for inference.
For a trained model and vocabulary (the vocabulary is part of the text Field, so you don't have to save the whole class):
def read_vocab(path):
    # read vocabulary pkl
    import pickle
    pkl_file = open(path, 'rb')
    vocab = pickle.load(pkl_file)
    pkl_file.close()
    return vocab

def load_model_and_vocab():
    import torch
    import os.path
    my_path = os.path.abspath(os.path.dirname(__file__))
    vocab_path = os.path.join(my_path, vocab_file)  # vocab_file: name of the saved vocabulary pickle
    weights_path = os.path.join(my_path, WEIGHTS)   # WEIGHTS: name of the saved state_dict file
    vocab = read_vocab(vocab_path)
    model = classifier(vocab_size=len(vocab))
    model.load_state_dict(torch.load(weights_path))
    model.eval()
    return model, vocab

def predict(model, vocab, sentence):
    tokenized = [w.text.lower() for w in nlp(sentence)]  # tokenize the sentence (nlp is a spaCy tokenizer)
    indexed = [vocab.stoi[t] for t in tokenized]          # convert to integer sequence
    length = [len(indexed)]                               # compute no. of words
    tensor = torch.LongTensor(indexed).to('cpu')          # convert to tensor
    tensor = tensor.unsqueeze(1).T                        # reshape in form of batch, no. of words
    length_tensor = torch.LongTensor(length)              # convert to tensor
    prediction = model(tensor, length_tensor)             # prediction
    return round(1 - prediction.item())
"classifier" is the class I defined for my model.
For saving the vocabulary pkl :
def save_vocab(vocab):
    import pickle
    output = open('vocab.pkl', 'wb')
    pickle.dump(vocab, output)
    output.close()
And for saving the model after training you can use :
torch.save(model.state_dict(), 'saved_weights.pt')
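For completeness, a rough sketch of how the pieces above fit together; vocab_file, WEIGHTS and the example sentence are placeholders matching the file names used above, and TEXT is assumed to be the torchtext Field built on your training data:

# after training
save_vocab(TEXT.vocab)  # TEXT is the torchtext Field for the text column
torch.save(model.state_dict(), 'saved_weights.pt')

# later, at inference time
vocab_file = 'vocab.pkl'
WEIGHTS = 'saved_weights.pt'
model, vocab = load_model_and_vocab()
print(predict(model, vocab, "some example sentence to classify"))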
Tell me if it worked for you!
This is my code. How can I convert the model into a Node REST API? I have created the training set and saved the model. Can anyone help me with the API part? I have tried but was not successful.
training = []
output = []
# create an empty array for our output
output_empty = [0] * len(classes)
# training set, bag of words for each sentence
for doc in documents:
    # initialize our bag of words
    bag = []
    # list of tokenized words for the pattern
    pattern_words = doc[0]
    # stem each word
    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]
    # create our bag of words array
    for w in words:
        bag.append(1) if w in pattern_words else bag.append(0)
    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    training.append([bag, output_row])
# shuffle our features and turn into np.array
random.shuffle(training)
training = np.array(training)
# create train and test lists
train_x = list(training[:,0])
train_y = list(training[:,1])
tf.reset_default_graph()
# Build neural network
net = tflearn.input_data(shape=[None, len(train_x[0])])
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')
net = tflearn.regression(net)
# Define model and setup tensorboard
model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')
# Start training (apply gradient descent algorithm)
model.fit(train_x, train_y, n_epoch=4000, batch_size=8, show_metric=True)
# saving the model
model.save('model.tflearn')
# save all of our data structures
import pickle
pickle.dump( {'words':words, 'classes':classes, 'train_x':train_x, 'train_y':train_y}, open( "training_data", "wb" ) )
import pickle
data = pickle.load( open( "training_data", "rb" ) )
words = data['words']
classes = data['classes']
train_x = data['train_x']
train_y = data['train_y']
# import our chat-bot intents file
import json
with open('D:\\android\\ad.json') as json_data:
    intents = json.load(json_data)
def clean_up_sentence(sentence):
    # tokenize the pattern
    sentence_words = nltk.word_tokenize(sentence)
    # stem each word
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    return sentence_words

# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
def bow(sentence, words, show_details=False):
    # tokenize the pattern
    sentence_words = clean_up_sentence(sentence)
    # bag of words
    bag = [0] * len(words)
    for s in sentence_words:
        for i, w in enumerate(words):
            if w == s:
                bag[i] = 1
                if show_details:
                    print("found in bag: %s" % w)
    return np.array(bag)
ERROR_THRESHOLD = 0.25
# classifying the inputs
def classify(sentence):
    # generate probabilities from the model
    results = model.predict([bow(sentence, words)])[0]
    # filter out predictions below a threshold
    results = [[i, r] for i, r in enumerate(results) if r > ERROR_THRESHOLD]
    # sort by strength of probability
    results.sort(key=lambda x: x[1], reverse=True)
    return_list = []
    for r in results:
        return_list.append((classes[r[0]], r[1]))
    # return tuple of intent and probability
    return return_list
def response(sentence, userID='123', show_details=False):
    results = classify(sentence)
    # if we have a classification then find the matching intent tag
    if results:
        # loop as long as there are matches to process
        while results:
            for i in intents['intents']:
                # find a tag matching the first result
                if i['tag'] == results[0][0]:
                    # a random response from the intent
                    return print(random.choice(i['response']))
There are multiple ways you can do this. You can use a server framework like Flask or Django. I will present a simple example using Flask (note this is just an abstract prototype):
Create a model class
# import libraries

class Model():
    def __init__(self):
        self.model = load()  # load() is your own function that returns the trained model

    def inference(self, inputs):
        return self.model.predict(inputs)
Note, this is just a prototype, the functions are implemented by you.
Create a REST endpoint
from flask import Flask, request, jsonify
from model import Model

app = Flask("__main__")
model = Model()

@app.route("/inference", methods=["POST"])
def inference():
    data = request.get_json()
    results = model.inference(data["inputs"])
    return jsonify(
        {"result": results}
    )

if __name__ == "__main__":
    app.run()
Then you can use curl to test the endpoint, and you can also use axios or fetch to send a POST request to it. Don't forget to enable CORS if you are calling the API from a different origin.
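For example, a quick test from Python (assuming Flask's default port 5000 and the /inference route defined above; the input vector is just a made-up example):

import requests

resp = requests.post(
    "http://localhost:5000/inference",
    json={"inputs": [[0, 1, 0, 1, 1, 0]]},  # hypothetical bag-of-words vector for one sentence
)
print(resp.json())  # {"result": ...}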
Thank you
According to the TFLearn documentation, the library remains compatible with TensorFlow.
Google released TensorFlow.js, which works both as a browser-based and a Node.js JavaScript library.
TensorFlow models can be loaded into TensorFlow.js as described in the link:
https://js.tensorflow.org/tutorials/import-saved-model.html
For reference, the model needs to be converted to the TF.js format:
- You need to install TensorFlow.js in your Python environment first:
pip install tensorflowjs
- Convert the existing TensorFlow model to the TensorFlow.js web format:
tensorflowjs_converter \
--input_format=tf_saved_model \
--output_node_names='Some/Model/Name' \
--saved_model_tags=serve \
/my/saved_model \
/my/web_model
- Load the saved model in a Node.js environment:
const model = await tf.loadModel('file:///mypath/mymodel.json');
If I get the optimal parameters using GridSearchCV and a pipeline, is there any way to save the trained model, so that in the future I can call the entire pipeline on new data and generate predictions for it? For example, I have the following pipeline followed by a GridSearchCV over the parameters:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(SVC(probability=True))),
])
parameters = {
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),  # unigrams, bigrams or trigrams
    'clf__estimator__kernel': ('rbf', 'linear'),
    'clf__estimator__C': tuple([10**i for i in range(-10, 11)]),
}
grid_search = GridSearchCV(pipeline,parameters,n_jobs=-1,verbose=1)
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
#Conduct the grid search
grid_search.fit(X,y)
print("done in %0.3fs" % (time() - t0))
print()
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
#Obtain the top performing parameters
best_parameters = grid_search.best_estimator_.get_params()
#Print the results
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
Now I want to save all of these steps as a single flow, so that I can apply it to a new, unseen dataset and it will use the same parameters, vectorizers and transformers to transform the data, run the model and report back the results.
You can just pickle the GridSearchCV object to save it and then unpickle it when you want to use it to predict new data.
import pickle

# Fit model and pickle the fitted model
grid_search.fit(X, y)
with open('/model/path/model_pickle_file', "wb") as fp:  # pickle requires binary mode
    pickle.dump(grid_search, fp)

# Load model from file
with open('/model/path/model_pickle_file', "rb") as fp:
    grid_search_load = pickle.load(fp)

# Predict new data with model loaded from disk
y_new = grid_search_load.best_estimator_.predict(X_new)
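Two small notes on this. Since GridSearchCV refits the best estimator on the full training data by default (refit=True), you can also call grid_search_load.predict(X_new) directly. And for scikit-learn objects that contain large numpy arrays, the scikit-learn docs suggest joblib instead of plain pickle; a minimal sketch, where the file name is just an example:

import joblib

# persist the whole fitted grid search (including the best pipeline)
joblib.dump(grid_search, 'model_pipeline.joblib')

# later: load it and predict on unseen documents
grid_search_load = joblib.load('model_pipeline.joblib')
y_new = grid_search_load.predict(X_new)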