I'm trying to match all e-mail-like text in a bunch of documents and add it to a custom NER label called 'EMAIL'.
Here is the code for a test case.
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
EMAIL = nlp.vocab.strings['EMAIL']
def add_email_ent(matcher, doc, i, matches):
    """Matcher callback: add the i-th match to ``doc.ents`` under the ``EMAIL`` label id."""
    # The match's own id is not used; the span is labelled with the module-level EMAIL id.
    _, span_start, span_end = matches[i]
    email_entity = (EMAIL, span_start, span_end)
    doc.ents = doc.ents + (email_entity,)
# Register the single-token pattern under the key 'EmailPII'; add_email_ent is
# the on-match callback.
matcher.add('EmailPII', add_email_ent, [{'LIKE_EMAIL': True}])
# The sample addresses need a real '@': with '#' in its place the tokens do not
# look like e-mail addresses, so the LIKE_EMAIL pattern has nothing to match.
text = u"Hi, this is John. My email is john@ymail.com and an alternate is john@gmail.com"
doc = nlp(text)
matches = matcher(doc)
# List every match (1-based) followed by the entities now set on the doc.
for i, (match_id, start, end) in enumerate(matches):
    print (i+1, doc[start:end])
for ent in doc.ents:
    print (ent.text, ent.label_)
Here's what I get when I run this code.
Traceback (most recent call last):
File "C:/Python27/emailpii.py", line 26, in <module>
matches = matcher(doc)
File "matcher.pyx", line 407, in spacy.matcher.Matcher.__call__
File "C:/Python27/emailpii.py", line 19, in add_event_ent
doc.ents += ((EMAIL, start, end),)
File "doc.pyx", line 415, in spacy.tokens.doc.Doc.ents.__get__
File "span.pyx", line 61, in spacy.tokens.span.Span.__cinit__
AssertionError: 17587345535198158200
However, on running a similar example
import spacy
print "*****************"
print(spacy.__version__)
print "*****************"
from spacy.matcher import Matcher
#from spacy import displacy
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
EVENT = nlp.vocab.strings['EVENT']
def add_event_ent(matcher, doc, i, matches):
    """Matcher callback: add the i-th match to ``doc.ents`` under the ``EVENT`` label id."""
    # Only the span boundaries of the match are needed; its id is discarded.
    _, span_start, span_end = matches[i]
    event_entity = (EVENT, span_start, span_end)
    doc.ents = doc.ents + (event_entity,)
matcher.add('GoogleIO', add_event_ent,
[{'ORTH': 'Google'}, {'ORTH': 'I'}, {'ORTH': '/'}, {'ORTH': 'O'}],
[{'ORTH': 'Google'}, {'ORTH': 'I'}, {'ORTH': '/'}, {'ORTH': 'O'}, {'IS_DIGIT': True}])
text = u"Google I/O was great this year. See you all again in Google I/O 2018"
doc = nlp(text)
matches = matcher(doc)
for i,[match_id, start, end] in enumerate(matches):
print (i, doc[start:end])
for ent in doc.ents:
print (ent.text, ent.label_)
#displacy.serve(doc, style = 'ent')
I get the output as desired:
2.0.1
(0, Google I/O)
(1, Google I/O)
(2, Google I/O 2018)
(u'Google I/O', u'EVENT')
(u'this year', u'DATE')
(u'Google I/O 2018', u'EVENT')
Am I missing something here?
I believe your first code fails because you have not added an Entity label for 'EMAIL'. The second code works because EVENT is a pre-existing Entity type.
The documentation is not very clear on what the first argument of the matcher.add() method actually does, but it adds an Entity label for you. Here are two alternatives that should work and clear up the confusion:
Alternative 1:
import spacy
from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
#EMAIL = nlp.vocab.strings['EMAIL'] #Not needed
def add_email_ent(matcher, doc, i, matches):
    """Matcher callback: add the i-th match to ``doc.ents``, reusing its match id as the label."""
    label_id, span_start, span_end = matches[i]
    new_entity = (label_id, span_start, span_end)
    doc.ents = doc.ents + (new_entity,)
# The match key doubles as the entity label here: the callback stores match_id
# as the label of every span it adds.
matcher.add('EMAIL', add_email_ent, [{'LIKE_EMAIL': True}])
# The sample addresses need a real '@': with '#' in its place the tokens do not
# look like e-mail addresses, so the LIKE_EMAIL pattern has nothing to match.
text = u"Hi, this is John. My email is john@ymail.com and an alternate is john@gmail.com"
doc = nlp(text)
matches = matcher(doc)
# List every match (1-based) followed by the entities now set on the doc.
for i, (match_id, start, end) in enumerate(matches):
    print (i+1, doc[start:end])
for ent in doc.ents:
    print (ent.text, ent.label_)
Alternative 2 (I'm not sure why you'd want to do it this way because you end up with two entity labels serving essentially the same purpose, but provided just for illustration purposes):
import spacy
from spacy.matcher import Matcher
from spacy.pipeline import EntityRecognizer
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
ner = EntityRecognizer(nlp.vocab)
ner.add_label('EMAIL')
EMAIL = nlp.vocab.strings['EMAIL']
def add_email_ent(matcher, doc, i, matches):
    """Matcher callback: add the i-th match to ``doc.ents`` under the ``EMAIL`` label id."""
    # matches[i] is (match_id, start, end); the id is dropped in favour of EMAIL.
    start, end = matches[i][1:]
    doc.ents = doc.ents + ((EMAIL, start, end),)
# Register the single-token pattern under the key 'EmailPII'; add_email_ent is
# the on-match callback.
matcher.add('EmailPII', add_email_ent, [{'LIKE_EMAIL': True}])
# The sample addresses need a real '@': with '#' in its place the tokens do not
# look like e-mail addresses, so the LIKE_EMAIL pattern has nothing to match.
text = u"Hi, this is John. My email is john@ymail.com and an alternate is john@gmail.com"
doc = nlp(text)
matches = matcher(doc)
# List every match (1-based) followed by the entities now set on the doc.
for i, (match_id, start, end) in enumerate(matches):
    print (i+1, doc[start:end])
for ent in doc.ents:
    print (ent.text, ent.label_)
Related
I am trying to implement a custom aggregation using TFF by changing the code from this tutorial . I would like to rewrite next_fn so that all the client weights are placed at the server for further computations. As federated_collect was removed from tff-nightly, I am trying to do that using federated_aggregate.
This is what I have so far:
def accumulate(x, y):
    """Append ``y`` to the accumulator ``x`` in place and return ``x``."""
    x.append(y)
    return x
def merge(x, y):
    """Merge two partial accumulators by extending ``x`` with the items of ``y``.

    Returns:
        ``x`` itself, now holding the items of both accumulators.
    """
    x.extend(y)
    # Return the combined accumulator; returning ``y`` would discard
    # everything that had been gathered in ``x``.
    return x
@tff.federated_computation(federated_server_type, federated_dataset_type)
def next_fn(server_state, federated_dataset):
    """Run one round: broadcast weights, update on clients, aggregate, update the server.

    Args:
        server_state: state whose ``trainable_weights`` are broadcast to the clients.
        federated_dataset: per-client data handed to ``client_update_fn``.

    Returns:
        The result of mapping ``server_update_fn`` over the previous server
        state and the weights computed from the aggregated client deltas.
    """
    server_weights_at_client = tff.federated_broadcast(
        server_state.trainable_weights)
    client_deltas = tff.federated_map(
        client_update_fn, (federated_dataset, server_weights_at_client))
    # NOTE(review): an empty list as the `zero` gives the accumulator an
    # empty-struct type, which does not match the type `accumulate` returns;
    # federated_aggregate rejects that mismatch with a TypeError.
    z = []
    agg_result = tff.federated_aggregate(client_deltas, z,
                                         accumulate=tff.tf_computation(accumulate),
                                         merge=tff.tf_computation(merge),
                                         report=tff.tf_computation(lambda x: x))
    new_weights = do_smth_with_result(agg_result)
    server_state = tff.federated_map(
        server_update_fn, (server_state, new_weights))
    return server_state
However this results in the following Exception:
File "/home/yana/Documents/Uni/Thesis/grufedatt_try.py", line 351, in <module>
def next_fn(server_state, federated_dataset):
File "/home/yana/anaconda3/envs/fedenv/lib/python3.9/site-packages/tensorflow_federated/python/core/impl/wrappers/computation_wrapper.py", line 494, in __call__
wrapped_func = self._strategy(
File "/home/yana/anaconda3/envs/fedenv/lib/python3.9/site-packages/tensorflow_federated/python/core/impl/wrappers/computation_wrapper.py", line 222, in __call__
result = fn_to_wrap(*args, **kwargs)
File "/home/yana/Documents/Uni/Thesis/grufedatt_try.py", line 358, in next_fn
agg_result = tff.federated_aggregate(client_deltas, z,
File "/home/yana/anaconda3/envs/fedenv/lib/python3.9/site-packages/tensorflow_federated/python/core/impl/federated_context/intrinsics.py", line 140, in federated_aggregate
raise TypeError(
TypeError: Expected parameter `accumulate` to be of type (<<<float32[9999,96],float32[96,1024],float32[256,1024],float32[1024],float32[256,96],float32[96]>>,<float32[9999,96],float32[96,1024],float32[256,1024],float32[1024],float32[256,96],float32[96]>> -> <<float32[9999,96],float32[96,1024],float32[256,1024],float32[1024],float32[256,96],float32[96]>>), but received (<<>,<float32[9999,96],float32[96,1024],float32[256,1024],float32[1024],float32[256,96],float32[96]>> -> <<float32[9999,96],float32[96,1024],float32[256,1024],float32[1024],float32[256,96],float32[96]>>) instead.
Try using tff.aggregators.federated_sample with max_num_samples being equal to the number of clients you have.
That should be a simple drop-in replacement for how you would previously use tff.federated_collect.
In your accumulate, the issue is that you are changing the number of tensors the accumulator contains, so you get an error when accumulating more than a single accumuland. If you want to go this way though, for a rank-1 accumuland with k elements, you could probably do something like the following instead:
@tff.tf_computation(tff.types.TensorType(tf.float32, [None, k]),
                    tff.types.TensorType(tf.float32, [k]))
def accumulate(accumulator, accumuland):
    """Stack one rank-1 accumuland onto the accumulator as a new row.

    Args:
        accumulator: float32 tensor declared as ``[None, k]``.
        accumuland: float32 tensor declared as ``[k]``.

    Returns:
        The accumulator with the accumuland concatenated along axis 0.
    """
    # expand_dims turns the [k] vector into a [1, k] row so it can be
    # concatenated along the accumulator's first axis.
    return tf.concat([accumulator, tf.expand_dims(accumuland, axis=0)], axis=0)
I want to extract noun phrases from tweets; the code is below. The problem is that it only processes 300 tweets at a time and takes 5 minutes. How can I speed it up?
By the way, some of the code is adapted from TextBlob.
I use dataset of gate-EN-twitter(https://gate.ac.uk/wiki/twitter-postagger.html) and NLTK interface to the Stanford POS tagger to tag tweets
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize
import time,nltk
start_time = time.time()
CFG = {
('NNP', 'NNP'): 'NNP',
('NN', 'NN'): 'NNI',
('NNI', 'NN'): 'NNI',
('JJ', 'JJ'): 'JJ',
('JJ', 'NN'): 'NNI',
}
st = StanfordPOSTagger('/models/gate-EN-twitter.model','/twitie_tagger/twitie_tag.jar', encoding='utf-8')
def _normalize_tags(chunk):
'''Normalize the corpus tags.
("NN", "NN-PL", "NNS") -> "NN"
'''
ret = []
for word, tag in chunk:
if tag == 'NP-TL' or tag == 'NP':
ret.append((word, 'NNP'))
continue
if tag.endswith('-TL'):
ret.append((word, tag[:-3]))
continue
if tag.endswith('S'):
ret.append((word, tag[:-1]))
continue
ret.append((word, tag))
return ret
def _merge_adjacent_tags(tags):
    """Collapse adjacent (word, tag) pairs in place with the CFG table until no rule applies."""
    merge = True
    while merge:
        merge = False
        for x in range(len(tags) - 1):
            (word1, pos1), (word2, pos2) = tags[x], tags[x + 1]
            value = CFG.get((pos1, pos2), '')
            if value:
                # Replace the two entries by one merged phrase, then restart
                # the scan because the positions after x have shifted.
                tags[x:x + 2] = [('%s %s' % (word1, word2), value)]
                merge = True
                break
    return tags


def noun_phrase_count(text, chunk_size=1000):
    """Count the noun phrases found in *text*.

    The text is tagged chunk by chunk, adjacent tags are merged according to
    the CFG table, and every resulting 'NNP' / 'NNI' phrase is counted.

    Args:
        text: the string to analyse.
        chunk_size: number of characters tokenized and tagged per chunk.

    Returns:
        A list of (phrase, count) pairs, most frequent first.
    """
    matches1 = []
    print('len(text)', len(text))
    # Step through the text in consecutive, non-overlapping chunks. The former
    # slice end (i*10000+1000) made every chunk after the first grow and
    # overlap earlier text, so the same tokens were tagged and counted again
    # and again; integer division of the length also skipped the tail (or the
    # entire text when it was shorter than one chunk).
    # NOTE(review): chunks are cut at fixed character offsets, so a word that
    # straddles a boundary is split in two -- confirm this is acceptable.
    for offset in range(0, len(text), chunk_size):
        tokenized_text = word_tokenize(text[offset:offset + chunk_size])
        classified_text = st.tag(tokenized_text)
        tags = _merge_adjacent_tags(_normalize_tags(classified_text))
        matches1 += [t[0] for t in tags if t[1] in ['NNP', 'NNI']]
        # Progress timing after each chunk, measured from module start.
        print("--- %s seconds ---" % (time.time() - start_time))
    fdist = nltk.FreqDist(matches1)
    return list(fdist.most_common())
noun_phrase_count(tweets)
Looks like a duplicate of Stanford POS tagger with GATE twitter model is slow so you may find more info there.
Additionally; if there's any chance of stumbling upon identical inputs (tweets) twice (or more), you can consider a dictionary with the tweet (plain str) as key, and tagged as value, so that when you encounter a tweet, you first check if it's in your dict already. If not, tag it and put it there (and if this route is viable, why not pickle/unpickle that dictionary so that debugging/subsequent runs of your code go faster as well).
Working on function somewhat like this:
def get_feature_name_by_tfidf(text_to_process):
    """Return the TF-IDF feature names of *text_to_process*, minus those tagged 'VBP'.

    The input is split on commas into documents, vectorized with 1- to 4-gram
    word features (English stop words plus the extra ones read from
    ``additional_stopwords.txt`` are excluded), and the resulting feature
    names are POS-tagged with NLTK.
    """
    stopword_path = master_path + '\\additional_stopwords.txt'
    with open(stopword_path, 'r') as f:
        additional_stop_words = ast.literal_eval(f.read())
    stop_words = text.ENGLISH_STOP_WORDS.union(set(additional_stop_words))

    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 4), min_df=0, stop_words=stop_words)
    documents = text_to_process.split(',')
    # Fitting is what populates the vocabulary; the matrix itself is not used.
    tf.fit_transform(documents)

    tagged = nltk.pos_tag(tf.get_feature_names())
    return [feature for feature, tag in dict(tagged).items() if tag != 'VBP']
This returns the list of keywords in the passed text.
Is there any way to get the keywords in the same case as they were provided?
Like passed string
Input:
a = "TIME is the company where I work"
Instead of getting keyword list as:
['time', 'company']
I like to get:
['TIME', 'company']
By default, TfidfVectorizer converts words to lowercase. Use this line:
tf = TfidfVectorizer(analyzer='word',lowercase=False, ngram_range=(1, 4), min_df=0, stop_words=stop_words)
and it should work. Use this link for ref. TfidfVectorizer
How do I write the output of my code to a csv?
Here is what I'm trying, the frequency analysis works, but I can't get the csv to write. Pretty new to python, so I am sure that I am doing something wrong.
# This Python file uses the following encoding: utf-8
import os, sys
import re
import csv
filename = 'TweetsCSV_ORIGINAL.txt'
# Read the whole file lower-cased and split it on runs of whitespace; the
# context manager closes the handle even if reading fails.
with open(filename) as source:
    word_list = re.split(r'\s+', source.read().lower())
print('Words in text: %d' % len(word_list))

# Count each word after stripping the listed punctuation characters.
freq_dic = {}
punctuation = re.compile(r'[.?!,":;]')
for word in word_list:
    word = punctuation.sub("", word)
    freq_dic[word] = freq_dic.get(word, 0) + 1
print('Unique words: %d' % len(freq_dic))

# (word, count) pairs in alphabetical order.
freq_list = sorted(freq_dic.items())
for word, freq in freq_list:
    print('%s %s' % (word, freq))

#write to CSV
csvfile = "tweetfreq.csv"
# Write one "word,frequency" row per entry. Building the rows from the loop
# variables after the loop had finished only ever saw the last pair, and put
# its two values on separate rows.
with open(csvfile, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(freq_list)
This snippet will append a line to the end of your CSV file.
with open('tweetfreq.csv', 'a') as csvfile:
csv_writer = csv.writer(csvfile)
csv_writer.writerow([word,freq])
I am trying to take a list of words that I have imported from a text file and make a dictionary, where the value is incremented each time the word is passed over in the loop. However, with the current code I have, none are added and only the value I add initially is there when I print the dictionary. What am I doing wrong?
import pymysql
from os import path
import re
db = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='', db='db_cc')
cursor = db.cursor()
cursor.execute("SELECT id, needsprocessing, SchoolID, ClassID, TaskID FROM sharedata WHERE needsprocessing = 1")
r = cursor.fetchall()
print(r)
from os import path
import re
noentities = len(r)
a = r[0][1]
b = r[0][2]
c = r[0][3]
d = r[0][4]
filepath = "/codecompare/%s/%s/%s/%s.txt" %(a, b, c, d)
print(filepath)
foo = open(filepath, "r")
steve = foo.read()
rawimport = steve.split(' ')
dictionary = {"for":0}
foo.close()
for word in rawimport:
if word in dictionary:
dictionary[word] +=1
else:
dictionary[word] = 1
print dictionary
Some rawimport values are as follows:
print rawimport
['Someting', 'something', 'dangerzones', 'omething', 'ghg', 'sdf', 'hgiinsfg', '932wrtioarsjg', 'fghbyghgyug', 'sadiiilglj']
Additionally, when trying to print from the code, it throws
... print dictionary
File "<stdin>", line 3
print dictionary
^
SyntaxError: invalid syntax
However, if I run print dictionary by itself it prints:
{'for': 0}
Which is evidence that for loop did nothing.
Any ideas?
Running Python 2.7.2
edit: updated to reflect closing of file and to make loop simpler
edit: added sample rawimport data
I received the same Traceback when working through this in the Python interpreter -- it arose from not leaving the context of the for loop:
>>> for word in rawimport:
... if word in dictionary:
... dictionary[word]+=1
... else:
... dictionary[word]=1
... print dictionary
File "<stdin>", line 6
print dictionary
^
The interpreter thinks your print statement belongs to the for loop, and errors because it's not appropriately indented. (If you did indent it, of course, it would print the dictionary during each pass). The solution to that (assuming you're doing this in the interpreter, which was how I reproduced your error) is hitting enter again:
>>> for word in rawimport:
... if word in dictionary:
... dictionary[word]+=1
... else:
... dictionary[word]=1
...
>>> print dictionary
{'for': 1, 'fghbyghgyug': 1, '932wrtioarsjg': 1, 'dangerzones': 1, 'sdf': 1, 'ghg': 1, 'Someting': 1, 'something': 1, 'omething': 1, 'sadiiilglj': 1, 'hgiinsfg': 1}
'''