Change Named Entity Recognition format from ENAMEX to CoNLL

I have a dataset which is in ENAMEX format like this:
<ENAMEX TYPE="LOCATION">Italy</ENAMEX>'s business world was rocked by the announcement <TIMEX TYPE="DATE">last Thursday</TIMEX> that Mr. <ENAMEX TYPE="PERSON">Verdi</ENAMEX> would leave his job as vice president of <ENAMEX TYPE="ORGANIZATION">Music Masters of Milan, Inc</ENAMEX> to become operations director of <ENAMEX TYPE="ORGANIZATION">Arthur Andersen</ENAMEX>.
I want to change it into CoNLL format:
Italy LOCATION
's O
business O
world O
was O
rocked O
by O
the O
announcement O
last DATE
Thursday DATE
...
. O
How can I do that? Is there a standard script for such a format conversion?

I wrote one myself; it worked for me, though it is not heavily tested:
from __future__ import unicode_literals
import os
import re
import en_core_web_sm  # spaCy English model, used here only for tokenization

# Converts markup such as <ENAMEX TYPE="LOCATION">Italy</ENAMEX> is experiencing an economic boom.
ENAMEX_RE = re.compile(r'<ENAMEX\s+TYPE=\"(.+?)\">(.+?)</ENAMEX>')


def xml_iter(file_):
    with open(file_, 'r') as fin:
        for line in fin:
            yield line.strip()


def markupline2bio(line):
    record = line.split('\t')[0]
    prev_end = 0
    all_tokens = []
    all_tags = []
    for f in ENAMEX_RE.finditer(record):
        tag, phrase = f.group(1), f.group(2)
        # Text between the previous entity and this one is outside any entity.
        before_text = record[prev_end:f.start(0)]
        prev_end = f.end(0)
        for tok in nlp(before_text):
            if str(tok).strip():
                all_tokens.append(tok)
                all_tags.append('O')
        # Tokens inside the entity get B-/I- prefixed tags.
        entity_tok_index = 0
        for tok in nlp(phrase):
            if str(tok).strip():
                all_tokens.append(tok)
                if entity_tok_index == 0:
                    all_tags.append('B-' + tag)
                else:
                    all_tags.append('I-' + tag)
                entity_tok_index += 1
    # Text after the last entity is also outside any entity.
    after_text = record[prev_end:]
    for tok in nlp(after_text):
        if str(tok).strip():
            all_tokens.append(tok)
            all_tags.append('O')
    return all_tokens, all_tags


if __name__ == '__main__':
    data_dir = './data/indonesian_bert_all/Indonesian/ner/'
    xml_iterator = xml_iter(os.path.join(data_dir, 'data_train_ugm.txt'))
    output_file = os.path.join(data_dir, 'data_train_ugm.bio')
    nlp = en_core_web_sm.load()
    with open(output_file, 'w') as fout:
        for line in xml_iterator:
            all_tokens, all_tags = markupline2bio(line.strip())
            for tok, tag in zip(all_tokens, all_tags):
                fout.write(str(tok) + '\t' + tag)
                fout.write('\n')
            fout.write('\n')
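Note that the script above only looks for ENAMEX tags, while the sample line also contains a TIMEX annotation (last Thursday / DATE). Below is a minimal sketch of how one might cover both tag types, using plain whitespace tokenization instead of spaCy; a real tokenizer would still be needed to split clitics and punctuation (e.g. "'s" and the final period) as in the desired output, and the B-/I- prefixes can be added the same way as above.

import re

# Matches both <ENAMEX TYPE="...">...</ENAMEX> and <TIMEX TYPE="...">...</TIMEX>.
TAG_RE = re.compile(r'<(ENAMEX|TIMEX)\s+TYPE="(.+?)">(.+?)</\1>')


def enamex_line_to_conll(line):
    """Yield (token, tag) pairs for one annotated line, whitespace-tokenized."""
    rows = []
    prev_end = 0
    for m in TAG_RE.finditer(line):
        # Tokens before the annotation are outside any entity.
        for tok in line[prev_end:m.start()].split():
            rows.append((tok, 'O'))
        # Tokens inside the annotation all get the entity type.
        for tok in m.group(3).split():
            rows.append((tok, m.group(2)))
        prev_end = m.end()
    for tok in line[prev_end:].split():
        rows.append((tok, 'O'))
    return rows


if __name__ == '__main__':
    sample = ('<ENAMEX TYPE="LOCATION">Italy</ENAMEX>\'s business world was rocked by the '
              'announcement <TIMEX TYPE="DATE">last Thursday</TIMEX> that Mr. '
              '<ENAMEX TYPE="PERSON">Verdi</ENAMEX> would leave his job.')
    for tok, tag in enamex_line_to_conll(sample):
        print(tok, tag)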

Related

Convert CSV to Avro in Python using google-cloud-dataflow (beam.io.avroio.WriteToAvro)

I am using google-cloud-dataflow/Cloud Composer to convert CSV to Avro, and everything works in my local environment. When I try to read the .avsc file that contains the Avro schema from a Cloud Storage bucket, I keep getting:
IOError: [Errno 2] No such file or directory:'gs://my-bucket/xxx.avsc'
Code:
from __future__ import absolute_import
import argparse
import logging
import ntpath
import re
import avro.schema
import apache_beam as beam
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import SetupOptions
from datetime import datetime


class RowTransformer(object):
    def __init__(self, delimiter, header, filename):
        self.delimiter = delimiter
        self.keys = re.split(',', header)
        self.filename = filename

    def parse(self, row):
        self.load_dt = datetime.utcnow()
        split_row = row.split(self.delimiter)
        # Need to cast anything that is not a string into the proper type.
        for idx in range(8, 18):
            split_row[idx] = float('0' if not split_row[idx] else split_row[idx])
        for idx in (18, 19):
            split_row[idx] = str('0' if not split_row[idx] else split_row[idx])
        split_row.append(self.filename)
        split_row.append(self.load_dt.strftime('%Y-%m-%d %H:%M:%S.%f'))
        decode_row = [i.decode('UTF-8') if isinstance(i, basestring) else i for i in split_row]
        row = dict(zip(self.keys, decode_row))
        return row


def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=False,
                        help='Input file to read. This can be a local file or '
                             'a file in a Google Storage Bucket.',
                        default='gs://my-bucket/receive/xxx.txt')
    parser.add_argument('--output', dest='output', required=False,
                        help='Output Avro to Cloud Storage',
                        default='gs://my-bucket/')
    parser.add_argument('--schema', dest='schema', required=False,
                        help='Avro Schema',
                        default='gs://my-bucket/xxx.avsc')
    parser.add_argument('--delimiter', dest='delimiter', required=False,
                        help='Delimiter to split input records.',
                        default='|')
    parser.add_argument('--fields', dest='fields', required=False,
                        help='list of field names expected',
                        default='Col1,Col2...etc')
    known_args, pipeline_args = parser.parse_known_args(argv)

    row_transformer = RowTransformer(delimiter=known_args.delimiter,
                                     header=known_args.fields,
                                     filename=ntpath.basename(known_args.input))
    p_opts = pipeline_options.PipelineOptions(pipeline_args)
    with beam.Pipeline(options=p_opts) as pipeline:
        schema_file = avro.schema.parse(open(known_args.schema, "rb").read())
        rows = pipeline | "Read from text file" >> beam.io.ReadFromText(known_args.input, skip_header_lines=1)
        dict_records = rows | "Convert to Avro" >> beam.Map(lambda r: row_transformer.parse(r))
        dict_records | "Write to Cloud Storage as Avro" >> beam.io.avroio.WriteToAvro(known_args.output, schema=schema_file)


run()
You need to use the apache_beam.io.gcp.gcsio module to read the schema file from Cloud Storage: the built-in open() only handles local paths, which is why the gs:// URL raises the IOError. See https://beam.apache.org/documentation/sdks/pydoc/2.6.0/apache_beam.io.gcp.gcsio.html
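A minimal sketch of reading the schema that way (assuming Beam's GcsIO helper from that module; the gs:// path is the placeholder from the question):

import avro.schema
from apache_beam.io.gcp.gcsio import GcsIO

# Open the schema file through the GCS-aware file-like interface
# instead of the local-only built-in open().
with GcsIO().open('gs://my-bucket/xxx.avsc', 'rb') as f:
    schema_file = avro.schema.parse(f.read())  # same parse call as in the question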

How to speed up "POS tag" with StanfordPOSTagger?

I want to extract noun phrases from tweets; the code is below. The problem is that it only processes 300 tweets at a time and takes 5 minutes. How can I speed it up?
By the way, some of the code was adapted from TextBlob.
I use the gate-EN-twitter model (https://gate.ac.uk/wiki/twitter-postagger.html) and the NLTK interface to the Stanford POS tagger to tag the tweets:
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize
import time
import nltk

start_time = time.time()

# Grammar for merging adjacent tags into noun phrases.
CFG = {
    ('NNP', 'NNP'): 'NNP',
    ('NN', 'NN'): 'NNI',
    ('NNI', 'NN'): 'NNI',
    ('JJ', 'JJ'): 'JJ',
    ('JJ', 'NN'): 'NNI',
}

st = StanfordPOSTagger('/models/gate-EN-twitter.model',
                       '/twitie_tagger/twitie_tag.jar',
                       encoding='utf-8')


def _normalize_tags(chunk):
    '''Normalize the corpus tags.
    ("NN", "NN-PL", "NNS") -> "NN"
    '''
    ret = []
    for word, tag in chunk:
        if tag == 'NP-TL' or tag == 'NP':
            ret.append((word, 'NNP'))
            continue
        if tag.endswith('-TL'):
            ret.append((word, tag[:-3]))
            continue
        if tag.endswith('S'):
            ret.append((word, tag[:-1]))
            continue
        ret.append((word, tag))
    return ret


def noun_phrase_count(text):
    matches1 = []
    print('len(text)', len(text))
    for i in range(len(text) // 1000):
        # Tag the text in 1000-character chunks.
        tokenized_text = word_tokenize(text[i * 1000:i * 1000 + 1000])
        classified_text = st.tag(tokenized_text)
        tags = _normalize_tags(classified_text)
        merge = True
        while merge:
            merge = False
            for x in range(0, len(tags) - 1):
                t1 = tags[x]
                t2 = tags[x + 1]
                key = t1[1], t2[1]
                value = CFG.get(key, '')
                if value:
                    merge = True
                    tags.pop(x)
                    tags.pop(x)
                    match = '%s %s' % (t1[0], t2[0])
                    pos = value
                    tags.insert(x, (match, pos))
                    break
        matches = [t[0] for t in tags if t[1] in ['NNP', 'NNI']]
        matches1 += matches
        print("--- %s seconds ---" % (time.time() - start_time))
    fdist = nltk.FreqDist(matches1)
    return [(tag, num) for (tag, num) in fdist.most_common()]


noun_phrase_count(tweets)  # `tweets` is the full tweet text to process
This looks like a duplicate of "Stanford POS tagger with GATE twitter model is slow", so you may find more info there.
Additionally, if there's any chance of running into identical inputs (tweets) more than once, you can consider a dictionary with the tweet (plain str) as key and its tagged result as value, so that when you encounter a tweet you first check whether it's already in your dict. If not, tag it and put it there (and if this route is viable, why not pickle/unpickle that dictionary so that debugging and subsequent runs of your code go faster as well). A sketch of that idea follows.
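A minimal sketch of that caching idea, under the assumption that `tagger` is the StanfordPOSTagger instance (`st`) from the question's code and that the cache filename is just a placeholder:

import os
import pickle

from nltk.tokenize import word_tokenize

CACHE_FILE = 'tag_cache.pkl'  # hypothetical cache location

# Load a previously saved tweet -> tagged-tokens cache if one exists.
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, 'rb') as fh:
        tag_cache = pickle.load(fh)
else:
    tag_cache = {}


def tag_cached(tweet, tagger):
    """Tag a tweet, reusing the cached result for tweets seen before."""
    if tweet not in tag_cache:
        tag_cache[tweet] = tagger.tag(word_tokenize(tweet))
    return tag_cache[tweet]


def save_cache():
    """Persist the cache so subsequent runs start warm."""
    with open(CACHE_FILE, 'wb') as fh:
        pickle.dump(tag_cache, fh)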

TF-IDF extracting keywords

I am working on a function somewhat like this:
import ast
import nltk
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer


def get_feature_name_by_tfidf(text_to_process):
    # master_path is defined elsewhere in my code.
    with open(master_path + '\\additional_stopwords.txt', 'r') as f:
        additional_stop_words = ast.literal_eval(f.read())
    stop_words = text.ENGLISH_STOP_WORDS.union(set(additional_stop_words))
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 4), min_df=0, stop_words=stop_words)
    tfidf_matrix = tf.fit_transform(text_to_process.split(','))
    tagged = nltk.pos_tag(tf.get_feature_names())
    feature_names_with_tags = {k: v for k, v in dict(tagged).items() if v != 'VBP'}
    return list(feature_names_with_tags.keys())
which returns the list of keywords in the passed text.
Is there any way to get the keywords in the same case as they were provided?
For example, given the input string:
a = "TIME is the company where I work"
instead of getting the keyword list as:
['time', 'company']
I would like to get:
['TIME', 'company']
By default, TfidfVectorizer converts words to lowercase. Use this line:
tf = TfidfVectorizer(analyzer='word',lowercase=False, ngram_range=(1, 4), min_df=0, stop_words=stop_words)
and it should work. See the TfidfVectorizer documentation for reference.
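A small check of that behaviour (a minimal sketch; the exact feature list depends on your stop words and scikit-learn version, and newer versions expose get_feature_names_out() instead of get_feature_names()):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["TIME is the company where I work"]

tf = TfidfVectorizer(analyzer='word', lowercase=False)
tf.fit_transform(docs)

# With lowercase=False the vocabulary keeps the original casing,
# e.g. 'TIME' instead of 'time'.
print(tf.get_feature_names())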

Write Twitter Frequency analysis to a CSV using python

How do I write the output of my code to a CSV file?
Here is what I'm trying. The frequency analysis works, but I can't get the CSV to write. I'm pretty new to Python, so I'm sure I'm doing something wrong.
# This Python file uses the following encoding: utf-8
import os, sys
import re
import csv

filename = 'TweetsCSV_ORIGINAL.txt'

word_list = re.split('\s+', file(filename).read().lower())
print 'Words in text:', len(word_list)

freq_dic = {}
punctuation = re.compile(r'[.?!,":;]')
for word in word_list:
    word = punctuation.sub("", word)
    try:
        freq_dic[word] += 1
    except:
        freq_dic[word] = 1

print 'Unique words:', len(freq_dic)

freq_list = freq_dic.items()
freq_list.sort()
for word, freq in freq_list:
    print word, freq

#write to CSV
res = [word, freq]
csvfile = "tweetfreq.csv"
#Assuming res is a flat list
with open(csvfile, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    for val in res:
        writer.writerow([val])
This snippet will append a line to the end of your CSV file.
with open('tweetfreq.csv', 'a') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow([word, freq])
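Putting that idea inside the existing loop, here is a sketch of how the whole frequency list could be written in one pass (Python 3 syntax; in the question, freq_list is the sorted list of (word, frequency) pairs and 'tweetfreq.csv' is the output filename):

import csv

# Example data; in the question this comes from freq_dic.items(), sorted.
freq_list = [('hello', 3), ('world', 1)]

with open('tweetfreq.csv', 'w', newline='') as output:
    writer = csv.writer(output)
    for word, freq in freq_list:
        writer.writerow([word, freq])  # one row per (word, frequency) pair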

Reading the fileset from a torrent

I want to (quickly) put a program/script together to read the fileset from a .torrent file. I want to then use that set to delete any files from a specific directory that do not belong to the torrent.
Any recommendations on a handy library for reading this index from the .torrent file? Whilst I don't object to it, I don't want to be digging deep into the bittorrent spec and rolling a load of code from scratch for this simple purpose.
I have no preference on language.
I would use rasterbar's libtorrent, which is a small and fast C++ library.
To iterate over the files you could use the torrent_info class (begin_files(), end_files()).
There's also a Python interface for libtorrent:
import libtorrent

info = libtorrent.torrent_info('test.torrent')
for f in info.files():
    print "%s - %s" % (f.path, f.size)
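Since the stated goal is to delete files that do not belong to the torrent, here is a rough sketch of how that comparison might look on top of the snippet above. The download_dir path is a placeholder, and the script only prints candidates rather than deleting them; verify the path handling (in particular whether f.path already includes the torrent's top-level directory) before switching to os.remove().

import os
import libtorrent

download_dir = '/path/to/download/dir'  # hypothetical directory to clean up

info = libtorrent.torrent_info('test.torrent')
# Paths listed inside the torrent, normalized for comparison.
torrent_paths = set(os.path.normpath(f.path) for f in info.files())

for root, _, files in os.walk(download_dir):
    for name in files:
        full_path = os.path.join(root, name)
        rel_path = os.path.normpath(os.path.relpath(full_path, download_dir))
        if rel_path not in torrent_paths:
            print('would delete: %s' % full_path)  # replace with os.remove(full_path) once verified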
Effbot has your question answered. Here is the complete code to read the list of files from a .torrent file (Python 2.4+):
import re

def tokenize(text, match=re.compile("([idel])|(\d+):|(-?\d+)").match):
    i = 0
    while i < len(text):
        m = match(text, i)
        s = m.group(m.lastindex)
        i = m.end()
        if m.lastindex == 2:
            yield "s"
            yield text[i:i+int(s)]
            i = i + int(s)
        else:
            yield s

def decode_item(next, token):
    if token == "i":
        # integer: "i" value "e"
        data = int(next())
        if next() != "e":
            raise ValueError
    elif token == "s":
        # string: "s" value (virtual tokens)
        data = next()
    elif token == "l" or token == "d":
        # container: "l" (or "d") values "e"
        data = []
        tok = next()
        while tok != "e":
            data.append(decode_item(next, tok))
            tok = next()
        if token == "d":
            data = dict(zip(data[0::2], data[1::2]))
    else:
        raise ValueError
    return data

def decode(text):
    try:
        src = tokenize(text)
        data = decode_item(src.next, src.next())
        for token in src:  # look for more tokens
            raise SyntaxError("trailing junk")
    except (AttributeError, ValueError, StopIteration):
        raise SyntaxError("syntax error")
    return data

if __name__ == "__main__":
    data = open("test.torrent", "rb").read()
    torrent = decode(data)
    for file in torrent["info"]["files"]:
        print "%r - %d bytes" % ("/".join(file["path"]), file["length"])
Here's the code from Constantine's answer above, slightly modified to handle Unicode characters in torrent filenames and fileset filenames in torrent info:
import re

def tokenize(text, match=re.compile("([idel])|(\d+):|(-?\d+)").match):
    i = 0
    while i < len(text):
        m = match(text, i)
        s = m.group(m.lastindex)
        i = m.end()
        if m.lastindex == 2:
            yield "s"
            yield text[i:i+int(s)]
            i = i + int(s)
        else:
            yield s

def decode_item(next, token):
    if token == "i":
        # integer: "i" value "e"
        data = int(next())
        if next() != "e":
            raise ValueError
    elif token == "s":
        # string: "s" value (virtual tokens)
        data = next()
    elif token == "l" or token == "d":
        # container: "l" (or "d") values "e"
        data = []
        tok = next()
        while tok != "e":
            data.append(decode_item(next, tok))
            tok = next()
        if token == "d":
            data = dict(zip(data[0::2], data[1::2]))
    else:
        raise ValueError
    return data

def decode(text):
    try:
        src = tokenize(text)
        data = decode_item(src.next, src.next())
        for token in src:  # look for more tokens
            raise SyntaxError("trailing junk")
    except (AttributeError, ValueError, StopIteration):
        raise SyntaxError("syntax error")
    return data

n = 0
if __name__ == "__main__":
    data = open("C:\\Torrents\\test.torrent", "rb").read()
    torrent = decode(data)
    for file in torrent["info"]["files"]:
        n = n + 1
        filenamepath = file["path"]
        print str(n) + " -- " + ', '.join(map(str, filenamepath))
        fname = ', '.join(map(str, filenamepath))
        print fname + " -- " + str(file["length"])
bencode.py from the original Mainline BitTorrent 5.x client (http://download.bittorrent.com/dl/BitTorrent-5.2.2.tar.gz) would give you pretty much the reference implementation in Python.
It has an import dependency on the BTL package but that's trivially easy to remove. You'd then look at bencode.bdecode(filecontent)['info']['files'].
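A sketch of how that lookup might look once bencode.py is importable (the filename is a placeholder; note that single-file torrents have no info/files list, only info/name and info/length):

from bencode import bdecode  # bencode.py from the BitTorrent 5.x sources

with open('test.torrent', 'rb') as fh:
    meta = bdecode(fh.read())

# Multi-file torrents list their files under info/files.
for entry in meta['info'].get('files', []):
    print('%s - %d bytes' % ('/'.join(entry['path']), entry['length']))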
Expanding on the ideas above, I did the following:
~> cd ~/bin
~/bin> ls torrent*
torrent-parse.py  torrent-parse.sh
~/bin> cat torrent-parse.py
# torrent-parse.py
import sys
import libtorrent

# get the input torrent file
if (len(sys.argv) > 1):
    torrent = sys.argv[1]
else:
    print "Missing param: torrent filename"
    sys.exit()

# get names of files in the torrent file
info = libtorrent.torrent_info(torrent)
for f in info.files():
    print "%s - %s" % (f.path, f.size)

~/bin> cat torrent-parse.sh
#!/bin/bash
if [ $# -lt 1 ]; then
    echo "Missing param: torrent filename"
    exit 0
fi

python torrent-parse.py "$*"
You'll want to set permissions appropriately to make the shell script executable:
~/bin> chmod a+x torrent-parse.sh
Hope this helps someone :)
