How to add commas to Python Dictionaries output from beam's ReadFromBigQuery in Python? - google-cloud-dataflow

I am trying to run a Dataflow pipeline in Python (3.9) via a Flex Template that runs a query in BigQuery (non-nested/non-repeated records) and writes the data to a database with beam_nuggets. My test database is Postgres. The read from BigQuery yields Python dictionaries as expected, but when I try to write them to the database, the pipeline fails.
Test data written to GCS from BigQuery.
{'order_id': 'CM-2011-110', 'order_date': '4/10/2011', 'ship_date': '10/10/2011', 'ship_mode': 'Standard Class', 'customer_name': 'Alejandro Grove', 'segment': 'Consumer', 'state': 'Est', 'country': 'Cameroon', 'market': 'Africa', 'region': 'Africa', 'product_id': 'OFF-CAR-10002031', 'category': 'Office Supplies', 'sub_category': 'Binders', 'product_name': 'Cardinal 3-Hole Punch, Durable', 'sales': 30, 'quantity': 1, 'discount': 0.0, 'profit': 13.92, 'shipping_cost': 2.57, 'order_priority': 'Medium', 'year': 2011}
{'order_id': 'CM-2011-110', 'order_date': '4/10/2011', 'ship_date': '10/10/2011', 'ship_mode': 'Standard Class', 'customer_name': 'Alejandro Grove', 'segment': 'Consumer', 'state': 'Est', 'country': 'Cameroon', 'market': 'Africa', 'region': 'Africa', 'product_id': 'TEC-CAN-10002879', 'category': 'Technology', 'sub_category': 'Copiers', 'product_name': 'Canon Copy Machine, High-Speed', 'sales': 521, 'quantity': 2, 'discount': 0.0, 'profit': 93.72, 'shipping_cost': 30.83, 'order_priority': 'Medium', 'year': 2011}
Current Code:
def run(save_main_session=True):
    beam_options = PipelineOptions()
    args = beam_options.view_as(MyOptions)
    with beam.Pipeline(options=beam_options) as p:
        db_source = db_reader(args.bqprojectid, args.dataset, args.table, args.limit)
        db = db_writer(args.destinationip, args.port, args.destinationusername, args.destinationpassword, args.destinationtable, args.database_name)
        result = (
            p
            | beam.io.ReadFromBigQuery(use_standard_sql=True, query=db_source.sql_query())
            # I would like to remove this step, but it helps with debugging.
            | 'Write to GCS' >> WriteToText('SOMEURI')
            | 'Write to Database' >> relational_db.Write(
                source_config=(db.sink_config()),
                table_config=(db.table_config())
            ))
I went through this answer: Write BigQuery results to GCS in CSV format using Apache Beam but was wondering if there was an easier way to transform the output from BigQuery to mirror this structure:
{'name': 'Jan', 'num': 1},
{'name': 'Feb', 'num': 2},
I was considering regex, but I do not think that would be a best practice.
Please let me know if there is anything I can clarify.
Have a great day!
Error Message
File "/usr/local/lib/python3.9/site-packages/beam_nuggets/io/relational_db.py", line 181, in process
assert isinstance(element, dict)
I checked the credentials and, by creating a static PCollection from the beam_nuggets docs, realized that the structure of the data is the issue. That static collection (months, below) uploaded successfully.
months = p | "Reading month records" >> beam.Create([
    {'name': 'Jan', 'num': 1},
    {'name': 'Feb', 'num': 2},
])
Resolution via answer provided below.
def map_to_beam_nuggets_data(element):
    return {
        'order_id': element['order_id'],
        'order_date': element['order_date'],
        'ship_date': element['ship_date'],
        'ship_mode': element['ship_mode'],
        'customer_name': element['customer_name'],
        'segment': element['segment'],
        'state': element['state'],
        'country': element['country'],
        'market': element['market'],
        'region': element['region'],
        'product_id': element['product_id'],
        'category': element['category'],
        'sub_category': element['sub_category'],
        'product_name': element['product_name'],
        'sales': element['sales'],
        'quantity': element['quantity'],
        'discount': element['discount'],
        'profit': element['profit'],
        'shipping_cost': element['shipping_cost'],
        'order_priority': element['order_priority'],
    }
# this is the controller function
def run(save_main_session=True):
    beam_options = PipelineOptions()
    args = beam_options.view_as(MyOptions)
    with beam.Pipeline(options=beam_options) as p:
        db_source = db_reader(args.bqprojectid, args.dataset, args.table, args.limit)
        db = db_writer(args.destinationip, args.port, args.destinationusername, args.destinationpassword, args.destinationtable, args.database_name)
        # Result from BigQuery.
        result = (p | beam.io.ReadFromBigQuery(use_standard_sql=True, query=db_source.sql_query()))
        # Sink result to GCS.
        result | 'Write to GCS' >> WriteToText('SOMEURI')
        # Map the result to beam_nuggets data and sink it to the database.
        (result
         | 'Map to beam nuggets data' >> beam.Map(map_to_beam_nuggets_data)
         | 'Write to Database' >> relational_db.Write(
             source_config=(db.sink_config()),
             table_config=(db.table_config())
         ))


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
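If listing every column by hand feels heavy, a more generic mapping can achieve the same thing. A minimal sketch, assuming the BigQuery rows are flat and you only need to drop a few columns that the destination table lacks (the excluded 'year' column is just an illustrative choice):

import apache_beam as beam

# Columns to leave out of the database write; 'year' is only an example.
EXCLUDED_COLUMNS = {'year'}

def to_plain_dict(element):
    # ReadFromBigQuery yields dict-like rows; copy them into plain dicts
    # and drop any columns the destination table does not have.
    return {k: v for k, v in element.items() if k not in EXCLUDED_COLUMNS}

# Used inside the pipeline in place of the hand-written mapping:
# result | 'Map to beam nuggets data' >> beam.Map(to_plain_dict)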

I hope this can help.
In this case, you can use a multi-sink output:
Result PCollection from BigQuery
Sink the result to GCS
Transform the result to the expected output for beam nuggets data and sink it to the database
def run_pipeline(save_main_session=True):
    beam_options = PipelineOptions()
    args = beam_options.view_as(MyOptions)
    with Pipeline(options=beam_options) as p:
        db_source = db_reader(args.bqprojectid, args.dataset, args.table, args.limit)
        db = db_writer(args.destinationip, args.port, args.destinationusername, args.destinationpassword,
                       args.destinationtable, args.database_name)
        # Result from BigQuery.
        result = (p | beam.io.ReadFromBigQuery(use_standard_sql=True, query=db_source.sql_query()))
        # Sink result to GCS.
        result | 'Write to GCS' >> WriteToText('SOMEURI')
        # Map the result to beam_nuggets data and sink it to the database.
        (result
         | 'Map to beam nuggets data' >> beam.Map(map_to_beam_nuggets_data)
         | 'Write to Database' >> relational_db.Write(
             source_config=(db.sink_config()),
             table_config=(db.table_config())
         )
        )


def map_to_beam_nuggets_data(element: Dict) -> Dict:
    return {
        'name': element['name'],
        'num': element['num'],
    }
To simulate your input data from BigQuery, I created a unit test with a mock and pytest. The ReadFromBigQuery source should return a PCollection of Dict:
from typing import Dict

import apache_beam as beam
import pytest
from apache_beam.testing.test_pipeline import TestPipeline


class TestBeamNuggetsMapping:
    def test_pipeline(self):
        mock = [
            {
                'name': 'toto',
                'num': 'num',
                'other_field': 'other'
            }
        ]
        with TestPipeline() as p:
            result = (
                p
                | beam.Create(mock)
                | 'Map to Beam nuggets data' >> beam.Map(self.map_to_beam_nuggets_data)
            )
            result | "Print outputs" >> beam.Map(print)

    def map_to_beam_nuggets_data(self, element: Dict) -> Dict:
        return {
            'name': element['name'],
            'num': element['num'],
        }
The result is:
{'name': 'toto', 'num': 'num'}

Related

write date from yfinance Timestamp to influxdb and query the date - timezone support

I am trying to write date/time data to InfluxDB and query the data back into a DataFrame.
When I write the data, the datetime looks like this...
ticker= 'AAPL'
import yfinance as yf
df = yf.Ticker('AAPL').history(period="1d").index[0]
print(df)
output:
Timestamp('2023-01-05 00:00:00-0500', tz='America/New_York')
...and when I query the data into a DataFrame and print it, I get this:
df['_time']
output:
0 2023-01-05 05:00:00+00:00
Name: _time, dtype: datetime64[ns, tzutc()]
What do I need to do to properly write the time in influxdb?
please see full code below:
########## WRITE ##########
import yfinance as yf
import influxdb_client
from influxdb_client.client.write_api import SYNCHRONOUS, PointSettings

token = "my-token"
org = "my-org"
url = "my-url"
bucket = "stocks_us"
retention_policy = "autogen"

client = influxdb_client.InfluxDBClient(url=url, token=token, org=org)
write_api = client.write_api(write_options=SYNCHRONOUS)

ticker = 'AAPL'
df = yf.Ticker('AAPL').history(period="1d")

with client:
    """
    Ingest DataFrame with default tags
    """
    point_settings = PointSettings(**{"ticker": ticker})
    write_api = client.write_api(write_options=SYNCHRONOUS,
                                 point_settings=point_settings)
    write_api.write(bucket=bucket,
                    org="dev",
                    record=df,
                    data_frame_measurement_name="stock_daily_df")
    client.close()
    print(df)
and
########## QUERY ##########
import influxdb_client

token = "my-token"
org = "my-org"
url = "my-url"
bucket = "stocks_us"
retention_policy = "autogen"

client = influxdb_client.InfluxDBClient(url=url, token=token, org=org)
query_api = client.query_api()
measurement = "stock_daily_df"

with client:
    """
    Querying ingested data
    """
    query = 'from(bucket:"{}")' \
            ' |> range(start: 0, stop: now())' \
            ' |> filter(fn: (r) => r._measurement == "{}")' \
            ' |> pivot(rowKey:["_time"], columnKey: ["_field"], valueColumn: "_value")' \
            ' |> filter(fn: (r) => r["ticker"] == "AAPL")' \
            ' |> limit(n:10, offset: 0)'.format(bucket, measurement)
    df = query_api.query_data_frame(query=query)
    print(df)
Flux does all of its work in UTC, which is a simple linear clock, and leaves it to the user to figure out the display. Hence, to keep the timestamps consistent, we should convert them to UTC before inserting the data and convert the query result back to the corresponding timezone once we are done with the query.
1. Convert the timestamp to UTC on the yfinance/pandas side (a short sketch follows at the end of this answer):
dt.replace(tzinfo=timezone.utc)
2. Convert the UTC timestamp to your local one:
import "timezone"
option location = timezone.location(name: "America/New_York")
See the Flux timezone package documentation for more details.
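On the Python side, the first step for the yfinance DataFrame in the question could look like the sketch below (the index returned by history() is already timezone-aware, so tz_convert is enough; this is an illustration, not tested against your setup):

import yfinance as yf

# Daily bars come back with a timezone-aware DatetimeIndex (America/New_York for AAPL).
df = yf.Ticker('AAPL').history(period="1d")

# Convert the index to UTC before handing the DataFrame to the InfluxDB write API,
# so the stored timestamps are unambiguous.
df.index = df.index.tz_convert('UTC')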

How to handle large in-memory data in Apache Beam Pipeline to run on Google Dataflow Runner

I have the following simple code. The size of the variable word_to_id in memory is ~50 MB, and this causes an error when submitting the pipeline to the Dataflow runner:
413 Request Entity Too Large
word_to_id = {tok: idx for idx, tok in enumerate(vocab)}

def extract_word_ids(tokens):
    return [word_to_id[w] for w in tokens if word_to_id.get(w, None)]

with beam.pipeline.Pipeline(
        options=get_pipeline_option()) as p:
    lines = p | 'Read' >> beam.io.ReadFromText(path)

    word_ids = (
        lines
        | 'TokenizeLines' >> beam.Map(words)
        | 'IntergerizeTokens' >> beam.Map(extract_word_ids)
    )
Please suggest an alternative solution for this.
You can use GCS buckets as sources for both the text and the variable, and pass the variable as a side input. Side inputs can be used as a list, dict, or singleton.
Here is an example of a word count that removes stopwords stored in a GCS bucket:
import re

import apache_beam as beam
from apache_beam import FlatMap
from apache_beam.io import ReadFromText, WriteToText
from apache_beam.transforms.combiners import Count

with beam.Pipeline() as p:
    path = "gs://dataflow-samples/shakespeare/kinglear.txt"
    stopwords_path = "<BUCKET/stopwords>"
    output_path = "<BUCKET>"

    def split_words(text, stopwords):
        words = re.split(r'\W+', text)
        try:
            words.remove('')
        except ValueError:
            pass
        return [x for x in words if x.lower() not in stopwords]

    stopwords_p = (p | "Read Stop Words" >> ReadFromText(stopwords_path)
                     | FlatMap(lambda x: x.split(", ")))

    text = p | "Read Text" >> ReadFromText(path)

    (text | "Split Words" >> FlatMap(split_words, stopwords=beam.pvalue.AsList(stopwords_p))
          | "Count" >> Count.PerElement()
          | "Write" >> WriteToText(file_path_prefix=output_path, file_name_suffix=".txt"))
Finally, I managed to solve it and it worked. I used DoFn.setup to initialize my variable from the GCS bucket.
class IntergerizeTokens(beam.DoFn):
    """Beam line processing function."""

    def __init__(self, vocab_filename):
        self.vocab_filename = vocab_filename

    def setup(self):
        # Read the vocabulary from the GCS bucket (assumed one token per line).
        with tf.io.gfile.GFile(tf.io.gfile.glob(self.vocab_filename + '*')[0], 'r') as fh:
            vocab = [line.strip() for line in fh]
        self.word_to_id = {tok: idx for idx, tok in enumerate(vocab)}
        print('Setup done!')

    def process(self, tokens):
        """Takes a list of tokens and yields the list of their ids."""
        return [[self.word_to_id[w] for w in tokens if self.word_to_id.get(w, None)]]
Now pass the DoFn to ParDo:
with beam.pipeline.Pipeline(
        options=get_pipeline_option()) as p:
    lines = p | 'Read' >> beam.io.ReadFromText(path)

    word_ids = (
        lines
        | 'TokenizeLines' >> beam.Map(words)
        | 'IntergerizeTokens' >> beam.ParDo(IntergerizeTokens(vocab_temp_path))
    )
This is one way to solve it. I think DoFn.setup is good for initializing large variables in memory.

In Kubeflow Pipelines, how to send a list of elements to a lightweight python component?

I am trying to send a list of elements as a PipelineParameter to a lightweight component.
Here is a sample that reproduces the problem. Here is the function:
def my_func(my_list: list) -> bool:
    print(f'my_list is {my_list}')
    print(f'my_list is of type {type(my_list)}')
    print(f'elem 0 is {my_list[0]}')
    print(f'elem 1 is {my_list[1]}')
    return True
And if I execute it with this:
test_data = ['abc', 'def']
my_func(test_data)
It behaves as expected:
my_list is ['abc', 'def']
my_list is of type <class 'list'>
elem 0 is abc
elem 1 is def
but if I wrap it in an op and set up a pipeline:
import kfp

my_op = kfp.components.func_to_container_op(my_func)

@kfp.dsl.pipeline()
def my_pipeline(my_list: kfp.dsl.PipelineParam = kfp.dsl.PipelineParam('my_list', param_type=kfp.dsl.types.List())):
    my_op(my_list)

kfp.compiler.Compiler().compile(my_pipeline, 'my_pipeline.zip')
And then run a pipeline:
client = kfp.Client()
experiment = client.create_experiment('Default')
client.run_pipeline(experiment.id, 'my job', 'my_pipeline.zip', params={'my_list': test_data})
Then it seems at some point my list was converted to a string!
my_list is ['abc', 'def']
my_list is of type <class 'str'>
elem 0 is [
elem 1 is '
Here is a workaround I discovered, serializing arguments as a json string. Not sure this is really the best way...
The bare function becomes:
def my_func(json_arg_str: str) -> bool:
import json
args = json.loads(json_arg_str)
my_list = args['my_list']
print(f'my_list is {my_list}')
print(f'my_list is of type {type(my_list)}')
print(f'elem 0 is {my_list[0]}')
print(f'elem 1 is {my_list[1]}')
return True
Which still works as long as you pass the args as a json string instead of a list:
test_data = '{"my_list":["abc", "def"]}'
my_func(test_data)
Which produces expected results:
my_list is ['abc', 'def']
my_list is of type <class 'list'>
elem 0 is abc
elem 1 is def
And now the pipeline is changed to accept a str instead of a PipelineParam of type kfp.dsl.types.List:
import kfp

my_op = kfp.components.func_to_container_op(my_func)

@kfp.dsl.pipeline()
def my_pipeline(json_arg_str: str):
    my_op(json_arg_str)

kfp.compiler.Compiler().compile(my_pipeline, 'my_pipeline.zip')
Which, when executed like this:
client = kfp.Client()
experiment = client.create_experiment('Default')
client.run_pipeline(experiment.id, 'my job', 'my_pipeline.zip', params={'json_arg_str': test_data})
Produces the same result:
my_list is ['abc', 'def']
my_list is of type <class 'list'>
elem 0 is abc
elem 1 is def
Although it works, I nevertheless find this workaround annoying. What then is the point of kfp.dsl.types.List, if not for allowing a PipelineParam that is a List?
Currently the best option seems to be serializing the arguments. There is one issue related to this: https://github.com/kubeflow/pipelines/issues/1901
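If you go the serialization route, building the JSON string with json.dumps rather than writing it by hand keeps quoting out of the way. A small sketch based on the snippets above:

import json
import kfp

client = kfp.Client()
experiment = client.create_experiment('Default')

test_data = ['abc', 'def']
# Serialize the list into the single string parameter the pipeline expects.
client.run_pipeline(experiment.id, 'my job', 'my_pipeline.zip',
                    params={'json_arg_str': json.dumps({'my_list': test_data})})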

AssertionError on trying to add new entity using matcher on spaCy

I'm trying to match all email-like text in a bunch of documents and add it to a custom NER label called 'EMAIL'.
Here is the code for a test case.
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

EMAIL = nlp.vocab.strings['EMAIL']

def add_email_ent(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    doc.ents += ((EMAIL, start, end),)

matcher.add('EmailPII', add_email_ent, [{'LIKE_EMAIL': True}])

text = u"Hi, this is John. My email is john@ymail.com and an alternate is john@gmail.com"
doc = nlp(text)
matches = matcher(doc)
for i, [match_id, start, end] in enumerate(matches):
    print(i + 1, doc[start:end])
for ent in doc.ents:
    print(ent.text, ent.label_)
Here's what I get when I run this code.
Traceback (most recent call last):
  File "C:/Python27/emailpii.py", line 26, in <module>
    matches = matcher(doc)
  File "matcher.pyx", line 407, in spacy.matcher.Matcher.__call__
  File "C:/Python27/emailpii.py", line 19, in add_event_ent
    doc.ents += ((EMAIL, start, end),)
  File "doc.pyx", line 415, in spacy.tokens.doc.Doc.ents.__get__
  File "span.pyx", line 61, in spacy.tokens.span.Span.__cinit__
AssertionError: 17587345535198158200
However, on running a similar example
import spacy
print "*****************"
print(spacy.__version__)
print "*****************"
from spacy.matcher import Matcher
#from spacy import displacy

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

EVENT = nlp.vocab.strings['EVENT']

def add_event_ent(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    doc.ents += ((EVENT, start, end),)

matcher.add('GoogleIO', add_event_ent,
            [{'ORTH': 'Google'}, {'ORTH': 'I'}, {'ORTH': '/'}, {'ORTH': 'O'}],
            [{'ORTH': 'Google'}, {'ORTH': 'I'}, {'ORTH': '/'}, {'ORTH': 'O'}, {'IS_DIGIT': True}])

text = u"Google I/O was great this year. See you all again in Google I/O 2018"
doc = nlp(text)
matches = matcher(doc)
for i, [match_id, start, end] in enumerate(matches):
    print (i, doc[start:end])
for ent in doc.ents:
    print (ent.text, ent.label_)
#displacy.serve(doc, style = 'ent')
I get the output as desired:
2.0.1
(0, Google I/O)
(1, Google I/O)
(2, Google I/O 2018)
(u'Google I/O', u'EVENT')
(u'this year', u'DATE')
(u'Google I/O 2018', u'EVENT')
Am I missing something here?
I believe your first code fails because you have not added an Entity label for 'EMAIL'. The second code works because EVENT is a pre-existing Entity type.
The documentation is not very clear on what the first argument of the matcher.add() method actually does, but it adds an Entity label for you. Here are two alternatives that should work and clear up the confusion:
Alternative 1:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
#EMAIL = nlp.vocab.strings['EMAIL'] #Not needed

def add_email_ent(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    doc.ents += ((match_id, start, end),)

matcher.add('EMAIL', add_email_ent, [{'LIKE_EMAIL': True}])

text = u"Hi, this is John. My email is john@ymail.com and an alternate is john@gmail.com"
doc = nlp(text)
matches = matcher(doc)
for i, [match_id, start, end] in enumerate(matches):
    print(i + 1, doc[start:end])
for ent in doc.ents:
    print(ent.text, ent.label_)
Alternative 2 (I'm not sure why you'd want to do it this way because you end up with two entity labels serving essentially the same purpose, but provided just for illustration purposes):
import spacy
from spacy.matcher import Matcher
from spacy.pipeline import EntityRecognizer

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

ner = EntityRecognizer(nlp.vocab)
ner.add_label('EMAIL')
EMAIL = nlp.vocab.strings['EMAIL']

def add_email_ent(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    doc.ents += ((EMAIL, start, end),)

matcher.add('EmailPII', add_email_ent, [{'LIKE_EMAIL': True}])

text = u"Hi, this is John. My email is john@ymail.com and an alternate is john@gmail.com"
doc = nlp(text)
matches = matcher(doc)
for i, [match_id, start, end] in enumerate(matches):
    print(i + 1, doc[start:end])
for ent in doc.ents:
    print(ent.text, ent.label_)

Use a string of expressions in a method for python

For example, if I have the string:
"id=100 id2=200 id3=300 ..."
where the variable names, values, and number of expressions can be anything.
How can I then use that string in a method that is used like this:
method(id=100,id2=200,id3=300,...)
I get the string from a command line argument.
We parse them iteratively:
pairs = "id=100 id2=200 id3=300".split(' ')
res = {}
for p in pairs:
k,v = p.rsplit('=', 1)
res[k] = v
print res # prints {'id2': '200', 'id': '100', 'id3': '300'}
# now we can send the dictionary to the method
You can first convert it to a dictionary:
>>> s = "id=100 id2=200 id3=300"
>>> d = dict(a.split('=') for a in s.split())
>>> print d
{'id2': '200', 'id': '100', 'id3': '300'}
And now use it in functions:
>>> method(**d)
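Putting it together with the command-line part of the question, a minimal sketch (method here is a stand-in for the real function, and the parsed values stay strings unless you convert them):

import sys

def method(**kwargs):
    # Stand-in for the real function you want to call.
    for name, value in kwargs.items():
        print(name, value)

# e.g. invoked as: python script.py "id=100 id2=200 id3=300"
arg_string = sys.argv[1] if len(sys.argv) > 1 else "id=100 id2=200 id3=300"
kwargs = dict(pair.split('=', 1) for pair in arg_string.split())
method(**kwargs)  # convert values with int() where needed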
