Write a date from a yfinance Timestamp to InfluxDB and query the date back - timezone support - InfluxDB

I am trying to write date/time data to InfluxDB and then query it back into a DataFrame.
When I write the data, the timestamp looks like this:
ticker= 'AAPL'
import yfinance as yf
df = yf.Ticker('AAPL').history(period="1d").index[0]
print(df)
output:
Timestamp('2023-01-05 00:00:00-0500', tz='America/New_York')
...and when I query the data into a DataFrame and print it, I get this:
df['_time']
output:
0 2023-01-05 05:00:00+00:00
Name: _time, dtype: datetime64[ns, tzutc()]
What do I need to do to write the time to InfluxDB properly?
Please see the full code below:
########## WRITE ##########
import yfinance as yf
import influxdb_client
from influxdb_client.client.write_api import SYNCHRONOUS, PointSettings

ticker = 'AAPL'
token = "my-token"
org = "my-org"
url = "my-url"
bucket = "stocks_us"
retention_policy = "autogen"

client = influxdb_client.InfluxDBClient(url=url, token=token, org=org)
write_api = client.write_api(write_options=SYNCHRONOUS)

df = yf.Ticker('AAPL').history(period="1d")

with client:
    """
    Ingest DataFrame with default tags
    """
    point_settings = PointSettings(**{"ticker": ticker})
    write_api = client.write_api(write_options=SYNCHRONOUS,
                                 point_settings=point_settings)
    write_api.write(bucket=bucket,
                    org="dev",
                    record=df,
                    data_frame_measurement_name="stock_daily_df")
    client.close()

print(df)
and
########## QUERY ##########
import influxdb_client

token = "my-token"
org = "my-org"
url = "my-url"
bucket = "stocks_us"
retention_policy = "autogen"

client = influxdb_client.InfluxDBClient(url=url, token=token, org=org)
query_api = client.query_api()
measurement = "stock_daily_df"

with client:
    """
    Querying ingested data
    """
    query = 'from(bucket:"{}")' \
            ' |> range(start: 0, stop: now())' \
            ' |> filter(fn: (r) => r._measurement == "{}")' \
            ' |> pivot(rowKey:["_time"], columnKey: ["_field"], valueColumn: "_value")' \
            ' |> filter(fn: (r) => r["ticker"] == "AAPL")' \
            ' |> limit(n:10, offset: 0)'.format(bucket, measurement)
    df = query_api.query_data_frame(query=query)
    print(df)

Flux does all of its work in UTC, which is a simple linear clock, and leaves the display of timestamps to the user. So, to keep timestamps consistent, convert them to UTC before inserting the data and convert the query results back to the desired timezone once you are done with the query.
1. Convert the timestamp to UTC on the yfinance side before writing. Note that replace(tzinfo=...) only relabels a timezone-aware timestamp; for a real conversion use, for example:
dt.astimezone(timezone.utc)
(a fuller sketch follows below this list)
2. Convert the UTC timestamps back to your local timezone in the Flux query:
import "timezone"
option location = timezone.location(name: "America/New_York")
See the Flux timezone package documentation for more details.
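A minimal sketch of both sides, assuming the same df and query string as in the question; tz_convert performs a real conversion of the tz-aware yfinance index, and the Flux timezone option (as suggested above) tells time-aware Flux functions which location to use:

import yfinance as yf

df = yf.Ticker('AAPL').history(period="1d")
# convert the America/New_York index to UTC before writing to InfluxDB
df.index = df.index.tz_convert('UTC')
print(df.index[0])  # e.g. Timestamp('2023-01-05 05:00:00+0000', tz='UTC')

# on the query side, prepend the timezone option to the Flux script
query = (
    'import "timezone"\n'
    'option location = timezone.location(name: "America/New_York")\n'
    'from(bucket:"stocks_us") |> range(start: 0, stop: now())'
)

# after query_data_frame, the UTC _time column can be converted back for display:
# df['_time'] = df['_time'].dt.tz_convert('America/New_York')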

Related

ChildProcess closes all stdio with code 1

I was trying to deploy an ML model using Node.js with the help of the child_process package. While running __predict(), it takes too long and ends with a code 1 error.
Here I share all the related code to help decode the issue:
Model Python code -->
import keras
import time

start = time.time()
encoder = keras.models.load_model('enc', compile=False)
decoder = keras.models.load_model('dec', compile=False)

import numpy as np
from flask import Flask, request, jsonify, render_template
import tensorflow as tf
import pickle
import string
import re
from keras_preprocessing.sequence import pad_sequences

def initialize_hidden_state():
    return tf.zeros((1, 1024))

eng_tokenizer, hin_tokenizer = pickle.load(open('tokenizer.pkl', 'rb'))

def clean(text):
    text = text.lower()
    special_char = set(string.punctuation + '।')  # Set of all special characters
    # Remove all the special characters
    text = ''.join(word for word in text if word not in special_char)
    seq = eng_tokenizer.texts_to_sequences([text])
    seq = pad_sequences(seq, maxlen=23, padding='post')
    return seq

def __predict(data):
    # Get the data from the POST request.
    # data = request.get_json(force=True)
    clean_input = clean(data)
    # Make prediction using model loaded from disk as per the data.
    hidden_enc = initialize_hidden_state()
    enc_out, enc_hidden = encoder(clean_input, hidden_enc)
    result = ''
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims(hin_tokenizer.texts_to_sequences(['<Start>'])[0], 0)
    # ------------------------------------------------------------------
    for t in range(25):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        x = hin_tokenizer.sequences_to_texts([[predicted_id]])[0]
        if x == 'end':
            break
        result += x + ' '
        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)
    CLEANR = re.compile(r"([A-Za-z])", re.DOTALL)
    result = re.sub(CLEANR, '', result)
    return result

# import json
# with open('data.json', 'r') as openfile:
#     json_object = json.load(openfile).get('data')

data = __predict("file")
end = time.time()
# print(start-end)
data1 = data + "abcd"
print(data1)
# print("abcd")
# dictionary = {
#     "data": data,
# }
# json_object = json.dumps(dictionary, indent=2)
# with open("result.json", "w") as outfile:
#     outfile.write(json_object)
When I use print("abcd") or print(start-end), the script returns a result and ends with code 0. But when I print the model output (print(data)), nothing comes back and it ends with code 1.
Here is the childProcess code -->
app.get('/', (req, res) => {
  let dataToSend
  let largeDataSet = []
  // spawn new child process to call the python script
  const python = spawn('python', ['app.py'])
  // console.log(python);
  // collect data from script
  python.stdout.on('data', function (data) {
    console.log('Pipe data from python script ...')
    // dataToSend = data;
    largeDataSet.push(data)
  })
  // in close event we are sure that stream is from child process is closed
  python.on('close', (code) => {
    console.log(`child process close all stdio with code ${code}`)
    // send data to browser
    // largeDataSet = []
    console.log(largeDataSet.join(''));
    res.send(largeDataSet.join(''))
  })
})
Here is the error --->
child process close all stdio with code 1
Please help; I tried to understand the problem but could not even pin down where it comes from.
Thanks in advance!
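One way to surface the underlying Python error, offered here as a sketch rather than taken from the post (it assumes the __predict defined in the script above), is to trap exceptions in app.py and write the traceback to stderr before exiting, so the non-zero exit code comes with a readable message that Node can capture from python.stderr:

import sys
import traceback

try:
    data = __predict("file")
    print(data + "abcd")
except Exception:
    # the traceback goes to stderr, which the Node process can listen to separately
    traceback.print_exc(file=sys.stderr)
    sys.exit(1)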

Is there a function to get log and pct change?

I would like to compare the log and percentage change of the two symbols, but the following error appears:
KeyError: 'Adj Close'
import datetime
import pandas as pd
import numpy as np
import yfinance as yf

start = datetime.datetime(2017, 10, 1)
end = datetime.datetime.now()
symbols = ['BTC-USD', 'ETH-USD']
df = pd.DataFrame()
for i in symbols:
    data = yf.download(i, start=None, end=None, show_errors=("True"),
                       period="4y", interval="1mo")
    df[i] = data['Adj Close'].pct_change().dropna()
df['log_stuff'] = \
    np.log(df['Adj Close'].astype('float64') / df['Adj Close'].astype('float64').shift(1))
df[['pct_change', 'log_stuff', 'df']].plot();
You could try the following. Please note that you can also pass a list to download(), so no loop is required.
import numpy as np
import pandas as pd
import yfinance as yf
symbols = ['BTC-USD', 'ETH-USD']
data = yf.download(symbols, period="4y", interval="1mo")
# calculate pct return
pct_data = data['Adj Close'].pct_change()
pct_data = pct_data.add_suffix('_pct')
# calculate log returns
log_data = np.log(data['Adj Close']) - np.log(data['Adj Close'].shift(1))
log_data = log_data.add_suffix('_log')
# combine returns and drop na values
combined_data = pd.concat([pct_data,log_data], axis=1).dropna()
print(combined_data)
This will yield the following output:
             BTC-USD_pct  ETH-USD_pct  BTC-USD_log  ETH-USD_log
Date
2017-12-01      0.383326     0.692483     0.324490     0.526197
2018-01-01     -0.277987     0.477813    -0.325713     0.390564
2018-02-01      0.017298    -0.235276     0.017150    -0.268240
...
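As a quick sanity check (not part of the original answer), a log return is ln(1 + simple return), so the two sets of columns can be cross-validated:

import numpy as np

# log(1 + pct return) should match the log return columns up to floating point noise
check = np.log1p(pct_data).to_numpy() - log_data.to_numpy()
print(np.nanmax(np.abs(check)))  # expected to be ~0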

Change Named Entity Recognition Format from ENAMEX to CoNLL

I have a dataset which is in ENAMEX format like this:
<ENAMEX TYPE="LOCATION">Italy</ENAMEX>'s business world was rocked by the announcement <TIMEX TYPE="DATE">last Thursday</TIMEX> that Mr. <ENAMEX TYPE="PERSON">Verdi</ENAMEX> would leave his job as vicepresident of <ENAMEX TYPE="ORGANIZATION">Music Masters of Milan, Inc</ENAMEX> to become operations director of <ENAMEX TYPE="ORGANIZATION">Arthur Andersen</ENAMEX>.
I want to change it into CoNLL format:
Italy LOCATION
's O
business O
world O
was O
rocked O
by O
the O
announcement O
last DATE
Thursday DATE
...
. O
How can I do that? Is there a standard script for such format conversion?
I wrote one myself that worked for me, though it is not heavily tested:
from __future__ import unicode_literals
import os
from os import path
import re
import en_core_web_sm  # spacy

# to convert formats such as <ENAMEX type="LOCATION">Italy</ENAMEX> is experiencing an economic boom.
def xml_iter(file_):
    with open(file_, 'r') as fin:
        for line in fin:
            yield line.strip()

def markupline2bio(line):
    #print(line.split('\t')[0])
    record = line.split('\t')[0]
    #print(record)
    #print(parse(record))
    #print(record[35:40], record[81:90])
    #tags = re.findall(r'<ENAMEX\s+TYPE=\"(.+?)\">(.+?)</ENAMEX>', record)
    prev_start = 0
    prev_end = 0
    all_tokens = []
    all_tags = []
    for f in re.finditer(r'<ENAMEX\s+TYPE=\"(.+?)\">(.+?)</ENAMEX>', record):
        #print(record[f.start(0):f.end(0)], f.start(0), f.end(0))
        annotations = re.findall(r'<ENAMEX\s+TYPE=\"(.+?)\">(.+?)</ENAMEX>', record[f.start(0):f.end(0)])
        before_text = record[prev_end:f.start(0)]
        prev_start, prev_end = f.start(0), f.end(0)
        for tok in nlp(before_text):
            if str(tok).strip():
                all_tokens.append(tok)
                all_tags.append('O')
        for phrasetag in annotations:
            tag, phrase = annotations[0]
            tokens = nlp(phrase)
            for entity_tok_index, tok in enumerate(tokens):
                if str(tok).strip():
                    all_tokens.append(tok)
                    if entity_tok_index == 0:
                        all_tags.append("B-" + tag)
                    else:
                        all_tags.append("I-" + tag)
                else:
                    entity_tok_index -= 1
    after_text = record[prev_end:]
    for tok in nlp(after_text):
        if str(tok).strip():
            all_tokens.append(tok)
            all_tags.append('O')
    return all_tokens, all_tags

if __name__ == '__main__':
    data_dir = './data/indonesian_bert_all/Indonesian/ner/'
    xml_iterator = xml_iter(os.path.join(data_dir, 'data_train_ugm.txt'))
    output_file = os.path.join(data_dir, 'data_train_ugm.bio')
    #nlp = spacy.load("en_core_web_sm")
    nlp = en_core_web_sm.load()
    with open(output_file, 'w') as fout:
        for i, line in enumerate(xml_iterator):
            if i > 10:
                #break
                pass
            all_tokens, all_tags = markupline2bio(line.strip())
            #print(all_tokens)
            #print(all_tags)
            #print(line)
            for tok, tag in zip(all_tokens, all_tags):
                #print(tok, tag)
                fout.write(str(tok) + '\t' + tag)
                fout.write('\n')
            fout.write('\n')
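For a quick look at the core extraction step (a small sketch, not part of the original script), the same ENAMEX regular expression can be run on the sentence from the question:

import re

sentence = ('<ENAMEX TYPE="LOCATION">Italy</ENAMEX>\'s business world was rocked by the announcement '
            'that Mr. <ENAMEX TYPE="PERSON">Verdi</ENAMEX> would leave his job as vicepresident of '
            '<ENAMEX TYPE="ORGANIZATION">Music Masters of Milan, Inc</ENAMEX>.')
# each match is a (TYPE, surface text) pair; tokens outside any tag get the O label
print(re.findall(r'<ENAMEX\s+TYPE=\"(.+?)\">(.+?)</ENAMEX>', sentence))
# [('LOCATION', 'Italy'), ('PERSON', 'Verdi'), ('ORGANIZATION', 'Music Masters of Milan, Inc')]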

how to convert a pandas str.split call to dask

I have a dask data frame where the index is a string which looks like this:
12/09/2016 00:00;32.0046;-106.259
12/09/2016 00:00;32.0201;-108.838
12/09/2016 00:00;32.0224;-106.004
(it's basically a string encoding the datetime;latitude;longitude of the row)
I'd like to split that while still in the dask context to individual columns representing each of the fields.
I can do that with a pandas dataframe as:
df['date'], df['Lat'], df['Lon'] = df.index.str.split(';', 2).str
But that doesn't work in dask with any of the approaches I've tried. If I directly substitute a dask df for the pandas df, I get the error:
'Index' object has no attribute 'str'
If I use the column name instead of index as:
forecastDf['date'], forecastDf['Lat'], forecastDf['Lon'] = forecastDf['dateLocation'].str.split(';', 2).str
I get the error:
TypeError: 'StringAccessor' object is not iterable
Here is a runnable example of this working in pandas:
import pandas as pd
df = pd.DataFrame()
df['dateLocation'] = ['12/09/2016 00:00;32.0046;-106.259','12/09/2016 00:00;32.0201;-108.838','12/09/2016 00:00;32.0224;-106.004']
df = df.set_index('dateLocation')
df['date'], df['Lat'], df['Lon'] = df.index.str.split(';', 2).str
df.head()
Here is the error I get if I directly convert that to dask
import dask.dataframe as dd
dd = dd.from_pandas(df, npartitions=1)
dd['date'], dd['Lat'], dd['Lon'] = dd.index.str.split(';', 2).str
>>TypeError: 'StringAccessor' object is not iterable
forecastDf['date'] = forecastDf['dateLocation'].str.partition(';')[0]
forecastDf['Lat'] = forecastDf['dateLocation'].str.partition(';')[2]
forecastDf['Lon'] = forecastDf['dateLocation'].str.partition(';')[4]
Let me know if this works for you!
First make sure the column is string dtype
forecastDD['dateLocation'] = forecastDD['dateLocation'].astype('str')
Then you can use this to split in dask
splitColumns = client.persist(forecastDD['dateLocation'].str.split(';',2))
You can then index the columns in the new dataframe splitColumns and add them back to the original data frame.
forecastDD = forecastDD.assign(
    # split order matches the data: date;Lat;Lon
    date=splitColumns.apply(lambda x: x[0], meta=('date', 'object')),
    Lat=splitColumns.apply(lambda x: x[1], meta=('Lat', 'f8')),
    Lon=splitColumns.apply(lambda x: x[2], meta=('Lon', 'f8')))
Unfortunately I couldn't figure out how to do it without calling compute and creating the temp dataframe.
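In more recent dask versions (an assumption; check your release), Series.str.split also accepts expand=True together with n, which avoids the apply calls entirely. A minimal sketch:

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({'dateLocation': ['12/09/2016 00:00;32.0046;-106.259',
                                     '12/09/2016 00:00;32.0201;-108.838']})
ddf = dd.from_pandas(pdf, npartitions=1)
# n=2 tells dask how many splits (and therefore output columns) to expect
parts = ddf['dateLocation'].str.split(';', n=2, expand=True)
ddf = ddf.assign(date=parts[0], Lat=parts[1], Lon=parts[2])
print(ddf.compute())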

How can I check the memory usage of an iPython notebook? [duplicate]

The memory on my lab's server (Ubuntu) is constantly filling up because users never shut down old notebooks. I would like to get a better idea of how much memory each notebook is taking up. I can summarize the (rough) memory usage of all Jupyter notebooks run by each user, but I would like the total memory usage of each individual notebook so that I can shut down those particular memory hogs (or tell another user to shut theirs down). I quickly put together the following code to get the approximate memory usage per Jupyter kernel, but I don't know how to associate the kernel IDs with a particular notebook.
import os
import pwd
import psutil
import pandas as pd

UID = 1
EUID = 2

pids = [pid for pid in os.listdir('/proc') if pid.isdigit()]

df = []
for pid in pids:
    try:
        ret = open(os.path.join('/proc', pid, 'cmdline'), 'rb').read()
    except IOError:  # proc has already terminated
        continue
    # jupyter notebook processes
    if len(ret) > 0 and 'share/jupyter/runtime' in ret:
        process = psutil.Process(int(pid))
        mem = process.memory_info()[0]
        # user name for pid
        for ln in open('/proc/%d/status' % int(pid)):
            if ln.startswith('Uid:'):
                uid = int(ln.split()[UID])
                uname = pwd.getpwuid(uid).pw_name
        # user, pid, memory, proc_desc
        df.append([uname, pid, mem, ret])

df = pd.DataFrame(df)
df.columns = ['user', 'pid', 'memory', 'proc_desc']
df
I made some improvements to sharchaea's script for portability and speed.
Mainly: only check ports that notebooks are running on, check different hostname options, improve the kernel process check, and check for either ipython or jupyter.
import argparse
import re
import subprocess

import pandas as pd
import psutil
import requests
import tabulate

kernel_regex = re.compile(r".+kernel-(.+)\.json")
notebook_regex = re.compile(r"(https?://([^:/]+):?(\d+)?)/?(\?token=([a-z0-9]+))?")

def get_proc_info():
    pids = psutil.pids()

    # memory info from psutil.Process
    df_mem = []
    for pid in pids:
        try:
            proc = psutil.Process(pid)
            cmd = " ".join(proc.cmdline())
        except psutil.NoSuchProcess:
            continue
        if len(cmd) > 0 and ("jupyter" in cmd or "ipython" in cmd) and "kernel" in cmd:
            # kernel
            kernel_ID = re.sub(kernel_regex, r"\1", cmd)
            # memory
            mem = proc.memory_info()[0] / float(1e9)
            uname = proc.username()
            # user, pid, memory, kernel_ID
            df_mem.append([uname, pid, mem, kernel_ID])

    df_mem = pd.DataFrame(df_mem)
    df_mem.columns = ["user", "pid", "memory_GB", "kernel_ID"]
    return df_mem

def get_running_notebooks():
    notebooks = []
    for n in subprocess.Popen(
        ["jupyter", "notebook", "list"], stdout=subprocess.PIPE
    ).stdout.readlines()[1:]:
        match = re.match(notebook_regex, n.decode())
        if match:
            base_url, host, port, _, token = match.groups()
            notebooks.append({"base_url": base_url, "token": token})
        else:
            print("Unknown format: {}".format(n.decode()))
    return notebooks

def get_session_info(password=None):
    df_nb = []
    kernels = []
    for notebook in get_running_notebooks():
        s = requests.Session()
        if notebook["token"] is not None:
            s.get(notebook["base_url"] + "/?token=" + notebook["token"])
        else:
            # do a get to the base url to get the session cookies
            s.get(notebook["base_url"])
        if password is not None:
            # Seems jupyter auth process has changed, need to first get a cookie,
            # then add that cookie to the data being sent over with the password
            data = {"password": password}
            data.update(s.cookies)
            s.post(notebook["base_url"] + "/login", data=data)

        res = s.get(notebook["base_url"] + "/api/sessions")
        if res.status_code != 200:
            raise Exception(res.json())
        for sess in res.json():
            kernel_ID = sess["kernel"]["id"]
            if kernel_ID not in kernels:
                kernel = {
                    "kernel_ID": kernel_ID,
                    "kernel_name": sess["kernel"]["name"],
                    "kernel_state": sess["kernel"]["execution_state"],
                    "kernel_connections": sess["kernel"]["connections"],
                    # "notebook_url": notebook["base_url"] + "/notebook/" + sess["id"],
                    "notebook_path": sess["path"],
                }
                kernel.update(notebook)
                df_nb.append(kernel)
                kernels.append(kernel_ID)

    df_nb = pd.DataFrame(df_nb)
    del df_nb["token"]
    return df_nb

def parse_args():
    parser = argparse.ArgumentParser(description="Find memory usage.")
    parser.add_argument("--password", help="password (only needed if pass-protected)")
    return parser.parse_args()

def main(password=None, print_ascii=False):
    df_mem = get_proc_info()
    df_nb = get_session_info(password)

    # joining tables
    df = pd.merge(df_nb, df_mem, on=["kernel_ID"], how="inner")
    df = df.sort_values("memory_GB", ascending=False).reset_index(drop=True)

    if print_ascii:
        print(tabulate.tabulate(df, headers=(df.columns.tolist())))
    return df

if __name__ == "__main__":
    args = vars(parse_args())
    main(args["password"], print_ascii=True)
I'll probably continue to make updates to this at this gist
edit: Code has been updated to work with newer versions of Jupyter using token authentication, to leverage only psutil making it Windows compatible, and to work on Python 3.
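A minimal usage sketch (not from the original answer; the module name nb_mem is hypothetical), calling the functions above directly instead of going through the CLI:

# assumes the script above was saved as nb_mem.py on the notebook server
from nb_mem import main

df = main(password=None, print_ascii=True)  # pass password="..." for password-protected servers
print(df[['user', 'notebook_path', 'memory_GB']])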
I seem to have figured out a working solution to my own problem:
import os
import pwd
import psutil
import re
import string
import json
import urllib2
import pandas as pd

UID = 1
EUID = 2

regex = re.compile(r'.+kernel-(.+)\.json')

pids = [pid for pid in os.listdir('/proc') if pid.isdigit()]

# memory info from psutil.Process
df_mem = []
for pid in pids:
    try:
        ret = open(os.path.join('/proc', pid, 'cmdline'), 'rb').read()
    except IOError:  # proc has already terminated
        continue
    # jupyter notebook processes
    if len(ret) > 0 and 'share/jupyter/runtime' in ret:
        # kernel
        kernel_ID = re.sub(regex, r'\1', ret)
        kernel_ID = filter(lambda x: x in string.printable, kernel_ID)
        # memory
        process = psutil.Process(int(pid))
        mem = process.memory_info()[0] / float(1e9)
        # user name for pid
        for ln in open('/proc/{}/status'.format(int(pid))):
            if ln.startswith('Uid:'):
                uid = int(ln.split()[UID])
                uname = pwd.getpwuid(uid).pw_name
        # user, pid, memory, kernel_ID
        df_mem.append([uname, pid, mem, kernel_ID])

df_mem = pd.DataFrame(df_mem)
df_mem.columns = ['user', 'pid', 'memory_GB', 'kernel_ID']

# notebook info from assessing ports
df_nb = []
for port in xrange(5000, 30000):
    sessions = None
    try:
        url = 'http://127.0.0.1:{}/api/sessions'.format(port)
        sessions = json.load(urllib2.urlopen(url))
    except urllib2.URLError:
        sessions = None
    if sessions:
        for sess in sessions:
            kernel_ID = str(sess['kernel']['id'])
            notebook_path = sess['notebook']['path']
            df_nb.append([port, kernel_ID, notebook_path])

df_nb = pd.DataFrame(df_nb)
df_nb.columns = ['port', 'kernel_ID', 'notebook_path']

# joining tables
df = pd.merge(df_nb, df_mem, on=['kernel_ID'], how='inner')
df.sort(['memory_GB'], ascending=False)
