Setting timezone in AsyncIOScheduler

I'm in the Pacific timezone and I'm creating a discord bot to send a message at 8am in CENTRAL time.
import os
import discord
from discord.ext import commands
from dotenv import load_dotenv
from rich import print
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
load_dotenv()
TOKEN = os.getenv('DISCORD_TOKEN')
intents = discord.Intents.default()
intents.members = True
bot = commands.Bot(command_prefix = '!', intents=intents)
# Will become the good morning message
async def gm():
    c = bot.get_channel(channel_id_removed)
    await c.send("This will be the good morning message.")

@bot.event
async def on_ready():
    for guild in bot.guilds:
        print(
            f'{bot.user} is connected to the following guild:\n'
            f'\t{guild.name} (id: {guild.id})'
        )
#initializing scheduler for time of day sending
scheduler = AsyncIOScheduler()
# Attempts to set the timezone
# scheduler = AsyncIOScheduler(timezone='America/Chicago')
# scheduler = AsyncIOScheduler({'apscheduler.timezone': 'America/Chicago'})
# scheduler.configure(timezone='America/Chicago')
# Set the time for sending
scheduler.add_job(gm, CronTrigger(hour="6", minute="0", second="0"))
#starting the scheduler
scheduler.start()
@bot.event
async def on_member_join(member):
    general_channel = None
    guild_joined = member.guild
    print(guild_joined)
    general_channel = discord.utils.get(guild_joined.channels, name='general')
    print(f'General Channel ID: {general_channel}')
    if general_channel:
        embed = discord.Embed(title="Welcome!", description=f"Welcome to The Dungeon {member.mention}!!")
        await general_channel.send(embed=embed)
bot.run(TOKEN)
Environment:
Windows 10
Python 3.10.4
APScheduler 3.9.1
pytz 2022.1
pytz-deprecation-shim 0.1.0.post0
tzdata 2022.1
tzlocal 4.2
Am I doing something wrong, or is what I'm trying to do simply not supported? It works if I use my local time, so I know the function itself is fine.

You are using the asyncio scheduler, but you're not running an asyncio event loop, so there is no way this could work. Copied from the provided example:
from datetime import datetime
import asyncio
import os

from apscheduler.schedulers.asyncio import AsyncIOScheduler


def tick():
    print('Tick! The time is: %s' % datetime.now())


if __name__ == '__main__':
    scheduler = AsyncIOScheduler()
    scheduler.add_job(tick, 'interval', seconds=3)
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))

    # Execution will block here until Ctrl+C (Ctrl+Break on Windows) is pressed.
    try:
        asyncio.get_event_loop().run_forever()
    except (KeyboardInterrupt, SystemExit):
        pass
The reason it is not working is that, while scheduler.start() instantiates an event loop as a side effect, it expects the loop to be run elsewhere so that the scheduler can do its work.
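For completeness, here is one way the pieces could fit together for the bot in the question. This is only a sketch that builds on the question's bot, gm, and TOKEN objects: the scheduler is created with the Central timezone and started inside on_ready, where discord.py's event loop is already running (bot.run() is what drives that loop).

from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger

# Sketch only: builds on bot, gm and TOKEN from the question above.
scheduler = AsyncIOScheduler(timezone='America/Chicago')

@bot.event
async def on_ready():
    if not scheduler.running:            # on_ready can fire again after reconnects
        # 8:00 AM Central every day; the trigger can carry its own timezone as well
        scheduler.add_job(gm, CronTrigger(hour=8, minute=0, timezone='America/Chicago'))
        scheduler.start()                # the asyncio loop is running here, so this works

bot.run(TOKEN)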

Related

airflow tasks in specific batches

I want to run a set of tasks like this:
a >> [b,c,d] >> [e,f,g] >> [h,i,j,k,l,m]
First run task a, when that is done, run b,c,d in parallel, then when the last of b,c,d is done. start running e,f,g in parallel etc.
But I'm getting an error: unsupported operand type(s) for >>: 'list' and 'list'.
What is the correct syntax for what I want to do?
The error you are getting is because setting dependencies between two lists with the bitshift operator is not supported: [task_a, task_b] >> [task_c, task_d] won't work.
IMHO the easiest and cleanest way to achieve what you are looking for (there are others) is to use TaskGroup and set dependencies between the groups, like this:
from time import sleep

from airflow import DAG
from airflow.utils.dates import days_ago
from airflow.operators.python import PythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.task_group import TaskGroup

default_args = {
    'start_date': days_ago(1)
}


def _execute_task(**kwargs):
    print(f"Task_id: {kwargs['ti'].task_id}")
    sleep(10)


def _create_python_task(name):
    return PythonOperator(
        task_id=f'task_{name}',
        python_callable=_execute_task)


with DAG('parallel_tasks_example', schedule_interval='@once',
         default_args=default_args, catchup=False) as dag:

    task_a = DummyOperator(task_id='task_a')

    with TaskGroup('first_group') as first_group:
        for name in list('bcd'):
            task = _create_python_task(name)

    with TaskGroup('second_group') as second_group:
        for name in list('efg'):
            task = _create_python_task(name)

    with TaskGroup('third_group') as third_group:
        for name in list('hijk'):
            task = _create_python_task(name)

    task_a >> first_group >> second_group >> third_group
From TaskGroup class definition:
A collection of tasks. When set_downstream() or set_upstream() are called on
the TaskGroup, it is applied across all tasks within the group if necessary.
You can find an official example of TaskGroup usage in the Airflow documentation.
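The answer mentions there are other ways; one hedged alternative (assuming Airflow 2.x, and reusing default_args, DummyOperator and _create_python_task from the code above) is cross_downstream, which wires every task in one list to every task in the next list:

from airflow.models.baseoperator import cross_downstream

# Sketch only: reuses default_args, DummyOperator and _create_python_task from the answer above.
with DAG('parallel_tasks_cross_downstream', schedule_interval='@once',
         default_args=default_args, catchup=False) as dag:

    task_a = DummyOperator(task_id='task_a')
    first = [_create_python_task(name) for name in 'bcd']
    second = [_create_python_task(name) for name in 'efg']
    third = [_create_python_task(name) for name in 'hijklm']

    task_a >> first                     # a single task can fan out to a list
    cross_downstream(first, second)     # every task in first -> every task in second
    cross_downstream(second, third)

The TaskGroup approach additionally gives you the grouped graph view in the UI, which is why it is the cleaner option here.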

Display progress on dask.compute(*something) call

I have the following structure on my code using Dask:
@dask.delayed
def calculate(data):
    services = data.service_id
    prices = data.price
    return [services, prices]

output = []
for qid in notebook.tqdm(ids):
    r = calculate(parts[parts.quotation_id == qid])
    output.append(r)
It turns out that, when I call the dask.compute() method on my output list, I don't get any progress indication. The Diagnostic UI doesn't "capture" this action, and I'm not even sure it's properly running (judging by my processor usage, I think it's not).
result = dask.compute(*output)
I'm following the "best practices" article from the dask's documentation:
https://docs.dask.org/en/latest/delayed-best-practices.html
What am I missing?
Edit: I think it's running, because I still get memory leak/high usage warnings. Still no progress indication.
As pointed out in the related post, dask has two methods for displaying the progress: one for "normal" dask, and one for dask.distributed.
Here's a reproducible example:
import random
from time import sleep

import dask
from dask.diagnostics import ProgressBar
from dask.distributed import Client, progress


# simulate work
@dask.delayed
def work(x):
    sleep(x)
    return True


# generate tasks
random.seed(42)
tasks = [work(random.randint(1, 5)) for x in range(50)]
Using plain dask
ProgressBar().register()
dask.compute(*tasks)
produces a text progress bar in the console.
Using dask.distributed
client = Client()
futures = client.compute(tasks)
progress(futures)
produces a progress bar (a widget in a Jupyter notebook, a text bar in the console).
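For the plain-dask case, ProgressBar can also be used as a context manager if you'd rather scope the bar to a single compute() call than register it globally. A small variant of the example above:

from dask.diagnostics import ProgressBar

# Only computations inside the with-block get a progress bar.
with ProgressBar():
    results = dask.compute(*tasks)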

How to let all workers do the same task in dask?

I want to let all workers do the same task, like this:
from dask import distributed
from distributed import Client, LocalCluster
import dask
import socket


def writer(filename, data):
    with open(filename, 'w') as f:
        f.writelines(data)


def get_ip(x):
    return socket.gethostname()


# writer('/data/1.txt', a)
client = Client('192.168.123.1:8786')
A = client.submit(get_ip, 0, workers=['w1', 'w2'], pure=False)
print(client.ncores(),
      client.scheduler_info()
      # dask.config.get('distributed')
      )
A.result()
I have 2 workers, but it only prints 1 worker's hostname.
A simple way to achieve what you want is to use the Client.run method:
client.run(socket.gethostname)
This runs the function on all workers and returns all results. It does not use the normal task scheduling system, which is designed for a very different purpose from what you want.
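Applied to the writer example from the question, a hedged sketch (the file path and data are placeholders): Client.run calls the function with the given arguments on every worker and returns a dict keyed by worker address.

# Sketch only: reuses client, writer and socket from the question above.
hostnames = client.run(socket.gethostname)
print(hostnames)    # e.g. {'tcp://192.168.123.2:40233': 'w1', 'tcp://192.168.123.3:40233': 'w2'}

# The same mechanism writes the file on every worker.
client.run(writer, '/data/1.txt', ['line one\n', 'line two\n'])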

How to explicitly stop a running/live task through dask.?

I have a simple task which is scheduled by dask-scheduler and is running on a worker node.
My requirement is that I want to be able to stop the task on demand, whenever the user wants.
You will have to build this into your task, perhaps by explicitly checking a distributed Variable object in a loop.
from dask.distributed import Variable

stop = Variable()
stop.set(False)


def my_task():
    while True:
        if stop.get():
            return
        else:
            pass  # do stuff


future = client.submit(my_task)  # assumes an existing distributed Client named `client`
# wait
stop.set(True)
You will need something explicit like this. Tasks are normally run in separate threads. As far as I know there is no way to interrupt a thread (though I would be happy to learn otherwise).
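For reference, a slightly fuller sketch of the same Variable pattern, with an explicit name, a client, and a short sleep so the worker doesn't spin at full speed. The scheduler address and variable name are placeholders:

import time
from dask.distributed import Client, Variable

client = Client('192.168.1.2:8786')        # placeholder scheduler address
stop = Variable('stop-my-task', client=client)
stop.set(False)

def my_task():
    # The Variable is serialized with the task and re-reads its value on the worker.
    while not stop.get():
        # ... do a small unit of work here ...
        time.sleep(0.5)                    # poll the flag instead of busy-waiting
    return 'stopped'

future = client.submit(my_task)
# later, from the client side:
stop.set(True)
print(future.result())                     # -> 'stopped'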
@MRocklin, thanks for your suggestion. Here is the machinery that I've built around explicitly stopping the running/live task. Although the code below is not refactored, kindly trace the logic behind it. Thanks - Manoranjan (I will mark your answer; it was really helpful). :) Keep doing good work.
import os
import subprocess
from dask.distributed import Variable, Client
from multiprocessing import Process, current_process
import time

global stop


def my_task(proc):
    print("my_task..")
    print("child proc::", proc)
    p = None
    childProcessCreated = False
    while True:
        print("stop.get()::", stop.get())
        if stop.get():
            print("event triggered for stopping the live task..")
            p.terminate()
            return 100
        else:
            if childProcessCreated == False:
                print("childProcessCreated::", childProcessCreated)
                p = subprocess.Popen("python sleep.py", shell=False)
                childProcessCreated = True
                print("subprocess p::", p, " type::", type(p))
            time.sleep(1)
    print("returning with 20")
    return 20


if __name__ == '__main__':
    clienta = Client("192.168.1.2:8786")

    print("global declaration..")
    global stop
    stop = Variable("name-xx", client=clienta)
    stop.set(False)

    future = clienta.submit(my_task, 10)
    print("future::waiting for 4 sec..in client side", future)
    time.sleep(3)
    print("future after sleeping for sec", future)
    # print("result::", future.result())

    stop.set(True)
    print("future after stopping the child process::", future)
    print("child process should be stopped by now..")
    # print("future::", future)
    # print("future result::", future.result())
    print("over.!")

Is there any fast and efficient way to get abstracts from pubmed?

I would like to download large scientific abstract data for, let's say, about 2000 Pubmed IDs. My Python code is sloppy and seems rather slow. Is there any fast and efficient method to harvest these abstracts?
If this is the fastest method, how do I measure it so I can compare it against other methods, or against a home-versus-work situation (a different ISP may play a part in speed)?
My code is attached below.
import sqlite3
from Bio.Entrez import read, efetch, email, tool
from metapub import PubMedFetcher
import pandas as pd
import requests
from datetime import date
import xml.etree.ElementTree as ET
import time
import sys

reload(sys)
sys.setdefaultencoding('utf8')

Abstract_data = pd.DataFrame(columns=["name", "pmid", "abstract"])


def abstract_download(self, dict_pmids):
    """
    This method returns abstract for a given pmid and add to the abstract data
    """
    index = 0
    baseUrl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    for names in dict_pmids:
        for pmid in dict_pmids[names]:
            try:
                abstract = []
                url = baseUrl+"efetch.fcgi?db=pubmed&id="+pmid+"&rettype=xml"+
                response = requests.request("GET", url, timeout=500).text
                response = response.encode('utf-8')
                root = ET.fromstring(response)
                root_find = root.findall('./PubmedArticle/MedlineCitation/Article/Abstract/')
                if len(root_find) == 0:
                    root_find = root.findall('./PubmedArticle/MedlineCitation/Article/ArticleTitle')
                for i in range(len(root_find)):
                    if root_find[i].text != None:
                        abstract.append(root_find[i].text)
                if abstract is not None:
                    Abstract_data.loc[index] = names, pmid, "".join(abstract)
                    index += 1
            except:
                print "Connection Refused"
                time.sleep(5)
                continue
    return Abstract_data
EDIT: The general error that occurs for this script is seemingly "Connection Refused". See ZF007's answer below for how this was solved.
The code below works. Your script hung on malformed URL construction. Also, if things went wrong inside the script, the reported error was a refused connection. That was in fact not the case; it was the code that processed the retrieved data that failed. I've made some adjustments to get the code working for me, and left comments in place where you need to adjust things, since I don't have the dict_pmids list.
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys, time, requests, sqlite3
import pandas as pd
import xml.etree.ElementTree as ET
from metapub import PubMedFetcher
from datetime import date
from Bio.Entrez import read, efetch, email, tool


def abstract_download(pmids):
    """
    This method returns abstract for a given pmid and add to the abstract data
    """
    index = 0
    baseUrl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    collected_abstract = []

    # Code below disabled to get general abstract extraction from PubMed working. I don't have the dict_pmids list.
    """
    for names in dict_pmids:
        for pmid in dict_pmids[names]:

    Move the working code below to the right to get it in place with the above two
    requirements prior to providing the dict_pmids list.
    # From here the code works up to the next comment. I don't have the dict_pmids list.
    """

    for pmid in pmids:
        print 'pmid : %s\n' % pmid

        abstract = []
        root = ''

        try:
            url = '%sefetch.fcgi?db=pubmed&id=%s&rettype=xml' % (baseUrl, pmid)
            # Check my URL... a line to paste into a web browser like Firefox.
            print 'url', url

            response = requests.request("GET", url, timeout=500).text
            # Check if I got a response.
            print 'response', response

            # response = response.encode('utf-8')
            root = ET.fromstring(response)

        except Exception as inst:
            # Besides a refused connection... the "why" it failed comes in handy
            # to resolve issues at hand if and when they happen.
            print "Connection Refused", inst
            time.sleep(5)
            continue

        root_find = root.findall('./PubmedArticle/MedlineCitation/Article/Abstract/')

        if len(root_find) == 0:
            root_find = root.findall('./PubmedArticle/MedlineCitation/Article/ArticleTitle')

        # Check if I found something.
        print 'root_find : %s\n\n' % root_find

        for i in range(len(root_find)):
            if root_find[i].text != None:
                abstract.append(root_find[i].text)

        Abstract_data = pd.DataFrame(columns=["name", "pmid", "abstract"])

        # Check if I found something.
        # print 'abstract : %s\n' % abstract

        # Code works up to the 'abstract' print statement; the rest is disabled because I don't have the dict_pmids list.
        if abstract is not None:
            # Abstract_data.loc[index] = names, pmid, "".join(abstract)
            index += 1

        collected_abstract.append(abstract)

    # Change back to `return Abstract_data` when the dict_pmids list is supplied.
    # return Abstract_data
    return collected_abstract


if __name__ == '__main__':
    sys.stdout.flush()

    reload(sys)
    sys.setdefaultencoding('utf8')

    pubmedIDs = range(21491000, 21491001)

    mydata = abstract_download(pubmedIDs)

    print 'mydata : %s' % (mydata)
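Beyond fixing the URL, the biggest speed-up for roughly 2000 IDs is to batch many PMIDs into a single efetch request instead of issuing one request per ID; E-utilities accepts comma-separated ID lists. The sketch below is not part of ZF007's answer: it assumes Biopython's Bio.Entrez (which the question already imports from), the batch size and e-mail address are placeholders, and the dictionary keys follow PubMed's usual XML structure. It also times the run, which addresses the "how do I measure it" part of the question.

import time

from Bio import Entrez

Entrez.email = "you@example.com"              # placeholder: NCBI asks for a real address


def fetch_abstracts(pmids, batch_size=200):
    """Fetch abstracts for many PMIDs using one efetch call per batch."""
    abstracts = {}
    for start in range(0, len(pmids), batch_size):
        batch = [str(p) for p in pmids[start:start + batch_size]]
        handle = Entrez.efetch(db="pubmed", id=",".join(batch), retmode="xml")
        records = Entrez.read(handle)
        handle.close()
        for article in records.get("PubmedArticle", []):
            pmid = str(article["MedlineCitation"]["PMID"])
            art = article["MedlineCitation"]["Article"]
            # Fall back to the title when there is no abstract, like the original script does.
            text = art.get("Abstract", {}).get("AbstractText", [art.get("ArticleTitle", "")])
            abstracts[pmid] = " ".join(str(t) for t in text)
        time.sleep(0.34)                      # stay under NCBI's ~3 requests/second limit
    return abstracts


t0 = time.time()
result = fetch_abstracts(list(range(21491000, 21491020)))
print("Fetched %d records in %.1f seconds" % (len(result), time.time() - t0))

Timing the whole run this way also gives you a simple, repeatable number for comparing home and work connections.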
