Airflow tasks in specific batches

I want to run a set of tasks like this:
a >> [b,c,d] >> [e,f,g] >> [h,i,j,k,l,m]
First run task a; when that is done, run b, c, d in parallel; then when the last of b, c, d is done, start running e, f, g in parallel, and so on.
But I'm getting an error: unsupported operand type(s) for >>: 'list' and 'list'.
What is the correct syntax for what I want to do?

The error you are getting is related to the fact that dependencies between lists set with the bitshift operator are not supported: [task_a, task_b] >> [task_c, task_d] won't work.
IMHO the easiest and cleanest way to achieve what you are looking for (there are others) is to use TaskGroup and set dependencies between the groups, like this:
from time import sleep
from airflow import DAG
from airflow.utils.dates import days_ago
from airflow.operators.python import PythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.task_group import TaskGroup

default_args = {
    'start_date': days_ago(1)
}

def _execute_task(**kwargs):
    print(f"Task_id: {kwargs['ti'].task_id}")
    sleep(10)

def _create_python_task(name):
    return PythonOperator(
        task_id=f'task_{name}',
        python_callable=_execute_task)

with DAG('parallel_tasks_example', schedule_interval='@once',
         default_args=default_args, catchup=False) as dag:

    task_a = DummyOperator(task_id='task_a')

    with TaskGroup('first_group') as first_group:
        for name in list('bcd'):
            task = _create_python_task(name)

    with TaskGroup('second_group') as second_group:
        for name in list('efg'):
            task = _create_python_task(name)

    with TaskGroup('third_group') as third_group:
        for name in list('hijk'):
            task = _create_python_task(name)

    task_a >> first_group >> second_group >> third_group
From TaskGroup class definition:
A collection of tasks. When set_downstream() or set_upstream() are called on
the TaskGroup, it is applied across all tasks within the group if necessary.
You can find an official example about it here.
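For completeness, one of the other options is cross_downstream, which wires every task in one list to every task in the next without grouping them. A minimal sketch, assuming Airflow 2.x and that a through m are the operators from the question:

from airflow.models.baseoperator import cross_downstream

a >> [b, c, d]
# every task in [b, c, d] becomes upstream of every task in [e, f, g], and so on
cross_downstream([b, c, d], [e, f, g])
cross_downstream([e, f, g], [h, i, j, k, l, m])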

Related

Setting timezone in AsyncIOScheduler

I'm in the Pacific timezone and I'm creating a discord bot to send a message at 8am in CENTRAL time.
import os
import discord
from discord.ext import commands
from dotenv import load_dotenv
from rich import print
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger

load_dotenv()
TOKEN = os.getenv('DISCORD_TOKEN')

intents = discord.Intents.default()
intents.members = True
bot = commands.Bot(command_prefix='!', intents=intents)

# Will become the good morning message
async def gm():
    c = bot.get_channel(channel_id_removed)
    await c.send("This will be the good morning message.")

@bot.event
async def on_ready():
    for guild in bot.guilds:
        print(
            f'{bot.user} is connected to the following guild:\n'
            f'\t{guild.name} (id: {guild.id})'
        )

# initializing scheduler for time of day sending
scheduler = AsyncIOScheduler()

# Attempts to set the timezone
# scheduler = AsyncIOScheduler(timezone='America/Chicago')
# scheduler = AsyncIOScheduler({'apscheduler.timezone': 'America/Chicago'})
# scheduler.configure(timezone='America/Chicago')

# Set the time for sending
scheduler.add_job(gm, CronTrigger(hour="6", minute="0", second="0"))

# starting the scheduler
scheduler.start()

@bot.event
async def on_member_join(member):
    general_channel = None
    guild_joined = member.guild
    print(guild_joined)
    general_channel = discord.utils.get(guild_joined.channels, name='general')
    print(f'General Channel ID: {general_channel}')
    if general_channel:
        embed = discord.Embed(title="Welcome!", description=f"Welcome to The Dungeon {member.mention}!!")
        await general_channel.send(embed=embed)

bot.run(TOKEN)
Environment:
Windows 10
Python 3.10.4
APScheduler 3.9.1
pytz 2022.1
pytz-deprecation-shim 0.1.0.post0
tzdata 2022.1
tzlocal 4.2
I'm just wondering if I'm doing something wrong? Or if what I'm trying to do simply isn't supported? It works if I use my local time so I know the function is ok.
You are using the asyncio scheduler but you're not running an asyncio event loop, so there is no way this could work. Copy/paste from the provided example:
from datetime import datetime
import asyncio
import os

from apscheduler.schedulers.asyncio import AsyncIOScheduler

def tick():
    print('Tick! The time is: %s' % datetime.now())

if __name__ == '__main__':
    scheduler = AsyncIOScheduler()
    scheduler.add_job(tick, 'interval', seconds=3)
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))

    # Execution will block here until Ctrl+C (Ctrl+Break on Windows) is pressed.
    try:
        asyncio.get_event_loop().run_forever()
    except (KeyboardInterrupt, SystemExit):
        pass
The reason it is not working is that, while scheduler.start() instantiates an event loop as a side effect, it expects that loop to be run elsewhere so that the scheduler can do its work.
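In the bot from the question, discord.py's bot.run() is what drives the asyncio loop, so one way to apply this is to create and start the scheduler once that loop is already running, for example inside on_ready. A sketch, assuming the gm coroutine and imports from the question; the timezone arguments are the standard APScheduler ones:

@bot.event
async def on_ready():
    # The event loop started by bot.run() is running here,
    # so the AsyncIOScheduler has a loop to attach its jobs to.
    scheduler = AsyncIOScheduler(timezone='America/Chicago')
    scheduler.add_job(gm, CronTrigger(hour=8, minute=0, timezone='America/Chicago'))
    scheduler.start()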

When I run my Airflow, no task is created

I have a problem. The demo is very simple, but after deploying it to Airflow, execution does not achieve the desired effect. Here's my code:
"""
import pytz
from airflow import DAG
from datetime import datetime, timedelta
from airflow.operators.latest_only_operator import LatestOnlyOperator
from airflow.operators.python_operator import PythonOperator
tz = pytz.timezone('Asia/Shanghai')
dt = datetime.now(tz)
utc_dt = dt.astimezone(pytz.utc).replace(tzinfo=None)
default_args = {
'owner': 'syroot',
"start_date": utc_dt - timedelta(minutes=2),
"depends_on_past": False,
'email': ['zhaosw#sunnyoptical.com'],
'email_on_failure': False,
'email_on_retry': False,
"retries": 1,
"retry_delay": timedelta(seconds=5)
}
dag = DAG(
"demo1",
catchup=False,
default_args=default_args,
schedule_interval="*/2 * * * *",
)
def print_hello():
return 'Hello world!'
hello_operator = PythonOperator(
task_id='hello_task',
python_callable=print_hello,
dag=dag)
"""
But the results were not good: the DAG run succeeds, but no task is created. I cannot find any info in the Task Instances menu, but I can find the DAG run log in the DAG Runs menu.
I don't believe your scheduler is able to run the DAG due to your dynamic start date.
Try changing "start_date": utc_dt - timedelta(minutes=2), to a static date like "start_date": datetime(2019,12,9),. That should allow the scheduler to pick it up!
It's generally recommended not to set your start_date dynamically.
Taken from Airflow FAQ:
We recommend against using dynamic values as start_date, especially
datetime.now() as it can be quite confusing. The task is triggered
once the period closes, and in theory an @hourly DAG would never get
to an hour after now as now() moves along.
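For reference, a sketch of the question's default_args with a static start_date (the remaining keys stay as they were):

from datetime import datetime, timedelta

default_args = {
    'owner': 'syroot',
    'start_date': datetime(2019, 12, 9),  # static instead of utc_dt - timedelta(minutes=2)
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=5)
}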
You have to specify which task to execute. It looks like you are missing hello_operator in your DAG:
def print_hello():
    return 'Hello world!'

hello_operator = PythonOperator(
    task_id='hello_task',
    python_callable=print_hello,
    dag=dag)

hello_operator  # <-- add this

Dask opportunistic caching in custom graphs

I have a custom DAG such as:
dag = {'load': (load, 'myfile.txt'),
       'heavy_comp': (heavy_comp, 'load'),
       'simple_comp_1': (sc_1, 'heavy_comp'),
       'simple_comp_2': (sc_2, 'heavy_comp'),
       'simple_comp_3': (sc_3, 'heavy_comp')}
And I'm looking to compute the keys simple_comp_1, simple_comp_2, and simple_comp_3, which I do as follows:
import dask
from dask.distributed import Client
from dask_yarn import YarnCluster
task_1 = dask.get(dag, 'simple_comp_1')
task_2 = dask.get(dag, 'simple_comp_2')
task_3 = dask.get(dag, 'simple_comp_3')
tasks = [task_1, task_2, task_3]
cluster = YarnCluster()
cluster.scale(3)
client = Client(cluster)
dask.compute(tasks)
cluster.shutdown()
It seems that, without caching, computing these 3 keys leads to heavy_comp also being computed 3 times. Since this is a heavy computation, I tried to implement opportunistic caching from here, as follows:
from dask.cache import Cache
cache = Cache(2e9)
cache.register()
However, when I tried to print the results of what was being cached I got nothing:
>>> cache.cache.data
[]
>>> cache.cache.heap.heap
{}
>>> cache.cache.nbytes
{}
I even tried increasing the cache size to 6GB, however to no effect. Am I doing something wrong? How can I get Dask to cache the result of the key heavy_comp?
Expanding on MRocklin's answer, and to format the code from the comments below the question:
Computing the entire graph at once works as you would expect it to: heavy_comp is only executed once, which is what you want. Consider the following code you provided in the comments, completed with stub function definitions:
def load(fn):
    print('load')
    return fn

def sc_1(i):
    print('sc_1')
    return i

def sc_2(i):
    print('sc_2')
    return i

def sc_3(i):
    print('sc_3')
    return i

def heavy_comp(i):
    print('heavy_comp')
    return i

def merge(*args):
    print('merge')
    return args

dag = {'load': (load, 'myfile.txt'),
       'heavy_comp': (heavy_comp, 'load'),
       'simple_comp_1': (sc_1, 'heavy_comp'),
       'simple_comp_2': (sc_2, 'heavy_comp'),
       'simple_comp_3': (sc_3, 'heavy_comp'),
       'merger_comp': (merge, 'sc_1', 'sc_2', 'sc_3')}
import dask
result = dask.get(dag, 'merger_comp')
print('result:', result)
It outputs:
load
heavy_comp
sc_1
sc_2
sc_3
merge
result: ('sc_1', 'sc_2', 'sc_3')
As you can see, "heavy_comp" is only printed once, showing that the function heavy_comp has only been executed once.
The opportunistic cache in the core Dask library only works for the single-machine scheduler, not the distributed scheduler.
However, if you just compute the entire graph at once, Dask will hold onto intermediate values intelligently. If there are values that you would like to hold onto regardless, you might also look at the persist function.
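Concretely, for the graph in the question, asking for all three keys in a single call lets the scheduler share the heavy_comp result. A sketch, reusing the dag and client defined above:

keys = ['simple_comp_1', 'simple_comp_2', 'simple_comp_3']

# Single-machine scheduler: one call, shared intermediates, heavy_comp runs once.
results = dask.get(dag, keys)

# Or, through the distributed client from the question:
# results = client.get(dag, keys)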

How to let all workers do the same task in Dask?

I want to let all workers do the same task, like this:
from dask import distributed
from distributed import Client, LocalCluster
import dask
import socket

def writer(filename, data):
    with open(filename, 'w') as f:
        f.writelines(data)

def get_ip(x):
    return socket.gethostname()

# writer('/data/1.txt', a)
client = Client('192.168.123.1:8786')
A = client.submit(get_ip, 0, workers=['w1', 'w2'], pure=False)
print(client.ncores(),
      client.scheduler_info()
      # dask.config.get('distributed')
      )
A.result()
I have 2 workers, but it only prints 1 worker's hostname.
A simple way to achieve what you want is to use the Client.run method
client.run(socket.gethostname)
This runs the function on all workers and returns all results. It does not use the normal task scheduling system, which is designed for a very different purpose from what you want.
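For example, applied to the code in the question (a sketch; the file path and data passed to writer are just placeholders):

# Run on every worker; returns a dict keyed by worker address.
hostnames = client.run(socket.gethostname)
print(hostnames)

# Arguments are passed through, so the question's writer function can run everywhere too.
client.run(writer, '/data/1.txt', ['some', 'lines'])

# The workers= keyword restricts the call to specific workers if needed.
client.run(writer, '/data/1.txt', ['some', 'lines'], workers=['w1', 'w2'])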

Is there any fast and efficient way to get abstracts from pubmed?

I would like to download large scientific abstract data for, let's say, about 2000 PubMed IDs. My Python code is sloppy and seems to run rather slowly. Is there any fast and efficient method to harvest these abstracts?
If this is the fastest method, how do I measure it so that I can compare it against others, or compare my home situation against my work situation (a different ISP may play a part in speed)?
My code is attached below.
import sqlite3
from Bio.Entrez import read, efetch, email, tool
from metapub import PubMedFetcher
import pandas as pd
import requests
from datetime import date
import xml.etree.ElementTree as ET
import time
import sys

reload(sys)
sys.setdefaultencoding('utf8')

Abstract_data = pd.DataFrame(columns=["name", "pmid", "abstract"])

def abstract_download(self, dict_pmids):
    """
    This method returns abstract for a given pmid and add to the abstract data
    """
    index = 0
    baseUrl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    for names in dict_pmids:
        for pmid in dict_pmids[names]:
            try:
                abstract = []
                url = baseUrl+"efetch.fcgi?db=pubmed&id="+pmid+"&rettype=xml"+
                response = requests.request("GET", url, timeout=500).text
                response = response.encode('utf-8')
                root = ET.fromstring(response)
                root_find = root.findall('./PubmedArticle/MedlineCitation/Article/Abstract/')
                if len(root_find) == 0:
                    root_find = root.findall('./PubmedArticle/MedlineCitation/Article/ArticleTitle')
                for i in range(len(root_find)):
                    if root_find[i].text != None:
                        abstract.append(root_find[i].text)
                if abstract is not None:
                    Abstract_data.loc[index] = names, pmid, "".join(abstract)
                    index += 1
            except:
                print "Connection Refused"
                time.sleep(5)
                continue
    return Abstract_data
EDIT: The general error that occurs with this script is seemingly a "Connection Refused". See the answer by ZF007 below for how this was solved.
The code below works. Your script hung on malformed URL construction. Also, whenever something went wrong inside the script, the reported error was a refused connection. That was in fact not the case; it was the code processing the retrieved data that failed. I've made some adjustments to get the code working for me and left comments in place where you need to adjust things, since I don't have the dict_pmids list.
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys, time, requests, sqlite3
import pandas as pd
import xml.etree.ElementTree as ET

from metapub import PubMedFetcher
from datetime import date
from Bio.Entrez import read, efetch, email, tool

def abstract_download(pmids):
    """
    This method returns the abstract for a given pmid and adds it to the abstract data.
    """
    index = 0
    baseUrl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    collected_abstract = []

    # code below disabled to get general abstract extraction from pubmed working. I don't have the dict_pmids list.
    """
    for names in dict_pmids:
        for pmid in dict_pmids[names]:

    move the working code below to the right to get it in place with the above two loops prior to providing the dict_pmids list.

    # from here the code works up to the next comment. I don't have the dict_pmids list.
    """

    for pmid in pmids:
        print 'pmid : %s\n' % pmid

        abstract = []
        root = ''

        try:
            url = '%sefetch.fcgi?db=pubmed&id=%s&rettype=xml' % (baseUrl, pmid)
            # check my url... a line to paste into a webbrowser like firefox.
            print 'url', url

            response = requests.request("GET", url, timeout=500).text
            # check if I got a response.
            print 'response', response

            # response = response.encode('utf-8')
            root = ET.fromstring(response)

        except Exception as inst:
            # besides a refused connection.... the "why" it failed comes in handy to resolve issues at hand
            # if and when they happen.
            print "Connection Refused", inst
            time.sleep(5)
            continue

        root_find = root.findall('./PubmedArticle/MedlineCitation/Article/Abstract/')

        if len(root_find) == 0:
            root_find = root.findall('./PubmedArticle/MedlineCitation/Article/ArticleTitle')

        # check if I found something
        print 'root_find : %s\n\n' % root_find

        for i in range(len(root_find)):
            if root_find[i].text != None:
                abstract.append(root_find[i].text)

        Abstract_data = pd.DataFrame(columns=["name", "pmid", "abstract"])

        # check if I found something
        # print 'abstract : %s\n' % abstract

        # code works up to the print statement 'abstract', abstract; the rest is disabled because I don't have the dict_pmids list.
        if abstract is not None:
            # Abstract_data.loc[index] = names, pmid, "".join(abstract)
            index += 1
            collected_abstract.append(abstract)

    # change back to return Abstract_data when the dict_pmids list is administered.
    # return Abstract_data
    return collected_abstract

if __name__ == '__main__':
    sys.stdout.flush()

    reload(sys)
    sys.setdefaultencoding('utf8')

    pubmedIDs = range(21491000, 21491001)
    mydata = abstract_download(pubmedIDs)

    print 'mydata : %s' % (mydata)
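To answer the timing part of the question: a simple way to compare this approach against alternatives (or home against work) is to wall-clock the call. A minimal sketch, assuming the abstract_download function above; the ID range is just a hypothetical sample:

import time

start = time.time()
mydata = abstract_download(range(21491000, 21491050))  # hypothetical sample of 50 IDs
elapsed = time.time() - start

print('fetched %d abstracts in %.2f seconds' % (len(mydata), elapsed))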
