Is there an example I can see of accessing the stdout of an Exec node? I am trying to parse stdout similarly to the Jenkins Groovy DSL:
tobeparsed = sh(returnStdout: true, script: "myscript.sh")
That feature is not available in Conducto. When you want to send data from one node to another, there are a few ways to do so.
In these examples let's try to generate a random number in the first node, and have the second echo it.
A simple option is to use co.data to save the data in the first node, then get it from the second node.
import conducto as co
import random

img = co.Image(copy_dir=".")


def first():
    # Generate a random number and save it to co.data,
    # scoped to the current pipeline.
    number = random.random()
    co.data.pipeline.puts("random_number", str(number).encode())
    print(f"Saved random number to co.data. Value: {number}")


def second():
    # Read the value from co.data
    stored = co.data.pipeline.gets("random_number")
    number = float(stored.decode())
    print(f"Read random number from co.data. Value: {number}")


def main() -> co.Serial:
    with co.Serial(image=img) as output:
        output["first"] = co.Exec("python pipeline.py first")
        output["second"] = co.Exec("python pipeline.py second")
    return output


if __name__ == '__main__':
    co.main(default=main)
A more sophisticated way: the first node can be called from a co.Lazy and generate the second node once it knows the value.
import conducto as co
import random

img = co.Image(copy_dir=".")


def first() -> co.Serial:
    output = co.Serial()
    # Generate a random number and encode it into the command
    # of the second node.
    output["second"] = co.Exec(f"echo {random.random()}")
    return output


def main() -> co.Serial:
    return co.Lazy("python pipeline.py first", image=img)


if __name__ == '__main__':
    co.main(default=main)
In either case, save that text to a file called pipeline.py. Run it with python pipeline.py to see the tree it would generate. Run python pipeline.py --local to run it on your machine.
There's nothing magical about the name pipeline.py; just make sure it matches the commands in the co.Exec nodes.
I am having trouble parallelizing code that reads some files and writes to neo4j.
I am using dask to parallelize the process_language_files function (3rd cell from the bottom).
I try to explain the code below, listing out the functions (First 3 cells).
The errors are printed at the end (Last 2 cells).
I am also listing environments and package versions at the end.
If I remove dask.delayed and run this code sequentially, it works perfectly well.
Thank you for your help. :)
==========================================================================
Some functions to work with neo4j.
from neo4j import GraphDatabase
from tqdm import tqdm


def get_driver(uri_scheme='bolt', host='localhost', port='7687', username='neo4j', password=''):
    """Get a neo4j driver."""
    connection_uri = "{uri_scheme}://{host}:{port}".format(uri_scheme=uri_scheme, host=host, port=port)
    auth = (username, password)
    driver = GraphDatabase.driver(connection_uri, auth=auth)
    return driver


def format_raw_res(raw_res):
    """Parse neo4j results"""
    res = []
    for r in raw_res:
        res.append(r)
    return res


def run_bulk_query(query_list, driver):
    """Run a list of neo4j queries in a session."""
    results = []
    with driver.session() as session:
        for query in tqdm(query_list):
            raw_res = session.run(query)
            res = format_raw_res(raw_res)
            results.append({'query': query, 'result': res})
    return results


global_driver = get_driver(uri_scheme='bolt', host='localhost', port='8687', username='neo4j', password='abc123')  # neo4j driver object
This is how we create a dask client to parallelize.
from dask.distributed import Client
client = Client(threads_per_worker=4, n_workers=1)
The functions that the main code is calling.
import sys
import time
import json

import pandas as pd
import dask


def add_nodes(nodes_list, language_code):
    """Returns a list of strings. Each string is a cypher query to add a node to neo4j."""
    list_of_create_strings = []
    create_string_template = """CREATE (:LABEL {{node_id:{node_id}}})"""
    for index, node in nodes_list.iterrows():
        create_string = create_string_template.format(node_id=node['new_id'])
        list_of_create_strings.append(create_string)
    return list_of_create_strings


def add_relations(relations_list, language_code):
    """Returns a list of strings. Each string is a cypher query to add a relationship to neo4j."""
    list_of_create_strings = []
    create_string_template = """
        MATCH (a),(b) WHERE a.node_id = {source} AND b.node_id = {target}
        MERGE (a)-[r:KNOWS {{ relationship_id:{edge_id} }}]-(b)"""
    for index, relations in relations_list.iterrows():
        create_string = create_string_template.format(
            source=relations['from'], target=relations['to'],
            edge_id=''+str(relations['from'])+'-'+str(relations['to']))
        list_of_create_strings.append(create_string)
    return list_of_create_strings


def add_data(language_code, edges, features, targets, driver):
    """Add nodes and relationships to neo4j"""
    add_nodes_cypher = add_nodes(targets, language_code)  # Returns a list of strings. Each string is a cypher query to add a node to neo4j.
    node_results = run_bulk_query(add_nodes_cypher, driver)  # Runs each string in the above list in a neo4j session.
    add_relations_cypher = add_relations(edges, language_code)  # Returns a list of strings. Each string is a cypher query to add a relationship to neo4j.
    relations_results = run_bulk_query(add_relations_cypher, driver)  # Runs each string in the above list in a neo4j session.

    # Saving some metadata
    results = {
        "nodes": {"results": node_results, "length": len(add_nodes_cypher)},
        "relations": {"results": relations_results, "length": len(add_relations_cypher)},
    }
    return results


def load_data(language_code):
    """Load data from files"""
    # Saving file names to variables
    edges_filename = './edges.csv'
    features_filename = './features.json'
    target_filename = './target.csv'

    # Loading data from the file names
    edges = helper.read_csv(edges_filename)
    features = helper.read_json(features_filename)
    targets = helper.read_csv(target_filename)

    # Saving some metadata
    results = {
        "edges": {"length": len(edges)},
        "features": {"length": len(features)},
        "targets": {"length": len(targets)},
    }
    return edges, features, targets, results
The main code.
def process_language_files(language_code, driver):
    """Reads files, creates cypher queries to add nodes and relationships, runs cypher query in a neo4j session."""
    edges, features, targets, reading_results = load_data(language_code)  # Read files.
    writing_results = add_data(language_code, edges, features, targets, driver)  # Convert files to nodes and relationships and add to neo4j in a neo4j session.
    return {"reading_results": reading_results, "writing_results": writing_results}  # Return some metadata


# Execution starts here
res = []
for index, language_code in enumerate(['ENGLISH', 'FRENCH']):
    lazy_result = dask.delayed(process_language_files)(language_code, global_driver)
    res.append(lazy_result)
Result from res. These are dask delayed objects.
print(*res)
Delayed('process_language_files-a73f4a9d-6ffa-4295-8803-7fe09849c068') Delayed('process_language_files-c88fbd4f-e8c1-40c0-b143-eda41a209862')
The errors. Even if I use dask.compute(), I get similar errors.
futures = dask.persist(*res)
AttributeError Traceback (most recent call last)
~/Code/miniconda3/envs/MVDS/lib/python3.6/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
48 buffers.clear()
---> 49 result = pickle.dumps(x, **dump_kwargs)
50 if len(result) < 1000:
AttributeError: Can't pickle local object 'BoltPool.open.<locals>.opener'
==========================================================================
# Name                    Version     Build          Channel
dask                      2020.12.0   pyhd8ed1ab_0   conda-forge
jupyterlab                3.0.3       pyhd8ed1ab_0   conda-forge
neo4j-python-driver       4.2.1       pyh7fcb38b_0   conda-forge
python                    3.9.1       hdb3f193_2
You are getting this error because you are trying to share the driver object among your workers.
The driver object contains private data about the connection; that data does not make sense outside the process (and is not serializable either).
It is like opening a file in one process and sharing the file descriptor with another: it won't work, because the file number only makes sense within the process that generated it.
If you want your workers to access the database or any other network resource, you should give them the directions to connect to the resource.
In your case, you should not pass global_driver as a parameter, but rather the connection parameters, and let each worker call get_driver to get its own driver.
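For example, a minimal, untested sketch of that change, reusing the get_driver, load_data, and add_data functions from the question:

import dask


def process_language_files(language_code, neo4j_params):
    """Each task builds (and closes) its own driver from plain connection parameters."""
    driver = get_driver(**neo4j_params)  # created inside the worker, so nothing unpicklable crosses processes
    try:
        edges, features, targets, reading_results = load_data(language_code)
        writing_results = add_data(language_code, edges, features, targets, driver)
    finally:
        driver.close()
    return {"reading_results": reading_results, "writing_results": writing_results}


# Plain strings and dicts pickle fine, unlike the driver object.
neo4j_params = dict(uri_scheme='bolt', host='localhost', port='8687',
                    username='neo4j', password='abc123')

res = [dask.delayed(process_language_files)(language_code, neo4j_params)
       for language_code in ['ENGLISH', 'FRENCH']]
futures = dask.persist(*res)

Each delayed call now receives only picklable arguments, which is what dask needs to ship the task to a worker.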
I would like to build many outputs based on the same input, e.g. a hex and a binary from an elf.
I will do this multiple times, in different places in the wscript, so I'd like to wrap it in a feature.
Ideally something like:
bld(features="hex", source="output.elf")
bld(features="bin", source="output.elf")
How would I go about implementing this?
If your elf files always have the same extension, you can simply use that:
# untested, naive code
from waflib import TaskGen


@TaskGen.extension('.elf')
def process_elf(self, node):  # <- self = task gen, node is the current input node
    if "bin" in self.features:
        bin_node = node.change_ext('.bin')
        self.create_task('make_bin_task', node, bin_node)
    if "hex" in self.features:
        hex_node = node.change_ext('.hex')
        self.create_task('make_hex_task', node, hex_node)
If not, you have to define the features you want, like this:
from waflib import TaskGen


@TaskGen.feature("hex", "bin")       # <- attach method to features hex AND bin
@TaskGen.before('process_source')
def transform_source(self):          # <- here self = task generator
    self.inputs = self.to_nodes(getattr(self, 'source', []))
    self.meths.remove('process_source')  # <- to disable the standard process_source


@TaskGen.feature("hex")              # <- attach method to feature hex
@TaskGen.after('transform_source')
def process_hex(self):
    for i in self.inputs:
        self.create_task("make_hex_task", i, i.change_ext(".hex"))


@TaskGen.feature("bin")              # <- attach method to feature bin
@TaskGen.after('transform_source')
def process_bin(self):
    for i in self.inputs:
        self.create_task("make_bin_task", i, i.change_ext(".bin"))
You have to write the two tasks make_hex_task and make_bin_task. You should put all this in a separate Python file and make a "plugin".
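For reference, here is a minimal sketch of what those two task classes could look like, assuming the conversion is done with objcopy and that an OBJCOPY variable was set up during configure (e.g. with conf.find_program('objcopy', var='OBJCOPY')):

from waflib import Task


class make_bin_task(Task.Task):
    # raw binary: objcopy -O binary input.elf output.bin
    run_str = '${OBJCOPY} -O binary ${SRC} ${TGT}'
    color = 'CYAN'


class make_hex_task(Task.Task):
    # Intel HEX: objcopy -O ihex input.elf output.hex
    run_str = '${OBJCOPY} -O ihex ${SRC} ${TGT}'
    color = 'CYAN'

create_task looks the task class up by name, so defining classes with these exact names is enough for the feature methods above to find them.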
You can also define a "shortcut" to call:
def build(bld):
    bld.make_bin(source="output.elf")
    bld.make_hex(source="output.elf")
    bld(features="hex bin", source="output.elf")  # when both are needed in the same place
Like this:
from waflib.Configure import conf


@conf
def make_bin(self, *k, **kw):  # <- here self = build context
    kw["features"] = "bin"     # <- you can add bin to existing features in kw
    return self(*k, **kw)


@conf
def make_hex(self, *k, **kw):
    kw["features"] = "hex"
    return self(*k, **kw)
My requirement is this: I have written a bash script which monitors telnet connectivity to several IPs and ports. It takes a CSV file as input data, reads each row in the CSV, and checks whether that IP can be reached via telnet.
However, I now need to Jenkinize it, and I am wondering whether there is a way to define my parameter in the Jenkins job with different combinations of values,
say for example:
PARAM_KEY : VAL_1
PARAM_KEY : VAL_2
PARAM_KEY : VAL_3
and so on, so that I can use PARAM_KEY in the script and the Jenkins job gets executed for all the parameters defined, i.e. once per parameter (3 times in the above case).
Can anyone guide me on this requirement?
If you mean to run one job and iterate over the IPs inside it, you can parse the CSV file inside a pipeline, or pass it as a parameter and then split it:
// example of pipeline code
node('slave80') {
    csvString = "1.1.1.1,2.2.2.2,3.3.3.3" // can be sent as a parameter
    def ips = csvString.split(',')
    ips.each { ip ->
        sh """
        ./bash_script ${ip}
        """
    }
}
I would like to download a large amount of scientific abstract data, say for about 2000 PubMed IDs. My Python code is sloppy and seems to run rather slowly. Is there any fast and efficient method to harvest these abstracts?
If this is the fastest method, how do I measure it so that I can compare it against other approaches, or against running it at home versus at work (a different ISP may play a part in the speed)?
My code is attached below.
import sqlite3
from Bio.Entrez import read,efetch,email,tool
from metapub import PubMedFetcher
import pandas as pd
import requests
from datetime import date
import xml.etree.ElementTree as ET
import time
import sys

reload(sys)
sys.setdefaultencoding('utf8')

Abstract_data = pd.DataFrame(columns=["name","pmid","abstract"])


def abstract_download(self,dict_pmids):
    """
    This method returns abstract for a given pmid and add to the abstract data
    """
    index=0
    baseUrl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    for names in dict_pmids:
        for pmid in dict_pmids[names]:
            try:
                abstract = []
                url = baseUrl+"efetch.fcgi?db=pubmed&id="+pmid+"&rettype=xml"+
                response=requests.request("GET",url,timeout=500).text
                response=response.encode('utf-8')
                root=ET.fromstring(response)
                root_find=root.findall('./PubmedArticle/MedlineCitation/Article/Abstract/')
                if len(root_find)==0:
                    root_find=root.findall('./PubmedArticle/MedlineCitation/Article/ArticleTitle')
                for i in range(len(root_find)):
                    if root_find[i].text != None:
                        abstract.append(root_find[i].text)
                if abstract is not None:
                    Abstract_data.loc[index]=names,pmid,"".join(abstract)
                    index+=1
            except:
                print "Connection Refused"
                time.sleep(5)
                continue
    return Abstract_data
EDIT: The general error that occurs for this script is seemingly a "Connection Refused". See the answer by ZF007 below for how this was solved.
The code below works. Your script hung on malformed URL construction. Also, if anything went wrong inside the script, it was reported as a refused connection. That was in fact not the case: it was the code that processed the retrieved data that failed. I've made some adjustments to get the code working for me and left comments in place where you need to adjust, due to the lack of the dict_pmids list.
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys, time, requests, sqlite3
import pandas as pd
import xml.etree.ElementTree as ET
from metapub import PubMedFetcher
from datetime import date
from Bio.Entrez import read, efetch, email, tool


def abstract_download(pmids):
    """
    This method returns the abstract for a given pmid and adds it to the abstract data.
    """
    index = 0
    baseUrl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    collected_abstract = []

    # Code below disabled to get general abstract extraction from PubMed working. I don't have the dict_pmids list.
    """
    for names in dict_pmids:
        for pmid in dict_pmids[names]:

    Move the working code below to the right to get it in place with the two requirements above, prior to providing the dict_pmids list.
    # From here the code works up to the next comment. I don't have the dict_pmids list.
    """

    for pmid in pmids:
        print 'pmid : %s\n' % pmid

        abstract = []
        root = ''

        try:
            url = '%sefetch.fcgi?db=pubmed&id=%s&rettype=xml' % (baseUrl, pmid)
            # Check my url... line to paste into a web browser like Firefox.
            print 'url', url

            response = requests.request("GET", url, timeout=500).text
            # Check if I got a response.
            print 'response', response

            # response = response.encode('utf-8')
            root = ET.fromstring(response)

        except Exception as inst:
            # Besides a refused connection... the "why" behind the failure comes in handy to resolve issues at hand
            # if and when they happen.
            print "Connection Refused", inst
            time.sleep(5)
            continue

        root_find = root.findall('./PubmedArticle/MedlineCitation/Article/Abstract/')
        if len(root_find) == 0:
            root_find = root.findall('./PubmedArticle/MedlineCitation/Article/ArticleTitle')

        # Check if I found something.
        print 'root_find : %s\n\n' % root_find

        for i in range(len(root_find)):
            if root_find[i].text != None:
                abstract.append(root_find[i].text)

        Abstract_data = pd.DataFrame(columns=["name", "pmid", "abstract"])

        # Check if I found something.
        # print 'abstract : %s\n' % abstract

        # Code works up to the print statement 'abstract', abstract; the rest is disabled because I don't have the dict_pmids list.
        if abstract is not None:
            # Abstract_data.loc[index] = names, pmid, "".join(abstract)
            index += 1
            collected_abstract.append(abstract)

    # Change back to "return Abstract_data" when the dict_pmids list is supplied.
    # return Abstract_data
    return collected_abstract


if __name__ == '__main__':
    sys.stdout.flush()
    reload(sys)
    sys.setdefaultencoding('utf8')

    pubmedIDs = range(21491000, 21491001)
    mydata = abstract_download(pubmedIDs)
    print 'mydata : %s' % (mydata)
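As for measuring which setup is faster (home vs. work, or one approach against another), a rough but serviceable option is to time the whole run and divide by the number of IDs; a minimal sketch around the call above:

import time

start = time.time()
mydata = abstract_download(pubmedIDs)
elapsed = time.time() - start
# Seconds per ID is easier to compare across runs of different sizes.
print("fetched %d abstracts in %.2f s (%.2f s per ID)" % (len(mydata), elapsed, elapsed / len(pubmedIDs)))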
I use the following code to temporarily modify environment variables.
import os
from contextlib import contextmanager


@contextmanager
def _setenv(**mapping):
    """``with`` context to temporarily modify the environment variables"""
    backup_values = {}
    backup_remove = set()
    for key, value in mapping.items():
        if key in os.environ:
            backup_values[key] = os.environ[key]
        else:
            backup_remove.add(key)
        os.environ[key] = value

    try:
        yield
    finally:
        # restore old environment
        for k, v in backup_values.items():
            os.environ[k] = v
        for k in backup_remove:
            del os.environ[k]
This with context is mainly used in test cases. For example,
def test_myapp_respects_this_envvar():
    with _setenv(MYAPP_PLUGINS_DIR='testsandbox/plugins'):
        myapp.plugins.register()
        [...]
My question: is there a simple/elegant way to write _setenv? I thought about doing backup = os.environ.copy() and then os.environ = backup, but I am not sure whether that would affect the program's behavior (e.g. if os.environ is referenced elsewhere in the Python interpreter).
I suggest the following implementation:
import contextlib
import os


@contextlib.contextmanager
def set_env(**environ):
    """
    Temporarily set the process environment variables.

    >>> with set_env(PLUGINS_DIR=u'test/plugins'):
    ...     "PLUGINS_DIR" in os.environ
    True

    >>> "PLUGINS_DIR" in os.environ
    False

    :type environ: dict[str, unicode]
    :param environ: Environment variables to set
    """
    old_environ = dict(os.environ)
    os.environ.update(environ)
    try:
        yield
    finally:
        os.environ.clear()
        os.environ.update(old_environ)
EDIT: more advanced implementation
The context manager below can be used to add/remove/update your environment variables:
import contextlib
import os


@contextlib.contextmanager
def modified_environ(*remove, **update):
    """
    Temporarily updates the ``os.environ`` dictionary in-place.

    The ``os.environ`` dictionary is updated in-place so that the modification
    is sure to work in all situations.

    :param remove: Environment variables to remove.
    :param update: Dictionary of environment variables and values to add/update.
    """
    env = os.environ
    update = update or {}
    remove = remove or []

    # List of environment variables being updated or removed.
    stomped = (set(update.keys()) | set(remove)) & set(env.keys())
    # Environment variables and values to restore on exit.
    update_after = {k: env[k] for k in stomped}
    # Environment variables and values to remove on exit.
    remove_after = frozenset(k for k in update if k not in env)

    try:
        env.update(update)
        [env.pop(k, None) for k in remove]
        yield
    finally:
        env.update(update_after)
        [env.pop(k) for k in remove_after]
Usage examples:
>>> with modified_environ('HOME', LD_LIBRARY_PATH='/my/path/to/lib'):
... home = os.environ.get('HOME')
... path = os.environ.get("LD_LIBRARY_PATH")
>>> home is None
True
>>> path
'/my/path/to/lib'
>>> home = os.environ.get('HOME')
>>> path = os.environ.get("LD_LIBRARY_PATH")
>>> home is None
False
>>> path is None
True
EDIT2
A demonstration of this context manager is available on GitHub.
_environ = dict(os.environ)  # or os.environ.copy()
try:
    ...
finally:
    os.environ.clear()
    os.environ.update(_environ)
I was looking to do the same thing, but for unit testing. Here is how I have done it using the unittest.mock.patch function:
def test_function_with_different_env_variable(self):
    with mock.patch.dict('os.environ', {'hello': 'world'}, clear=True):
        self.assertEqual(os.environ.get('hello'), 'world')
        self.assertEqual(len(os.environ), 1)
Basically, by using unittest.mock.patch.dict with clear=True, we make os.environ a dictionary containing solely {'hello': 'world'}.
Removing clear=True will keep the original os.environ and just add/replace the specified key/value pairs from {'hello': 'world'}.
Removing {'hello': 'world'} will just create an empty dictionary; os.environ will thus be empty within the with block.
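For example, a small sketch of the same idea without clear=True (the class and test names here are hypothetical): the original environment is kept and only 'hello' is added for the duration of the block.

import os
import unittest
from unittest import mock


class EnvPatchTest(unittest.TestCase):
    def test_adds_hello_without_clearing(self):
        existed_before = 'hello' in os.environ
        with mock.patch.dict('os.environ', {'hello': 'world'}):
            # 'hello' is set; everything else in os.environ is untouched.
            self.assertEqual(os.environ.get('hello'), 'world')
            self.assertGreaterEqual(len(os.environ), 1)
        # After the block the patch is undone: 'hello' is present again only
        # if it already existed in the original environment.
        self.assertEqual('hello' in os.environ, existed_before)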
In pytest you can temporarily set an environment variable using the monkeypatch fixture. See the docs for details. I've copied a snippet here for your convenience.
import os

import pytest
from typing import Any, NewType

# Alias for the ``type`` of monkeypatch fixture.
MonkeyPatchFixture = NewType("MonkeyPatchFixture", Any)


# This is the function we will test below to demonstrate the ``monkeypatch`` fixture.
def get_lowercase_env_var(env_var_name: str) -> str:
    """
    Return the value of an environment variable. Variable value is made all lowercase.

    :param env_var_name:
        The name of the environment variable to return.

    :return:
        The value of the environment variable, with all letters in lowercase.
    """
    env_variable_value = os.environ[env_var_name]
    lowercase_env_variable = env_variable_value.lower()
    return lowercase_env_variable


def test_get_lowercase_env_var(monkeypatch: MonkeyPatchFixture) -> None:
    """
    Test that the function under test indeed returns the lowercase-ified
    form of ENV_VAR_UNDER_TEST.
    """
    name_of_env_var_under_test = "ENV_VAR_UNDER_TEST"
    env_var_value_under_test = "EnvVarValue"
    expected_result = "envvarvalue"

    # KeyError because ``ENV_VAR_UNDER_TEST`` was looked up in the os.environ dictionary before its value was set by ``monkeypatch``.
    with pytest.raises(KeyError):
        assert get_lowercase_env_var(name_of_env_var_under_test) == expected_result

    # Temporarily set the environment variable's value.
    monkeypatch.setenv(name_of_env_var_under_test, env_var_value_under_test)
    assert get_lowercase_env_var(name_of_env_var_under_test) == expected_result


def test_get_lowercase_env_var_fails(monkeypatch: MonkeyPatchFixture) -> None:
    """
    This demonstrates that ENV_VAR_UNDER_TEST is reset in every test function.
    """
    env_var_name_under_test = "ENV_VAR_UNDER_TEST"
    expected_result = "envvarvalue"

    with pytest.raises(KeyError):
        assert get_lowercase_env_var(env_var_name_under_test) == expected_result
For unit testing I prefer using a decorator function with optional parameters. This way I can use the modified environment values for a whole test function. The decorator below also restores the original environment values in case the function raises an Exception:
import os


def patch_environ(new_environ=None, clear_orig=False):
    if not new_environ:
        new_environ = dict()

    def actual_decorator(func):
        from functools import wraps

        @wraps(func)
        def wrapper(*args, **kwargs):
            original_env = dict(os.environ)

            if clear_orig:
                os.environ.clear()

            os.environ.update(new_environ)
            try:
                result = func(*args, **kwargs)
            except:
                raise
            finally:  # restore even if Exception was raised
                os.environ = original_env

            return result

        return wrapper

    return actual_decorator
Usage in unit tests:
import os
import unittest


class Something:
    @staticmethod
    def print_home():
        home = os.environ.get('HOME', 'unknown')
        print("HOME = {0}".format(home))


class SomethingTest(unittest.TestCase):
    @patch_environ({'HOME': '/tmp/test'})
    def test_environ_based_something(self):
        Something.print_home()  # prints: HOME = /tmp/test


unittest.main()