Using Dask to read files and write to Neo4j in Python

I am having trouble parallelizing code that reads some files and writes to neo4j.
I am using dask to parallelize the process_language_files function (3rd cell from the bottom).
I explain the code below, listing out the functions (first 3 cells).
The errors are printed at the end (last 2 cells).
I am also listing environments and package versions at the end.
If I remove dask.delayed and run this code sequentially, it works perfectly well.
Thank you for your help. :)
==========================================================================
Some functions to work with neo4j.
from neo4j import GraphDatabase
from tqdm import tqdm
def get_driver(uri_scheme='bolt', host='localhost', port='7687', username='neo4j', password=''):
    """Get a neo4j driver."""
    connection_uri = "{uri_scheme}://{host}:{port}".format(uri_scheme=uri_scheme, host=host, port=port)
    auth = (username, password)
    driver = GraphDatabase.driver(connection_uri, auth=auth)
    return driver

def format_raw_res(raw_res):
    """Parse neo4j results."""
    res = []
    for r in raw_res:
        res.append(r)
    return res

def run_bulk_query(query_list, driver):
    """Run a list of neo4j queries in a session."""
    results = []
    with driver.session() as session:
        for query in tqdm(query_list):
            raw_res = session.run(query)
            res = format_raw_res(raw_res)
            results.append({'query': query, 'result': res})
    return results
global_driver = get_driver(uri_scheme='bolt', host='localhost', port='8687', username='neo4j', password='abc123')  # neo4j driver object.
This is how we create a dask client to parallelize.
from dask.distributed import Client
client = Client(threads_per_worker=4, n_workers=1)
The functions that the main code is calling.
import sys
import time
import json
import pandas as pd
import dask
def add_nodes(nodes_list, language_code):
    """Returns a list of strings. Each string is a cypher query to add a node to neo4j."""
    list_of_create_strings = []
    create_string_template = """CREATE (:LABEL {{node_id:{node_id}}})"""
    for index, node in nodes_list.iterrows():
        create_string = create_string_template.format(node_id=node['new_id'])
        list_of_create_strings.append(create_string)
    return list_of_create_strings
def add_relations(relations_list, language_code):
    """Returns a list of strings. Each string is a cypher query to add a relationship to neo4j."""
    list_of_create_strings = []
    create_string_template = """
        MATCH (a),(b) WHERE a.node_id = {source} AND b.node_id = {target}
        MERGE (a)-[r:KNOWS {{ relationship_id:{edge_id} }}]-(b)"""
    for index, relations in relations_list.iterrows():
        create_string = create_string_template.format(
            source=relations['from'], target=relations['to'],
            edge_id='' + str(relations['from']) + '-' + str(relations['to']))
        list_of_create_strings.append(create_string)
    return list_of_create_strings
def add_data(language_code, edges, features, targets, driver):
    """Add nodes and relationships to neo4j."""
    add_nodes_cypher = add_nodes(targets, language_code)  # Returns a list of strings. Each string is a cypher query to add a node to neo4j.
    node_results = run_bulk_query(add_nodes_cypher, driver)  # Runs each string in the above list in a neo4j session.
    add_relations_cypher = add_relations(edges, language_code)  # Returns a list of strings. Each string is a cypher query to add a relationship to neo4j.
    relations_results = run_bulk_query(add_relations_cypher, driver)  # Runs each string in the above list in a neo4j session.
    # Saving some metadata
    results = {
        "nodes": {"results": node_results, "length": len(add_nodes_cypher)},
        "relations": {"results": relations_results, "length": len(add_relations_cypher)},
    }
    return results
def load_data(language_code):
    """Load data from files."""
    # Saving file names to variables
    edges_filename = './edges.csv'
    features_filename = './features.json'
    target_filename = './target.csv'
    # Loading data from the file names
    edges = helper.read_csv(edges_filename)
    features = helper.read_json(features_filename)
    targets = helper.read_csv(target_filename)
    # Saving some metadata
    results = {
        "edges": {"length": len(edges)},
        "features": {"length": len(features)},
        "targets": {"length": len(targets)},
    }
    return edges, features, targets, results
The main code.
def process_language_files(language_code, driver):
    """Reads files, creates cypher queries to add nodes and relationships, runs the cypher queries in a neo4j session."""
    edges, features, targets, reading_results = load_data(language_code)  # Read files.
    writing_results = add_data(language_code, edges, features, targets, driver)  # Convert files to nodes and relationships and add them to neo4j in a neo4j session.
    return {"reading_results": reading_results, "writing_results": writing_results}  # Return some metadata.
# Execution starts here
res = []
for index, language_code in enumerate(['ENGLISH', 'FRENCH']):
    lazy_result = dask.delayed(process_language_files)(language_code, global_driver)
    res.append(lazy_result)
Result from res. These are dask delayed objects.
print(*res)
Delayed('process_language_files-a73f4a9d-6ffa-4295-8803-7fe09849c068') Delayed('process_language_files-c88fbd4f-e8c1-40c0-b143-eda41a209862')
The errors. Even if I use dask.compute(), I get similar errors.
futures = dask.persist(*res)
AttributeError                            Traceback (most recent call last)
~/Code/miniconda3/envs/MVDS/lib/python3.6/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
     48             buffers.clear()
---> 49             result = pickle.dumps(x, **dump_kwargs)
     50             if len(result) < 1000:

AttributeError: Can't pickle local object 'BoltPool.open.<locals>.opener'
==========================================================================
# Name                  Version      Build          Channel
dask                    2020.12.0    pyhd8ed1ab_0   conda-forge
jupyterlab              3.0.3        pyhd8ed1ab_0   conda-forge
neo4j-python-driver     4.2.1        pyh7fcb38b_0   conda-forge
python                  3.9.1        hdb3f193_2

You are getting this error because you are trying to share the driver object amongst your workers.
The driver object contains private data about the connection, data that does not make sense outside the process that created it (and that is not serializable).
It is like opening a file in one process and sharing the file descriptor with another process: it won't work, because the file number only makes sense within the process that generated it.
If you want your workers to access the database, or any other network resource, you should give them the directions to connect to the resource.
In your case, you should not pass global_driver as a parameter, but rather the connection parameters, and let each worker call get_driver to obtain its own driver.
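For example, here is a minimal sketch of that change (it reuses the get_driver, load_data and add_data functions defined above; the connection values are copied from the question and should be adjusted to your setup):

import dask

def process_language_files(language_code, connection_params):
    # Each worker builds its own driver from plain, picklable connection parameters.
    driver = get_driver(**connection_params)
    try:
        edges, features, targets, reading_results = load_data(language_code)
        writing_results = add_data(language_code, edges, features, targets, driver)
    finally:
        driver.close()
    return {"reading_results": reading_results, "writing_results": writing_results}

connection_params = {'uri_scheme': 'bolt', 'host': 'localhost', 'port': '8687',
                     'username': 'neo4j', 'password': 'abc123'}

res = [dask.delayed(process_language_files)(code, connection_params)
       for code in ['ENGLISH', 'FRENCH']]
results = dask.compute(*res)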

Related

Dask - diagnostics dashboard - custom info about task

I'm using Dask to schedule and run research batches.
Those mostly produce side effects and are quite heavy (ranging from a few minutes to a couple of hours). There's no communication between the tasks.
In code it looks like this. First, I pass all the batches to process:
def process_batches(batches: Iterator[Batch], log_dir: Path):
    cluster = LocalCluster(
        n_workers=os.cpu_count(),
        threads_per_worker=1
    )
    client = Client(cluster)
    futures = []
    for batch in batches:
        futures += process_batch(batch, client, log_dir)
    progress(futures)
Then I'm submitting repetitions from each batch as tasks:
def process_batch(batch: Batch, client: Client, log_dir: Path) -> List[Future]:
    batch_dir = log_dir.joinpath(batch.nice_hash)
    batch_futures = []
    num_workers = len(client.scheduler_info()['workers'])
    with Logger(batch_dir, clear_dir=True) as logger:
        logger.save_json(batch.as_dict, 'batch')
        for repetition in range(batch.n_repeats):
            cpu_index = repetition % num_workers
            future = client.submit(
                process_batch_repetition,
                batch,
                repetition,
                cpu_index,
                logger
            )
            batch_futures.append(future)
    return batch_futures
Is there any way to pass some custom info about the submitted task to the dashboard?
All I'm seeing are tasks named process_batch_repetition. Could I replace that with a custom string, so I can see which batch configurations are being processed at the moment?
Got an answer from Dask's BDFL mrocklin.
You can use the key= keyword to specify a key for the future. This should be unique per future. Dask will use the prefix of the key name to determine how it is rendered on the dashboard. See the docstring for dask.utils.key_split for examples on how a key prefix is generated from a key.
So you can use it like this:
future = client.submit(
    process_batch_repetition,
    batch,
    repetition,
    cpu_index,
    logger,
    key=f'{str(batch)}_repetition_{repetition}'
)
You just pass a unique string for this task. There are some forbidden characters (e.g. spaces), so expect key errors if you use them.
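For reference, a quick sketch of how dask.utils.key_split derives the dashboard label from a key (the example keys are made up; outputs shown as comments):

from dask.utils import key_split

key_split('x-1-2-3')                            # -> 'x'
key_split('process_batch_repetition-a73f4a9d')  # -> 'process_batch_repetition'
# A custom key without '-' separators keeps its full name on the dashboard:
key_split('batch_A_repetition_3')               # -> 'batch_A_repetition_3'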

Dask opportunistic caching in custom graphs

I have a custom DAG such as:
dag = {'load': (load, 'myfile.txt'),
       'heavy_comp': (heavy_comp, 'load'),
       'simple_comp_1': (sc_1, 'heavy_comp'),
       'simple_comp_2': (sc_2, 'heavy_comp'),
       'simple_comp_3': (sc_3, 'heavy_comp')}
And I'm looking to compute the keys simple_comp_1, simple_comp_2, and simple_comp_3, which I perform as follows,
import dask
from dask.distributed import Client
from dask_yarn import YarnCluster
task_1 = dask.get(dag, 'simple_comp_1')
task_2 = dask.get(dag, 'simple_comp_2')
task_3 = dask.get(dag, 'simple_comp_3')
tasks = [task_1, task_2, task_3]
cluster = YarnCluster()
cluster.scale(3)
client = Client(cluster)
dask.compute(tasks)
cluster.shutdown()
It seems that, without caching, computing these 3 keys will lead to heavy_comp being computed 3 times as well. Since this is a heavy computation, I tried to implement opportunistic caching from here as follows:
from dask.cache import Cache
cache = Cache(2e9)
cache.register()
However, when I tried to print the results of what was being cached I got nothing:
>>> cache.cache.data
[]
>>> cache.cache.heap.heap
{}
>>> cache.cache.nbytes
{}
I even tried increasing the cache size to 6GB, however to no effect. Am I doing something wrong? How can I get Dask to cache the result of the key heavy_comp?
Expanding on MRocklin's answer, and to format the code from the comments below the question:
Computing the entire graph at once works as you would expect it to. heavy_comp would only be executed once, which is what you want. Consider the following code you provided in the comments, completed with stub function definitions:
def load(fn):
    print('load')
    return fn

def sc_1(i):
    print('sc_1')
    return i

def sc_2(i):
    print('sc_2')
    return i

def sc_3(i):
    print('sc_3')
    return i

def heavy_comp(i):
    print('heavy_comp')
    return i

def merge(*args):
    print('merge')
    return args

dag = {'load': (load, 'myfile.txt'),
       'heavy_comp': (heavy_comp, 'load'),
       'simple_comp_1': (sc_1, 'heavy_comp'),
       'simple_comp_2': (sc_2, 'heavy_comp'),
       'simple_comp_3': (sc_3, 'heavy_comp'),
       'merger_comp': (merge, 'sc_1', 'sc_2', 'sc_3')}

import dask

result = dask.get(dag, 'merger_comp')
print('result:', result)
It outputs:
load
heavy_comp
sc_1
sc_2
sc_3
merge
result: ('sc_1', 'sc_2', 'sc_3')
As you can see, "heavy_comp" is only printed once, showing that the function heavy_comp has only been executed once.
The opportunistic cache in the core Dask library only works for the single-machine scheduler, not the distributed scheduler.
However, if you just compute the entire graph at once, Dask will hold onto intermediate values intelligently. If there are values that you would like to hold onto regardless, you might also look at the persist function.
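For instance, a minimal sketch of computing the whole graph in one call on the distributed scheduler (assuming the dag and stub functions above and a reachable cluster), so that heavy_comp runs only once:

from dask.distributed import Client

client = Client()  # or Client(cluster) with a YarnCluster, as in the question
# Ask for all three downstream keys in a single call; the shared dependency
# 'heavy_comp' is computed once and its result is reused for each of them.
results = client.get(dag, ['simple_comp_1', 'simple_comp_2', 'simple_comp_3'])
print(results)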

Retrained inception_v3 model deployed in Cloud ML Engine always outputs the same predictions

I followed the codelab TensorFlow For Poets for transfer learning using inception_v3. It generates retrained_graph.pb and retrained_labels.txt files, which can be used to make predictions locally (by running label_image.py).
Then, I wanted to deploy this model to Cloud ML Engine, so that I could make online predictions. For that, I had to export the retrained_graph.pb to SavedModel format. I managed to do it by following the indications in this answer from Google's @rhaertel80 and this python file from the Flowers Cloud ML Engine Tutorial. Here is my code:
import tensorflow as tf
from tensorflow.contrib import layers
from tensorflow.python.saved_model import builder as saved_model_builder
from tensorflow.python.saved_model import signature_constants
from tensorflow.python.saved_model import signature_def_utils
from tensorflow.python.saved_model import tag_constants
from tensorflow.python.saved_model import utils as saved_model_utils
export_dir = '../tf_files/saved7'
retrained_graph = '../tf_files/retrained_graph2.pb'
label_count = 5
def build_signature(inputs, outputs):
    signature_inputs = {key: saved_model_utils.build_tensor_info(tensor) for key, tensor in inputs.items()}
    signature_outputs = {key: saved_model_utils.build_tensor_info(tensor) for key, tensor in outputs.items()}
    signature_def = signature_def_utils.build_signature_def(
        signature_inputs,
        signature_outputs,
        signature_constants.PREDICT_METHOD_NAME
    )
    return signature_def

class GraphReferences(object):
    def __init__(self):
        self.examples = None
        self.train = None
        self.global_step = None
        self.metric_updates = []
        self.metric_values = []
        self.keys = None
        self.predictions = []
        self.input_jpeg = None

class Model(object):
    def __init__(self, label_count):
        self.label_count = label_count

    def build_image_str_tensor(self):
        image_str_tensor = tf.placeholder(tf.string, shape=[None])

        def decode_and_resize(image_str_tensor):
            return image_str_tensor

        image = tf.map_fn(
            decode_and_resize,
            image_str_tensor,
            back_prop=False,
            dtype=tf.string
        )
        return image_str_tensor

    def build_prediction_graph(self, g):
        tensors = GraphReferences()
        tensors.examples = tf.placeholder(tf.string, name='input', shape=(None,))
        tensors.input_jpeg = self.build_image_str_tensor()
        keys_placeholder = tf.placeholder(tf.string, shape=[None])
        inputs = {
            'key': keys_placeholder,
            'image_bytes': tensors.input_jpeg
        }
        keys = tf.identity(keys_placeholder)
        outputs = {
            'key': keys,
            'prediction': g.get_tensor_by_name('final_result:0')
        }
        return inputs, outputs

    def export(self, output_dir):
        with tf.Session(graph=tf.Graph()) as sess:
            with tf.gfile.GFile(retrained_graph, "rb") as f:
                graph_def = tf.GraphDef()
                graph_def.ParseFromString(f.read())
                tf.import_graph_def(graph_def, name="")
            g = tf.get_default_graph()
            inputs, outputs = self.build_prediction_graph(g)
            signature_def = build_signature(inputs=inputs, outputs=outputs)
            signature_def_map = {
                signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature_def
            }
            builder = saved_model_builder.SavedModelBuilder(output_dir)
            builder.add_meta_graph_and_variables(
                sess,
                tags=[tag_constants.SERVING],
                signature_def_map=signature_def_map
            )
            builder.save()

model = Model(label_count)
model.export(export_dir)
This code generates a saved_model.pb file, which I then used to create the Cloud ML Engine model. I can get predictions from this model using gcloud ml-engine predict --model my_model_name --json-instances request.json, where the contents of request.json are:
{ "key": "0", "image_bytes": { "b64": "jpeg_image_base64_encoded" } }
However, no matter which jpeg I encode in the request, I always get the exact same wrong predictions:
Prediction output
I guess the problem is in the way the CloudML Prediction API passes the base64-encoded image bytes to the input tensor "DecodeJpeg/contents:0" of inception_v3 (the "build_image_str_tensor()" method in the previous code). Any clue on how I can solve this issue and have my locally retrained model serve correct predictions on Cloud ML Engine?
(Just to make it clear, the problem is not in retrained_graph.pb, as it makes correct predictions when I run it locally; nor is it in request.json, because the same request file worked without problems when following the Flowers Cloud ML Engine Tutorial pointed above.)
First, a general warning. The TensorFlow for Poets codelab was not written in a way that is very amenable to production serving (partly manifested by the workarounds you are having to implement). You would normally export a prediction-specific graph that doesn't contain all of the extra training ops. So while we can try and hack something together that works, extra work may be needed to productionize this graph.
The approach of your code appears to be to import one graph, add some placeholders, and then export the result. This is generally fine. However, in the code shown in the question, you are adding input placeholders without actually connecting them to anything in the imported graph. You end up with a graph containing multiple disconnected subgraphs, something like (excuse the crude diagram):
image_str_tensor [input=image_bytes] -> <nothing>
keys_placeholder [input=key] -> identity [output=key]
inception_subgraph -> final_graph [output=prediction]
By inception_subgraph I mean all of the ops that you are importing.
So image_bytes is effectively a no-op and is ignored; key gets passed through; and prediction contains the result of running the inception_subgraph; since it's not using the input you are passing, it's returning the same result every time (though I admit I actually expected an error here).
To address this problem, we would need to connect the placeholder you've created to the one that already exists in inception_subgraph to create a graph more or less like this:
image_str_tensor [input=image_bytes] -> inception_subgraph -> final_graph [output=prediction]
keys_placeholder [input=key] -> identity [output=key]
Note that image_str_tensor is going to be a batch of images, as required by the prediction service, but the inception graph's input is actually a single image. In the interest of simplicity, we're going to address this in a hacky way: we'll assume we'll be sending images one-by-one. If we ever send more than one image per request, we'll get errors. Also, batch prediction will never work.
The main change you need is the import statement, which connects the placeholder we've added to the existing input in the graph (you'll also see the code for changing the shape of the input):
Putting it all together, we get something like:
import tensorflow as tf
from tensorflow.contrib import layers

from tensorflow.python.saved_model import builder as saved_model_builder
from tensorflow.python.saved_model import signature_constants
from tensorflow.python.saved_model import signature_def_utils
from tensorflow.python.saved_model import tag_constants
from tensorflow.python.saved_model import utils as saved_model_utils

export_dir = '../tf_files/saved7'
retrained_graph = '../tf_files/retrained_graph2.pb'
label_count = 5

class Model(object):
    def __init__(self, label_count):
        self.label_count = label_count

    def build_prediction_graph(self, g):
        inputs = {
            'key': keys_placeholder,
            'image_bytes': tensors.input_jpeg
        }
        keys = tf.identity(keys_placeholder)
        outputs = {
            'key': keys,
            'prediction': g.get_tensor_by_name('final_result:0')
        }
        return inputs, outputs

    def export(self, output_dir):
        with tf.Session(graph=tf.Graph()) as sess:
            # This will be our input that accepts a batch of inputs
            image_bytes = tf.placeholder(tf.string, name='input', shape=(None,))
            # Force it to be a single input; will raise an error if we send a batch.
            coerced = tf.squeeze(image_bytes)
            # When we import the graph, we'll connect `coerced` to `DecodeJPGInput:0`
            input_map = {'DecodeJPGInput:0': coerced}

            with tf.gfile.GFile(retrained_graph, "rb") as f:
                graph_def = tf.GraphDef()
                graph_def.ParseFromString(f.read())
                tf.import_graph_def(graph_def, input_map=input_map, name="")

            keys_placeholder = tf.placeholder(tf.string, shape=[None])

            inputs = {'image_bytes': image_bytes, 'key': keys_placeholder}

            keys = tf.identity(keys_placeholder)
            outputs = {
                'key': keys,
                'prediction': tf.get_default_graph().get_tensor_by_name('final_result:0')}

            tf.saved_model.simple_save(sess, output_dir, inputs, outputs)

model = Model(label_count)
model.export(export_dir)
I believe that your error is quite simple to solve:
{ "key": "0", "image_bytes": { "b64": "jpeg_image_base64_encoded" } }
You used quotes (") to specify what, I believe, is a string. By doing that, your program is reading the literal text jpeg_image_base64_encoded instead of the actual value of the variable.
That's why you always get the same prediction.
For anyone working on deploying TensorFlow image-based models on Google Cloud ML, in particular trying to get the base64 encoding working for images (as discussed in this question), I'd recommend also having a look at the following repo that I put together. I spent a lot of time working through the deployment process and was only able to find partial information across the web and on stack overflow. This repo has a full working version of deploying a TensorFlow tf.keras model onto google cloud ML and I think it will be of help to people who are facing the same challenges I faced. Here's the github link:
https://github.com/mhwilder/tf-keras-gcloud-deployment.
The repo covers the following topics:
Training a fully convolutional tf.keras model locally (mostly just to have a model for testing the next parts)
Example code for exporting models that work with the Cloud ML Engine
Three model versions that accept different JSON input types (1. An image converted to a simple list string, 2. An image converted to a base64 encoded string, and 3. A URL that points to an image in a Google Storage bucket)
Instructions and references for general Google Cloud Platform setup
Code for preparing the input JSON files for the 3 different input types
Google Cloud ML model and version creation instructions from the console
Examples using the Google Cloud SDK to call predict on the models

How to fetch records set with a ttl of -1 in aerospike?

I have many records in Aerospike, and I want to fetch the records whose TTL is -1. Please provide a solution.
Just to clarify, setting a TTL of -1 in the client means never expire (equivalent to a default-ttl of 0 in the server's aerospike.conf file), while setting a TTL of 0 in the client means inherit the default-ttl for this namespace.
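As a quick illustration of those two client-side values (a sketch with the Python client; the namespace, set, record key and bin names are placeholders):

import aerospike

client = aerospike.client({'hosts': [('127.0.0.1', 3000)]}).connect()
key = ('test', 'foo', 'some-record')

client.put(key, {'bin1': 1}, meta={'ttl': -1})  # -1: never expire (void time 0 on the server)
client.put(key, {'bin1': 1}, meta={'ttl': 0})   # 0: inherit the namespace default-ttl
client.close()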
With Predicate Filtering:
If you're using the Java, C, C# and Go clients the easiest way to identify the records with a void time of 0 would be to use a predicate filter.
In the Java app:
Statement stmt = new Statement();
stmt.setNamespace(params.namespace);
stmt.setSetName(params.set);
stmt.setPredExp(
    PredExp.recVoidTime(),
    PredExp.integerValue(0),
    PredExp.integerEqual()
);
RecordSet rs = client.query(null, stmt);
Without Predicate Filtering:
With other clients that don't yet have predicate filtering (Python, PHP, etc), you would do it all through a stream UDF. The filtering logic would have to live inside the UDF.
ttl.lua
local function filter_ttl_zero(rec)
  local rec_ttl = record.ttl(rec)
  if rec_ttl == 0 then
    return true
  end
  return false
end

local function map_record(rec)
  local ret = map()
  for i, bin_name in ipairs(record.bin_names(rec)) do
    ret[bin_name] = rec[bin_name]
  end
  return ret
end

function get_zero_ttl_recs(stream)
  return stream : filter(filter_ttl_zero) : map(map_record)
end
In AQL:
$ aql
Aerospike Query Client
Version 3.12.0
C Client Version 4.1.4
Copyright 2012-2017 Aerospike. All rights reserved.
aql> register module './ttl.lua'
OK, 1 module added.
aql> AGGREGATE ttl.get_zero_ttl_recs() on test.foo
Alternatively, you could run the stream UDF from the client. The following example is for the Python client:
import aerospike
import pprint
config = {'hosts': [('127.0.0.1', 3000)],
'lua': {'system_path':'/usr/local/aerospike/lua/',
'user_path':'/usr/local/aerospike/usr-lua/'}}
client = aerospike.client(config).connect()
pp = pprint.PrettyPrinter(indent=2)
query = client.query('test', 'foo')
query.apply('ttl', 'get_zero_ttl_recs')
records = query.results()
# we expect a dict (map) whose keys are bin names
# each with the associated bin value
pp.pprint(records)
client.close()

Neo4J: Automatic Indexing on Batch Execution

Is it possible to import data on Neo4J using the automatic indexing feature? I'm trying to import data using BatchInserter and BatchInserterIndex like the following example:
BatchInserter inserter = BatchInserters.inserter("/home/fmagalhaes/Neo4JDatabase");
BatchInserterIndexProvider indexProvider = new LuceneBatchInserterIndexProvider(inserter);
BatchInserterIndex nodeIndex = indexProvider.nodeIndex("node_auto_index", MapUtil.stringMap("type","exact"));
BatchInserterIndex relIndex = indexProvider.relationshipIndex("relationship_auto_index", MapUtil.stringMap("type","exact"));
...
inserter.createNode(vertexId, properties);
nodeIndex.add(vertexId, properties);
...
The problem is that when batch processing is completed, I'm trying to open this database with Blueprints generic API by doing the following:
Graph g = new Neo4jGraph("/home/fmagalhaes/Neo4JDatabase");
Set<String> nodeIndices = ((KeyIndexableGraph)g).getIndexedKeys(Vertex.class);
Set<String> relIndices = ((KeyIndexableGraph)g).getIndexedKeys(Edge.class);
and both nodeIndices and relIndices are empty. The auto-indexing feature is disabled when I open the graph database with the Blueprints API. Is it possible to create an automatic index during batch processing such that this index will be visible (and will continue to index data automatically as properties are added to vertices and edges) when I open the database with the Blueprints API?
You have to cleanly shut down both the batch index and the batch inserter.
You probably don't want to index all properties, just the key ones that you use to look up nodes.
You have to enable auto-indexing in the neo4j config for the database you start afterwards, and for the same properties that you indexed during batch insertion, for example:
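A sketch of the legacy auto-index settings in conf/neo4j.properties (the property name lists below are placeholders; list the same properties you indexed during the batch insert):

# conf/neo4j.properties
node_auto_indexing=true
node_keys_indexable=node_id,name
relationship_auto_indexing=true
relationship_keys_indexable=relationship_id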
