neo4j query running slowly in Python for Windows - neo4j

I'm developing locally on a Windows 10 machine using Python 2.7. I'm using Neo4j 3.0.5 with the Bolt driver for Python.
Connection string is as follows:
db = GraphDatabase.driver("bolt://localhost:7687", auth=basic_auth("USERNAME", "PASSWORD"), encrypted=False)
When running queries I'm using the following syntax:
with db.session() as s:
    with s.begin_transaction() as tx:
        results = tx.run(
            "MATCH (a:User{username:{username}}) RETURN a.username",
            {"username": username})
For some reason there is about a 2 second latency.
I ran the following tests (with slightly different syntax, but exactly the same result) and the slow elements seem to be loading the database driver and then the FIRST run of the session.
from neo4j.v1 import GraphDatabase
import timeit

start_time = 0
elapsed = 0
task = ""

def startTimer(taskIn):
    global task
    global start_time
    task = taskIn
    start_time = timeit.default_timer()

def endTimer():
    global task
    global start_time
    global elapsed
    elapsed = round(timeit.default_timer() - start_time, 3)
    print(task, elapsed)

startTimer("GraphDatabase.driver")
db = GraphDatabase.driver("bolt://localhost:7687", auth=("USERNAME", "PASSWORD"))
endTimer()

startTimer("db.session")
session = db.session()
endTimer()

startTimer("query1")
result = session.run(
    "MATCH (a:User{username:{username}}) RETURN a.username, a.password_hash",
    {"username": "Pingu"})
endTimer()

startTimer("query2")
result = session.run(
    "MATCH (a:User{username:{username}}) RETURN a.username, a.password_hash",
    {"username": "Pingu"})
endTimer()

startTimer("db.close")
session.close()
endTimer()
The results were as follows:
('GraphDatabase.driver', 1.308)
('db.session', 0.0)
('query1', 1.017)
('query2', 0.001)
('db.close', 0.009)
The string is the test step and the number is the execution time in seconds.
I'm developing a Flask API and so I can get past the database driver load time by loading it once and then referencing the loaded instance.
However, I can't seem to get past the query1 issue.
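For context, this is roughly the reuse pattern I have in mind (a minimal sketch, not my actual app; the route, query and credentials are placeholders):

from flask import Flask, jsonify
from neo4j.v1 import GraphDatabase, basic_auth

app = Flask(__name__)

# Loaded once at import time, so the ~1.3 s driver load is paid before any request arrives.
db = GraphDatabase.driver("bolt://localhost:7687",
                          auth=basic_auth("USERNAME", "PASSWORD"),
                          encrypted=False)

@app.route("/users/<username>")
def get_user(username):
    # The first session.run() after startup still pays the ~1 s "query1" cost;
    # later calls come back in about a millisecond.
    with db.session() as s:
        result = s.run("MATCH (a:User{username:{username}}) RETURN a.username",
                       {"username": username})
        names = [record["a.username"] for record in result]
    return jsonify(usernames=names)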
Running the exact same code on an Ubuntu Server VirtualBox VM runs like lightning, so this seems to be something to do with the Windows implementation.
Any ideas how this can be resolved please?
Thank you very much!

Related

Using DASK to read files and write to NEO4J in PYTHON

I am having trouble parallelizing code that reads some files and writes to neo4j.
I am using dask to parallelize the process_language_files function (3rd cell from the bottom).
I try to explain the code below, listing out the functions (First 3 cells).
The errors are printed at the end (Last 2 cells).
I am also listing environments and package versions at the end.
If I remove dask.delayed and run this code sequentially, it works perfectly well.
Thank you for your help. :)
==========================================================================
Some functions to work with neo4j.
from neo4j import GraphDatabase
from tqdm import tqdm

def get_driver(uri_scheme='bolt', host='localhost', port='7687', username='neo4j', password=''):
    """Get a neo4j driver."""
    connection_uri = "{uri_scheme}://{host}:{port}".format(uri_scheme=uri_scheme, host=host, port=port)
    auth = (username, password)
    driver = GraphDatabase.driver(connection_uri, auth=auth)
    return driver

def format_raw_res(raw_res):
    """Parse neo4j results."""
    res = []
    for r in raw_res:
        res.append(r)
    return res

def run_bulk_query(query_list, driver):
    """Run a list of neo4j queries in a session."""
    results = []
    with driver.session() as session:
        for query in tqdm(query_list):
            raw_res = session.run(query)
            res = format_raw_res(raw_res)
            results.append({'query': query, 'result': res})
    return results
global_driver = get_driver(uri_scheme='bolt', host='localhost', port='8687', username='neo4j', password='abc123')  # neo4j driver object.
This is how we create a dask client to parallelize.
from dask.distributed import Client
client = Client(threads_per_worker=4, n_workers=1)
The functions that the main code is calling.
import sys
import time
import json
import pandas as pd
import dask

def add_nodes(nodes_list, language_code):
    """Returns a list of strings. Each string is a cypher query to add a node to neo4j."""
    list_of_create_strings = []
    create_string_template = """CREATE (:LABEL {{node_id:{node_id}}})"""
    for index, node in nodes_list.iterrows():
        create_string = create_string_template.format(node_id=node['new_id'])
        list_of_create_strings.append(create_string)
    return list_of_create_strings

def add_relations(relations_list, language_code):
    """Returns a list of strings. Each string is a cypher query to add a relationship to neo4j."""
    list_of_create_strings = []
    create_string_template = """
        MATCH (a),(b) WHERE a.node_id = {source} AND b.node_id = {target}
        MERGE (a)-[r:KNOWS {{ relationship_id:{edge_id} }}]-(b)"""
    for index, relations in relations_list.iterrows():
        create_string = create_string_template.format(
            source=relations['from'], target=relations['to'],
            edge_id=''+str(relations['from'])+'-'+str(relations['to']))
        list_of_create_strings.append(create_string)
    return list_of_create_strings

def add_data(language_code, edges, features, targets, driver):
    """Add nodes and relationships to neo4j."""
    add_nodes_cypher = add_nodes(targets, language_code)  # Returns a list of cypher queries to add nodes to neo4j.
    node_results = run_bulk_query(add_nodes_cypher, driver)  # Runs each string in the above list in a neo4j session.
    add_relations_cypher = add_relations(edges, language_code)  # Returns a list of cypher queries to add relationships to neo4j.
    relations_results = run_bulk_query(add_relations_cypher, driver)  # Runs each string in the above list in a neo4j session.
    # Saving some metadata
    results = {
        "nodes": {"results": node_results, "length": len(add_nodes_cypher)},
        "relations": {"results": relations_results, "length": len(add_relations_cypher)},
    }
    return results

def load_data(language_code):
    """Load data from files."""
    # Saving file names to variables
    edges_filename = './edges.csv'
    features_filename = './features.json'
    target_filename = './target.csv'
    # Loading data from the file names
    edges = helper.read_csv(edges_filename)
    features = helper.read_json(features_filename)
    targets = helper.read_csv(target_filename)
    # Saving some metadata
    results = {
        "edges": {"length": len(edges)},
        "features": {"length": len(features)},
        "targets": {"length": len(targets)},
    }
    return edges, features, targets, results
The main code.
def process_language_files(language_code, driver):
    """Reads files, creates cypher queries to add nodes and relationships, runs cypher query in a neo4j session."""
    edges, features, targets, reading_results = load_data(language_code)  # Read files.
    writing_results = add_data(language_code, edges, features, targets, driver)  # Convert files to nodes and relationships and add to neo4j in a neo4j session.
    return {"reading_results": reading_results, "writing_results": writing_results}  # Return some metadata.

# Execution starts here
res = []
for index, language_code in enumerate(['ENGLISH', 'FRENCH']):
    lazy_result = dask.delayed(process_language_files)(language_code, global_driver)
    res.append(lazy_result)
Result from res. These are dask delayed objects.
print(*res)
Delayed('process_language_files-a73f4a9d-6ffa-4295-8803-7fe09849c068') Delayed('process_language_files-c88fbd4f-e8c1-40c0-b143-eda41a209862')
The errors. Even if I use dask.compute(), I am getting similar errors.
futures = dask.persist(*res)
AttributeError Traceback (most recent call last)
~/Code/miniconda3/envs/MVDS/lib/python3.6/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
48 buffers.clear()
---> 49 result = pickle.dumps(x, **dump_kwargs)
50 if len(result) < 1000:
AttributeError: Can't pickle local object 'BoltPool.open.<locals>.opener
==========================================================================
# Name                 Version     Build          Channel
dask                   2020.12.0   pyhd8ed1ab_0   conda-forge
jupyterlab             3.0.3       pyhd8ed1ab_0   conda-forge
neo4j-python-driver    4.2.1       pyh7fcb38b_0   conda-forge
python                 3.9.1       hdb3f193_2
You are getting this error because you are trying to share the driver object amongst your workers.
The driver object contains private data about the connection, data that does not make sense outside the process (and is not serializable).
It is like opening a file in one process and sharing the file descriptor with another process: it won't work, because the file number only makes sense within the process that created it.
If you want your workers to access the database or any other network resource, you should give them the directions to connect to the resource.
In your case, you should not pass global_driver as a parameter, but rather the connection parameters, and let each worker call get_driver to get its own driver.
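A minimal sketch of that change, reusing get_driver and process_language_files from the question (the wrapper function and the parameter dict are illustrative, not from the original code):

import dask

# Plain connection parameters are picklable, unlike the driver object itself.
neo4j_params = {'uri_scheme': 'bolt', 'host': 'localhost', 'port': '8687',
                'username': 'neo4j', 'password': 'abc123'}

def process_language_files_with_own_driver(language_code, connection_params):
    driver = get_driver(**connection_params)   # each worker builds its own driver
    try:
        return process_language_files(language_code, driver)
    finally:
        driver.close()                         # release this worker's connections

res = [dask.delayed(process_language_files_with_own_driver)(code, neo4j_params)
       for code in ['ENGLISH', 'FRENCH']]
futures = dask.persist(*res)                   # nothing unpicklable crosses to the workers now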

Increasing workers causes Dataflow job to hang on TextIO.Write - executes quickly using DirectRunner - Apache Beam

This program ingests records from a file, parses and saves the records to the database, and writes failure records to a Cloud Storage bucket. The test file I'm using only creates 3 failure records - when run locally the final step parseResults.get(failedRecords).apply("WriteFailedRecordsToGCS", TextIO.write().to(failureRecordsPath)); executes in milliseconds.
In Dataflow I am running the process with 5 workers. The process hangs indefinitely on the write step even after writing the 3 failure records successfully. I can see that it is hanging in the step WriteFailedRecordsToGCS/WriteFiles/FinalizeTempFileBundles/Reshuffle.ViaRandomKey/Pair with random key.out0
Can anyone let me know why this behaves so differently between DirectRunner and Dataflow? The whole pipeline is below.
StageUtilizationDataSourceOptions options = PipelineOptionsFactory.fromArgs(args).as(StageUtilizationDataSourceOptions.class);
final TupleTag<Utilization> parsedRecords = new TupleTag<Utilization>("parsedRecords") {};
final TupleTag<String> failedRecords = new TupleTag<String>("failedRecords") {};
DrgAnalysisDbStage drgAnalysisDbStage = new DrgAnalysisDbStage(options);
HashMap<String, Client> clientKeyMap = drgAnalysisDbStage.getClientKeys();

Pipeline pipeline = Pipeline.create(options);
PCollectionTuple parseResults = PCollectionTuple.empty(pipeline);

PCollection<String> records = pipeline.apply("ReadFromGCS", TextIO.read().from(options.getGcsFilePath()));

if (FileTypes.utilization.equalsIgnoreCase(options.getFileType())) {
    parseResults = records
            .apply("ConvertToUtilizationRecord", ParDo.of(new ParseUtilizationFile(parsedRecords, failedRecords, clientKeyMap, options.getGcsFilePath()))
            .withOutputTags(parsedRecords, TupleTagList.of(failedRecords)));
    parseResults.get(parsedRecords).apply("WriteToUtilizationStagingTable", drgAnalysisDbStage.writeUtilizationRecordsToStagingTable());
} else {
    logger.error("Unrecognized file type provided: " + options.getFileType());
}

String failureRecordsPath = Utilities.getFailureRecordsPath(options.getGcsFilePath(), options.getFileType());
parseResults.get(failedRecords).apply("WriteFailedRecordsToGCS", TextIO.write().to(failureRecordsPath));

pipeline.run().waitUntilFinish();

Parallel SQL queries

How does one run SQL queries with different column dimensions in parallel using dask? Below was my attempt:
import cx_Oracle
import pandas as pd
from dask.delayed import delayed
from dask.diagnostics import ProgressBar
import dask

ProgressBar().register()

con = cx_Oracle.connect(user="BLAH", password="BLAH", dsn="BLAH")

@delayed
def loadsql(sql):
    return pd.read_sql_query(sql, con)

results = [loadsql(x) for x in sql_to_run]
dask.compute(results)

df1 = results[0]
df2 = results[1]
df3 = results[2]
df4 = results[3]
df5 = results[4]
df6 = results[5]
However this results in the following error being thrown:
DatabaseError: Execution failed on sql: "SQL QUERY"
ORA-01013: user requested cancel of current operation
unable to rollback
and then shortly thereafter another error comes up:
MultipleInstanceError: Multiple incompatible subclass instances of TerminalInteractiveShell are being created.
sql_to_run is a list of different SQL queries.
Any suggestions or pointers?? Thanks!
Update 9.7.18
I think this is more a case of me not reading the documentation closely enough. Indeed, the con being outside the loadsql function was causing the problem. Below is the code change that seems to be working as intended now.
def loadsql(sql):
    con = cx_Oracle.connect(user="BLAH", password="BLAH", dsn="BLAH")
    result = pd.read_sql_query(sql, con)
    con.close()
    return result

values = [delayed(loadsql)(x) for x in sql_to_run]

# Multiprocessing version
import dask.multiprocessing
results = dask.compute(*values, scheduler='processes')
# My sample queries took 56.2 seconds

# Multithreaded version
import dask.threaded
results = dask.compute(*values, scheduler='threads')
# My sample queries took 51.5 seconds
My guess is that the Oracle client is not thread-safe. You could try running with processes instead (using the multiprocessing scheduler, or the distributed one) if the conn object serialises, though that may be unlikely. More likely to work would be to create the connection within loadsql, so it gets remade for each call and the different connections hopefully don't interfere with one another.
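For the process-based route, here is a minimal sketch using the distributed scheduler, assuming the per-call-connection version of loadsql from the update above and your existing sql_to_run list:

from dask.distributed import Client
from dask.delayed import delayed
import dask

# Process-based workers sidestep any thread-safety issues in the Oracle client.
client = Client(processes=True, n_workers=4, threads_per_worker=1)

values = [delayed(loadsql)(x) for x in sql_to_run]
results = dask.compute(*values)   # the Client above is picked up as the default scheduler; returns one DataFrame per query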

How to assert count of the record in couchbase in Gatling Simulation

I am trying to verify the records that are created by running a Gatling simulation test.
My scenario is this:
read JSON data from a CSV file and publish it to Kafka, where it is consumed by a microservice that stores the data in Couchbase.
Since Kafka publishes messages asynchronously, there is no way for us to know how many records were created in the database.
Is there any way in Gatling to fetch data from Couchbase and assert on it, so that the simulation fails if the number of records in Couchbase does not equal the number of requests?
val scn = scenario("Order test scenario")
  .feed(csv("TestOrder.csv").circular)
  .exec(ProducerBuilder[Array[Byte], Array[Byte]]())

setUp(scn.inject(atOnceUsers(count))).protocols(kafkaProtocol)
  //.assertion(getCouchbaseOrderCount == count) // not supported by Gatling
I have resolved this issue by using tearDown in the simulation.
Below is the tearDown code for Gatling:
after {
  println("**************** Asserting scenario *****************")
  assert(orderCount() == count)
}

def orderCount(): Int = {
  val cluster = openConnection
  val bucket: Bucket = cluster.openBucket("Order", "password");
  println(bucket)
  val query = "SELECT meta().id FROM `Order`"
  Thread.sleep(1000)
  val orderCount: Int = bucket.query(N1qlQuery.simple(query)).allRows().size();
  println(" Order Count :: " + orderCount)
  orderCount
}

Searchable index gets locked on manual update (LockObtainFailedException)

We have a Grails project that runs behind a load balancer. There are three instances of the Grails application running on the server (using separate Tomcat instances). Each instance has its own searchable index. Because the indexes are separate, the automatic update is not enough to keep the index consistent between the application instances. Because of this we have disabled the searchable index mirroring, and updates to the index are done manually in a scheduled Quartz job. According to our understanding, no other part of the application should modify the index.
The Quartz job runs once a minute and checks from the database which rows have been updated by the application, then re-indexes those objects. The job also checks whether the same job is already running, so it doesn't do any concurrent indexing. The application runs fine for a few hours after startup and then suddenly, when the job is starting, a LockObtainFailedException is thrown:
22.10.2012 11:20:40 [xxxx.ReindexJob] ERROR Could not update searchable index, class org.compass.core.engine.SearchEngineException: Failed to open writer for sub index [product]; nested exception is org.apache.lucene.store.LockObtainFailedException: Lock obtain timed out: SimpleFSLock#/home/xxx/tomcat/searchable-index/index/product/lucene-a7bbc72a49512284f5ac54f5d7d32849-write.lock
According to the log the last time the job was executed, re-indexing was done without any errors and the job finished successfully. Still, this time the re-index operation throws the locking exception, as if the previous operation was unfinished and the lock had not been released. The lock will not be released until the application is restarted.
We tried to solve the problem by manually opening the locked index, which causes the following error to be printed to the log:
22.10.2012 11:21:30 [manager.IndexWritersManager ] ERROR Illegal state, marking an index writer as open, while another is marked as open for sub index [product]
After this the job seems to work correctly and doesn't become stuck in a locked state again. However, this causes the application to constantly use 100% of the CPU. Below is a shortened version of the Quartz job code.
Any help would be appreciated to solve the problem, thanks in advance.
class ReindexJob {

    def compass
    ...

    static Calendar lastIndexed

    static triggers = {
        // Every day every minute (at xx:xx:30), start delay 2 min
        // cronExpression: "s m h D M W [Y]"
        cron name: "ReindexTrigger", cronExpression: "30 * * * * ?", startDelay: 120000
    }

    def execute() {
        if (ConcurrencyHelper.isLocked(ConcurrencyHelper.Locks.LUCENE_INDEX)) {
            log.error("Search index has been locked, not doing anything.")
            return
        }
        try {
            boolean acquiredLock = ConcurrencyHelper.lock(ConcurrencyHelper.Locks.LUCENE_INDEX, "ReindexJob")
            if (!acquiredLock) {
                log.warn("Could not lock search index, not doing anything.")
                return
            }
            Calendar reindexDate = lastIndexed
            Calendar newReindexDate = Calendar.instance
            if (!reindexDate) {
                reindexDate = Calendar.instance
                reindexDate.add(Calendar.MINUTE, -3)
                lastIndexed = reindexDate
            }
            log.debug("+++ Starting ReindexJob, last indexed ${TextHelper.formatDate("yyyy-MM-dd HH:mm:ss", reindexDate.time)} +++")
            Long start = System.currentTimeMillis()
            String reindexMessage = ""

            // Retrieve the ids of products that have been modified since the job last ran
            String productQuery = "select p.id from Product ..."
            List<Long> productIds = Product.executeQuery(productQuery, ["lastIndexedDate": reindexDate.time, "lastIndexedCalendar": reindexDate])

            if (productIds) {
                reindexMessage += "Found ${productIds.size()} product(s) to reindex. "
                final int BATCH_SIZE = 10
                Long time = TimeHelper.timer {
                    for (int inserted = 0; inserted < productIds.size(); inserted += BATCH_SIZE) {
                        log.debug("Indexing from ${inserted + 1} to ${Math.min(inserted + BATCH_SIZE, productIds.size())}: ${productIds.subList(inserted, Math.min(inserted + BATCH_SIZE, productIds.size()))}")
                        Product.reindex(productIds.subList(inserted, Math.min(inserted + BATCH_SIZE, productIds.size())))
                        Thread.sleep(250)
                    }
                }
                reindexMessage += " (${time / 1000} s). "
            } else {
                reindexMessage += "No products to reindex. "
            }

            log.debug(reindexMessage)

            // Re-index brands
            Brand.reindex()

            lastIndexed = newReindexDate
            log.debug("+++ Finished ReindexJob (${(System.currentTimeMillis() - start) / 1000} s) +++")
        } catch (Exception e) {
            log.error("Could not update searchable index, ${e.class}: ${e.message}")
            if (e instanceof org.apache.lucene.store.LockObtainFailedException || e instanceof org.compass.core.engine.SearchEngineException) {
                log.info("This is a Lucene index locking exception.")
                for (String subIndex in compass.searchEngineIndexManager.getSubIndexes()) {
                    if (compass.searchEngineIndexManager.isLocked(subIndex)) {
                        log.info("Releasing Lucene index lock for sub index ${subIndex}")
                        compass.searchEngineIndexManager.releaseLock(subIndex)
                    }
                }
            }
        } finally {
            ConcurrencyHelper.unlock(ConcurrencyHelper.Locks.LUCENE_INDEX, "ReindexJob")
        }
    }
}
Based on JMX CPU samples, it seems that Compass is doing some scheduling behind the scenes. Comparing 1-minute CPU samples from a normal instance and a 100% CPU instance, a few things stand out:
org.apache.lucene.index.IndexWriter.doWait() is using most of the CPU time.
Compass Scheduled Executor Thread is shown in the thread list, this was not seen in a normal situation.
One Compass Executor Thread is doing commitMerge, in a normal situation none of these threads was doing commitMerge.
You can try increasing the 'compass.transaction.lockTimeout' setting. The default is 10 (seconds).
Another option is to disable concurrency in Compass and make it synchronous. This is controlled with the 'compass.transaction.processor.read_committed.concurrentOperations': 'false' setting. You might also have to set 'compass.transaction.processor' to 'read_committed'.
These are the compass settings we are currently using:
compassSettings = [
    'compass.engine.optimizer.schedule.period': '300',
    'compass.engine.mergeFactor': '1000',
    'compass.engine.maxBufferedDocs': '1000',
    'compass.engine.ramBufferSize': '128',
    'compass.engine.useCompoundFile': 'false',
    'compass.transaction.processor': 'read_committed',
    'compass.transaction.processor.read_committed.concurrentOperations': 'false',
    'compass.transaction.lockTimeout': '30',
    'compass.transaction.lockPollInterval': '500',
    'compass.transaction.readCommitted.translog.connection': 'ram://'
]
This has concurrency switched off. You can make it asynchronous by changing the 'compass.transaction.processor.read_committed.concurrentOperations' setting to 'true' (or removing the entry).
Compass configuration reference:
http://static.compassframework.org/docs/latest/core-configuration.html
Documentation for the concurrency of read_committed processor:
http://www.compass-project.org/docs/latest/reference/html/core-searchengine.html#core-searchengine-transaction-read_committed
If you want to keep async operations, you can also control the number of threads it uses. Using the compass.transaction.processor.read_committed.concurrencyLevel=1 setting would allow asynchronous operations but use just one thread (the default is 5 threads). There are also the compass.transaction.processor.read_committed.backlog and compass.transaction.processor.read_committed.addTimeout settings.
I hope this helps.
