py2neo unique nodes with unique relations given timestamp - neo4j

I am trying to create a graph that stores time-based interactions between nodes. I would like the nodes to be unique and relationships between nodes to be unique given the timestamp property.
My first attempt creates 2 nodes and 1 relationship which is not what I want.
from py2neo import neo4j, node, rel
graph_db = neo4j.GraphDatabaseService()
graph_db.get_or_create_index(neo4j.Node, "node_index")
batch = neo4j.WriteBatch(graph_db)
# a TALKED_TO b at timestamp 0
batch.get_or_create_indexed_node('node_index', 'name', 'a', {'name': 'a'})
batch.get_or_create_indexed_node('node_index', 'name', 'b', {'name': 'b'})
batch.get_or_create_indexed_relationship('rel_index', 'type', 'TALKED_TO', 0, 'TALKED_TO', 1, {"timestamp": 0})
# a TALKED_TO b at timestamp 1
batch.get_or_create_indexed_node('node_index', 'name', 'a', {'name': 'a'})
batch.get_or_create_indexed_node('node_index', 'name', 'b', {'name': 'b'})
batch.get_or_create_indexed_relationship('rel_index', 'type', 'TALKED_TO', 3, 'TALKED_TO', 4, {"timestamp": 1})
# a TALKED_TO b at timestamp 2
batch.get_or_create_indexed_node('node_index', 'name', 'a', {'name': 'a'})
batch.get_or_create_indexed_node('node_index', 'name', 'b', {'name': 'b'})
batch.get_or_create_indexed_relationship('rel_index', 'type', 'TALKED_TO', 6, 'TALKED_TO', 7, {"timestamp": 0})
results = batch.submit()
print results
#[Node('http://localhost:7474/db/data/node/2'),
#Node('http://localhost:7474/db/data/node/3'),
#Relationship('http://localhost:7474/db/data/relationship/0'),
#Node('http://localhost:7474/db/data/node/2'),
#Node('http://localhost:7474/db/data/node/3'),
#Relationship('http://localhost:7474/db/data/relationship/0'),
#Node('http://localhost:7474/db/data/node/2'),
#Node('http://localhost:7474/db/data/node/3'),
#Relationship('http://localhost:7474/db/data/relationship/0')]
My second attempt creates 2 nodes and 0 relations, not sure why it fails to create any relationships.
from py2neo import neo4j, node, rel
graph_db = neo4j.GraphDatabaseService()
graph_db.get_or_create_index(neo4j.Node, "node_index")
batch = neo4j.WriteBatch(graph_db)
# a TALKED_TO b at timestamp 0
batch.get_or_create_indexed_node('node_index', 'name', 'a', {'name': 'a'})
batch.get_or_create_indexed_node('node_index', 'name', 'b', {'name': 'b'})
batch.create(rel(0, 'TALKED_TO', 1, {"timestamp": 0}))
# a TALKED_TO b at timestamp 1
batch.get_or_create_indexed_node('node_index', 'name', 'a', {'name': 'a'})
batch.get_or_create_indexed_node('node_index', 'name', 'b', {'name': 'b'})
batch.create(rel(3, 'TALKED_TO', 4, {"timestamp": 1}))
# a TALKED_TO b at timestamp 2
batch.get_or_create_indexed_node('node_index', 'name', 'a', {'name': 'a'})
batch.get_or_create_indexed_node('node_index', 'name', 'b', {'name': 'b'})
batch.create(rel(6, 'TALKED_TO', 7, {"timestamp": 0}))
results = batch.submit()
print results
#[Node('http://localhost:7474/db/data/node/2'),
#Node('http://localhost:7474/db/data/node/3'),
#None]
So how do I achieve what is depicted in the image below?

Okay, so I think I figured it out, but I'm not sure if it's efficient. Does anyone know a better way than the following?
# Create nodes a and b if they do not exist.
query = """MERGE (p:Person { name: {name} }) RETURN p"""
cypher_query = neo4j.CypherQuery(neo4j_graph, query )
result = cypher_query.execute(name='a')
result = cypher_query.execute(name='b')
# Create a relationship between a and b if it does not exist with the given timestamp value.
query = """
MATCH (a:Person {name: {a}}), (b:Person {name: {b}})
MERGE (a)-[r:TALKED_TO {timestamp: {timestamp}}]->(b)
RETURN r
"""
cypher_query = neo4j.CypherQuery(neo4j_graph, query)
result = cypher_query.execute(a='a', b='b', timestamp=0)
result = cypher_query.execute(a='a', b='b', timestamp=1)

Related

Neo4j Cypher group by a column in a list of rows for aggregation

I have the following Neo4j Cypher query:
MATCH (v:Vacancy {deleted: false})-[vv:HAS_VOTE_ON]->(c:Criterion)<-[vp:HAS_VOTE_ON]-(p:Profile {id: 703, deleted: false})
WHERE vv.avgVotesWeight > 0 AND vv.avgVotesWeight <= vp.avgVotesWeight
WITH v, p
MATCH (v)-[vv1:HAS_VOTE_ON]->(cv:Criterion)
OPTIONAL MATCH (p)-[vp1:HAS_VOTE_ON]->(cv)
WITH v.id as vacancyId, cv.id as criterionId, coalesce(vv1.`properties.skillCoefficient`, 1.0) as vacancyCriterionCoefficient, coalesce(vp1.avgVotesWeight, 0) as profileCriterionVoteWeight, coalesce(vp1.totalVotes, 0) as profileCriterionTotalVotes
RETURN vacancyId, criterionId, vacancyCriterionCoefficient, profileCriterionVoteWeight, profileCriterionTotalVotes
which returns the following values:
Now, for each Vacancy (with the same vacancyId) I need to calculate totalProfileCriterionVoteWeight (SUM) for all criteria by the following formula:
vacancyCriterionCoefficient * profileCriterionVoteWeight
For this purpose, I need to group somehow the rows by vacancyId.
Could you please show how it is possible with a Cypher here?
You can replace your last line with:
WITH distinct(vacancyId) as vacancyId, sum(vacancyCriterionCoefficient * profileCriterionVoteWeight) as totalProfileCriterionVoteWeight
RETURN vacancyId, totalProfileCriterionVoteWeight
Which, for the data shown in the picture, will return:
╒═══════════╤═════════════════════════════════╕
│"vacancyId"│"totalProfileCriterionVoteWeight"│
╞═══════════╪═════════════════════════════════╡
│704 │22 │
├───────────┼─────────────────────────────────┤
│706 │16 │
└───────────┴─────────────────────────────────┘
Explanation: distinct allows you to "group" the rows, while applying an "accumulator" to the other fields. Here we just needed to use SUM as the accumulator.
In order to test it, I used sample data:
MERGE (a:Node{vacancyId:704, criterionId: 6907, vacancyCriterionCoefficient: 1, profileCriterionVoteWeight: 1, profileCriterionTotalVotes: 1})
MERGE (b:Node{vacancyId:704, criterionId: 6909, vacancyCriterionCoefficient: 3, profileCriterionVoteWeight: 5, profileCriterionTotalVotes: 1})
MERGE (c:Node{vacancyId:704, criterionId: 6908, vacancyCriterionCoefficient: 2, profileCriterionVoteWeight: 3, profileCriterionTotalVotes: 1})
MERGE (d:Node{vacancyId:706, criterionId: 6909, vacancyCriterionCoefficient: 1, profileCriterionVoteWeight: 5, profileCriterionTotalVotes: 1})
MERGE (e:Node{vacancyId:706, criterionId: 6908, vacancyCriterionCoefficient: 3, profileCriterionVoteWeight: 3, profileCriterionTotalVotes: 1})
MERGE (f:Node{vacancyId:706, criterionId: 6907, vacancyCriterionCoefficient: 2, profileCriterionVoteWeight: 1, profileCriterionTotalVotes: 1})
And query:
MATCH (n)
WITH n.vacancyId as vacancyId, n.criterionId as criterionId, n.vacancyCriterionCoefficient as vacancyCriterionCoefficient, n.profileCriterionVoteWeight as profileCriterionVoteWeight, n.profileCriterionTotalVotes as profileCriterionTotalVotes
WITH distinct(vacancyId) as vacancyId, sum(vacancyCriterionCoefficient * profileCriterionVoteWeight) as totalProfileCriterionVoteWeight
//return vacancyId, criterionId, vacancyCriterionCoefficient, profileCriterionVoteWeight, profileCriterionTotalVotes
RETURN vacancyId, totalProfileCriterionVoteWeight
Which provides the results above.

Neo4j: Match Merge throwing Neo.ClientError.Statement.SyntaxError

I was trying to run a query in Neo4j to make a relationship between a recipe and ingredients:
MATCH (spongeCake:Cake {name: "Sponge Cake"}),
(white:Flour {name: "white"}),
(egg:Ingredient {name: "egg"}),
(butter:Ingredient {name: "butter"}),
(milk:Ingredient {name: "milk"}),
(sugar:Ingredient {name: "sugar"}),
(brown:Flour {name: "brown"}),
MERGE (spongeCake)-[r:CONTAINS {quantity: 4, unit: "medium"}]->(egg),
(spongeCake)-[r:CONTAINS {quantity: 50, unit: "grams"}]->(brown),
(spongeCake)-[r:CONTAINS {quantity: 255, unit: "grams"}]->(sugar),
(spongeCake)-[r:CONTAINS {quantity: 25, unit: "grams"}]->(milk),
(spongeCake)-[r:CONTAINS {quantity: 300, unit: "grams"}]->(white),
(spongeCake)-[r:CONTAINS {quantity: 45, unit: "grams"}]->(butter);
For some reason MERGE is giving me a lot of trouble and I am getting the following error:
Invalid input 'MERGE': expected "(", "allShortestPaths" or "shortestPath" (line 9, column 1 (offset: 250))
"MERGE (spongeCake)-[r:CONTAINS {quantity: 4, unit: "medium"}]->(egg)"
^
How can I do this correctly?
This is the syntax of what you want to achieve.
There is a comma before MERGE, so it will not work.
Too many commas in MATCH will create Cartesian products.
The commas between the MERGE patterns will not work either, so I removed them.
Learn the syntax well. Good luck!
..
MATCH (spongeCake:Cake {name: "Sponge Cake"})
MATCH (white:Flour {name: "white"})
MATCH (egg:Ingredient {name: "egg"})
MATCH (butter:Ingredient {name: "butter"})
MATCH (milk:Ingredient {name: "milk"})
MATCH (sugar:Ingredient {name: "sugar"})
MATCH (brown:Flour {name: "brown"})
MERGE (spongeCake)-[:CONTAINS {quantity: 4, unit: "medium"}]->(egg)
MERGE (spongeCake)-[:CONTAINS {quantity: 50, unit: "grams"}]->(brown)
MERGE (spongeCake)-[:CONTAINS {quantity: 255, unit: "grams"}]->(sugar)
MERGE (spongeCake)-[:CONTAINS {quantity: 25, unit: "grams"}]->(milk)
MERGE (spongeCake)-[:CONTAINS {quantity: 300, unit: "grams"}]->(white)
MERGE (spongeCake)-[:CONTAINS {quantity: 45, unit: "grams"}]->(butter)
RETURN spongeCake

multiple select take ages with snowflake

I have a table with 6M rows and it seems my query takes ages.
I try to calculate values for 2 rolling months.
Input:
Period
ID
Tag
Name
Program
Total Cost
2017-06-01
ID1
X
User1
Program 1
438
2020-12-01
ID2
A
User2
Program 2
118
2020-12-01
ID3
X
User3
Program 3
380
Wanted output:
Period
ID
Tag
Name
Program
Total Cost
Period M-1
Total Cost M-1
Period M-2
Total Cost M-2
2017-06-01
ID1
X
User1
Program 1
438
2017-05-01
372
2017-04-01
340
2020-12-01
ID2
A
User2
Program 2
118
2020-11-01
103
2020-10-01
98
2020-12-01
ID3
X
User3
Program 3
380
2020-11-01
362
2020-10-01
334
Where am I wrong? The below query is very slow.
WITH month_M AS (
SELECT "Period","ID","Tag","Name","Program","Cost USD",
DATEADD(MONTH, -1, "Period" ) AS "Period M-1 ",
DATEADD(MONTH, -2, "Period" ) AS "Period M-2"
FROM "ARROWSPHERE_PROD_DB"."PBI_SCH"."Revenue_Dashboard"
), month_M1 AS (
SELECT "Period","ID","Tag","Name","Program","Cost USD"
FROM "ARROWSPHERE_PROD_DB"."PBI_SCH"."Revenue_Dashboard"
), month_M2 AS (
SELECT "Period","ID","Tag","Name","Program","Cost USD"
FROM "ARROWSPHERE_PROD_DB"."PBI_SCH"."Revenue_Dashboard"
)
SELECT M."Period",M."ID",M."Tag",M."Name",M."Program",M."Cost USD",
M."Period M-1 ",M1."Cost USD" AS "Total Cost M-1",M."Period M-2",M2."Cost USD" AS "Total Cost M-2"
FROM month_M AS M,month_M1 AS M1, month_M2 AS M2
WHERE M."Period M-1 "=M1."Period" AND M."Period M-2"=M2."Period"
AND M."ID"=M1."ID" AND M."ID"=M2."ID"
AND M."Tag"=M1."Tag" AND M."Tag"=M2."Tag"
AND M."Name"=M1."Name" AND M."Name"=M2."Name"
AND M."Program"=M1."Program" AND M."Program"=M2."Program"
You can achieve your goal by using a Window Function like LAG, and reducing drastically your SQL code complexity and the execution plan that will perform the operation, which I guess will require one single table scan only (https://docs.snowflake.com/en/sql-reference/functions/lag.html)
CREATE OR REPLACE TEMPORARY TABLE TMP_TEST (
Period TIMESTAMP,
ID VARCHAR,
Tag VARCHAR,
Name VARCHAR,
Program VARCHAR,
TotalCost NUMERIC
);
INSERT INTO TMP_TEST
VALUES
('2020-10-01', 'ID2', 'A', 'User2', 'Program 2', 98),
('2020-11-01', 'ID2', 'A', 'User2', 'Program 2', 103),
('2020-12-01', 'ID2', 'A', 'User2', 'Program 2', 118),
('2020-10-01', 'ID3', 'X', 'User3', 'Program 3', 334),
('2020-11-01', 'ID3', 'X', 'User3', 'Program 3', 362),
('2020-12-01', 'ID3', 'X', 'User3', 'Program 3', 380);
SELECT * ,
DATEADD(MONTH, -1, Period) AS "Period M-1",
LAG(TotalCost, 1, 0) OVER (PARTITION BY Id, Tag, Name ORDER BY Period) AS "TotalCost M-1",
DATEADD(MONTH, -2, Period) AS "Period M-2",
LAG(TotalCost, 2, 0) OVER (PARTITION BY Id, Tag, Name ORDER BY Period) AS "TotalCost M-2"
FROM TMP_TEST
ORDER BY Id, Tag, Name, Period;
This is valid SQL so it's not "wrong", but since there are no predicates Snowflake must do a full table scan of 6e6 records, do processing and return about as many rows... which is a lot of work to do.
If you can't just temporarily use a bigger warehouse, then you will have to dig into the Query Profile to find the bottleneck by clicking the query_id and then the "Profile" tab from the Worksheet UI.
First look at the Profile Overview and look at the breakdown of Remote IO to Processing.
You can reduce Remote IO by selecting fewer columns (if possible) or by using a predicate (like 1 year at a time, or users that start with X, or something... you may have to experiment.) You can click on a step to see how much was able to be pruned.
You can reduce processing by doing less :) which won't be easy but you could try a left join (example below) or a window query.
WITH rev_dash as (select $1 "Period", $2 "ID", $3 "Tag", $4 "Name", $5 "Program", $6 "Cost USD" from values
('2017-06-01', 'ID1', 'X', 'User1', 'Program 1', '438'),
('2020-12-01', 'ID2', 'A', 'User2', 'Program 2', '118'),
('2020-12-01', 'ID3', 'X', 'User3', 'Program 3', '380'),
('2017-05-01', 'ID1', 'X', 'User1', 'Program 1', '438'),
('2020-11-01', 'ID2', 'A', 'User2', 'Program 2', '118'),
('2020-11-01', 'ID3', 'X', 'User3', 'Program 3', '380'),
('2017-04-01', 'ID1', 'X', 'User1', 'Program 1', '438'),
('2020-10-01', 'ID2', 'A', 'User2', 'Program 2', '118'),
('2020-10-01', 'ID3', 'X', 'User3', 'Program 3', '380')
)
, month_M AS (
SELECT "Period","ID","Tag","Name","Program","Cost USD",
DATEADD(MONTH, -1, "Period" ) AS "Period M-1 ",
DATEADD(MONTH, -2, "Period" ) AS "Period M-2"
FROM rev_dash
), month_M1 AS (
SELECT "Period","ID","Tag","Name","Program","Cost USD"
FROM rev_dash
), month_M2 AS (
SELECT "Period","ID","Tag","Name","Program","Cost USD"
FROM rev_dash
)
SELECT M."Period",M."ID",M."Tag",M."Name",M."Program",M."Cost USD", M."Period M-1 ",M1."Cost USD" AS "Total Cost M-1",M."Period M-2",M2."Cost USD" AS "Total Cost M-2"
FROM month_M AS M
LEFT JOIN month_M1 AS M1
  ON M."Period M-1 "=M1."Period"
  AND M."ID"=M1."ID"
  AND M."Tag"=M1."Tag"
  AND M."Name"=M1."Name"
  AND M."Program"=M1."Program"
LEFT JOIN month_M2 AS M2
  ON M."Period M-2"=M2."Period"
  AND M."ID"=M2."ID"
  AND M."Tag"=M2."Tag"
  AND M."Name"=M2."Name"
  AND M."Program"=M2."Program"
where "Total Cost M-2" is not null;

Neo4j query concerning two elements

I need help trying to do a query in Neo4j that I can't seem to figure out. The query is to return all cakes that contain both the ingredients: Milk and Cream.
Below is a snippet of a cake node and the ingredients (There are more ingredients and cakes but I didn't post them here as they are all formatted the same):
(brownies:Cake {name: "Brownies"}),
(brownies)-[:CONTAINS {quantity: 50, unit: "grams"}]->(white),
(brownies)-[:CONTAINS {quantity: 250, unit: "grams"}]->(selfraising),
(brownies)-[:CONTAINS {quantity: .5, unit: "grams"}]->(salt),
(brownies)-[:CONTAINS {quantity: 125, unit: "grams"}]->(sugar),
(brownies)-[:CONTAINS {quantity: 250, unit: "grams"}]->(cocoa),
(brownies)-[:CONTAINS {quantity: 125, unit: "grams"}]->(lemonade),
(brownies)-[:CONTAINS {quantity: 125, unit: "grams"}]->(cola),
(brownies)-[:GARNISHED_WITH {how: "chopped on top"}]->(cherry),
(brownies)-[:GARNISHED_WITH {how: "chopped on top"}]->(orange),
(limeJuice:Ingredient {name: "lime juice"}),
(cranberryJuice:Ingredient {name: "cranberry juice"}),
(lemonJuice:Ingredient {name: "lemon juice"}),
(orangeJuice:Ingredient {name: "orange juice"}),
(tomatoJuice:Ingredient {name: "tomato juice"}),
(lemonade:Ingredient {name: "lemonade"}),
(soda:Ingredient {name: "soda water"}),
(spice:Ingredient {name: "spice water"}),
(cola:Ingredient {name: "cola"}),
Neo4j seems to have trouble identifying ingredients but I'm not entirely sure that my query is formatted correctly regardless, here is what I have so far:
MATCH(x:Cake)-[:CONTAINS]-> (Ingredient: "milk" or "cream") Return x
Your Ingredient node check is problematic. Needs to be more like:
MATCH(x:Cake)-[:CONTAINS]-> (i:Ingredient)
WHERE i.name IN ['milk', 'cream']
Return x
Here is one way to get the cakes that contain ALL the ingredients from a list:
MATCH (cake:Cake)
WHERE ALL(x IN ['milk', 'cream'] WHERE (cake)-[:CONTAINS]->(:Ingredient{name: x}))
RETURN cake

Find and update a collection in JSONB

I have a Rails 5.0 app with a JSONB column called data, which contains an array of hashes:
[
{'event': 'web_session', 'user_id': 1, 'count': 13},
{'event': 'web_session', 'user_id': 2, 'count': 10},
{'event': 'web_session', 'user_id': 3, 'count': 42}
]
How would I update one of the hashes, e.g. matching 'user_id': 2, with a different count value?
Is this the most efficient way (I'd potentially have ~1 million hashes):
h = data.find {|h| h['user_id'] == 2}
h['count'] = 43
save

Resources