The query below takes a long time to create a temporary table, even though the result is only about 228,000 distinct records.
DECLARE todate,fromdate DATETIME;
SET fromdate=DATE_SUB(UTC_TIMESTAMP(),INTERVAL 2 DAY);
SET todate=DATE_ADD(UTC_TIMESTAMP(),INTERVAL 14 DAY);
SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
DROP TEMPORARY TABLE IF EXISTS tempabc;
SET max_heap_table_size = 1024*1024*1024;
CREATE TEMPORARY TABLE IF NOT EXISTS tempabc
-- (index using BTREE(id))
ENGINE=MEMORY
AS
(
SELECT SQL_NO_CACHE DISTINCT id
FROM abc
WHERE StartTime BETWEEN fromdate AND todate
);
I have already created an index on the 'StartTime' column, but it still takes 20 seconds to create the table. Kindly help me reduce the creation time.
More Info:
I have since changed my query. Earlier I was using the "tempabc" temporary table to get my output; now I am using an IN clause instead of the temporary table, and it takes 12 seconds to execute, which is still more than the expected time.
Earlier (taking 20-30 sec)
DECLARE todate,fromdate DATETIME;
SET fromdate=DATE_SUB(UTC_TIMESTAMP(),INTERVAL 2 DAY);
SET todate=DATE_ADD(UTC_TIMESTAMP(),INTERVAL 14 DAY);
SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
DROP TEMPORARY TABLE IF EXISTS tempabc;
SET max_heap_table_size = 1024*1024*1024;
CREATE TEMPORARY TABLE IF NOT EXISTS tempabc
-- (index using BTREE(id))
ENGINE=MEMORY
AS
(
SELECT SQL_NO_CACHE DISTINCT id
FROM abc
WHERE StartTime BETWEEN fromdate AND todate
);
SELECT DISTINCT p.xyzID
FROM tempabc s
JOIN xyz_tab p ON p.xyzID=s.ID AND IFNULL(IsGeneric,0)=0;
Now (taking 12-14 sec)
DECLARE todate,fromdate Timestamp;
SET fromdate=DATE_SUB(UTC_TIMESTAMP(),INTERVAL 2 DAY);
SET todate=DATE_ADD(UTC_TIMESTAMP(),INTERVAL 14 DAY);
SELECT p.xyzID FROM xyz_tab p
WHERE id IN (
SELECT DISTINCT id FROM abc
WHERE StartTime BETWEEN fromdate AND todate )
AND IFNULL(IsGeneric,0)=0 GROUP BY p.xyzID;
But we need to achieve an execution time of 3-5 seconds.
This is my explain output.
*************************** 1. row ***************************
id: 1
select_type: SIMPLE
table: abc
partitions: NULL
type: index
possible_keys: ix_starttime_id,IDX_Start_time,IX_id_starttime,IX_id_starttime_prgsvcid
key: IX_id_starttime
key_len: 163
ref: NULL
rows: 18779876
filtered: 1.27
Extra: Using where; Using index; Using temporary; Using filesort; LooseScan
*************************** 2. row ***************************
id: 1
select_type: SIMPLE
table: p
partitions: NULL
type: eq_ref
possible_keys: PRIMARY,IX_seriesid
key: PRIMARY
key_len: 152
ref: onconnectdb.abc.ID
rows: 1
filtered: 100.00
Extra: Using where
Explain in JSON format
EXPLAIN: {
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "10139148.44"
},
"grouping_operation": {
"using_temporary_table": true,
"using_filesort": true,
"cost_info": {
"sort_cost": "1.00"
},
"nested_loop": [
{
"table": {
"table_name": "abc",
"access_type": "index",
"possible_keys": [
"ix_starttime_tmsid",
"IDX_Start_time",
"IX_id_starttime",
"IX_id_starttime_prgsvcid"
],
"key": "IX_id_starttime",
"used_key_parts": [
"ID",
"StartTime",
"EndTime"
],
"key_length": "163",
"rows_examined_per_scan": 19280092,
"rows_produced_per_join": 264059,
"filtered": "1.37",
"using_index": true,
"loosescan": true,
"cost_info": {
"read_cost": "393472.45",
"eval_cost": "52812.00",
"prefix_cost": "446284.45",
"data_read_per_join": "2G"
},
"used_columns": [
"ID",
"StartTime"
],
"attached_condition": "(`onconnectdb`.`abc`.`StartTime` between <cache>(fromdate#1) and <cache>(todate#0))"
}
},
{
"table": {
"table_name": "p",
"access_type": "eq_ref",
"possible_keys": [
"PRIMARY",
"IX_seriesid"
],
"key": "PRIMARY",
"used_key_parts": [
"ID"
],
"key_length": "152",
"ref": [
"onconnectdb.abc.ID"
],
"rows_examined_per_scan": 1,
"rows_produced_per_join": 1,
"filtered": "100.00",
"cost_info": {
"read_cost": "9640051.00",
"eval_cost": "0.20",
"prefix_cost": "10139147.44",
"data_read_per_join": "2K"
},
"used_columns": [
"ID",
"xyzID",
"IsGeneric"
],
"attached_condition": "(ifnull(`onconnectdb`.`p`.`IsGeneric`,0) = 0)"
}
}
]
}
}
}
Please suggest.
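One thing the EXPLAIN above suggests: it shows a full index scan (access_type "index", roughly 19 million rows examined) over IX_id_starttime rather than a range scan on an index that leads with StartTime, even though ix_starttime_id is listed in possible_keys. A minimal sketch of something to test, assuming abc really does have an index whose first column is StartTime (the index name is copied from possible_keys and may need adjusting):
-- Derive the distinct ids from a StartTime range scan first, then look the
-- rows up in xyz_tab by primary key. FORCE INDEX is only there to check
-- whether the LooseScan plan is the problem; drop it once that is confirmed.
SELECT p.xyzID
FROM xyz_tab p
JOIN (
    SELECT DISTINCT id
    FROM abc FORCE INDEX (ix_starttime_id)
    WHERE StartTime BETWEEN fromdate AND todate
) s ON s.id = p.id
WHERE IFNULL(p.IsGeneric, 0) = 0
GROUP BY p.xyzID;
The idea is simply to let the BETWEEN predicate drive a range scan over the roughly 228,000 matching rows instead of a loose scan over the whole 19-million-row index; whether that lands in the 3-5 second target depends on the data and hardware.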
I have a compound index as follows.
index({ account_id: 1, is_private: 1, visible_in_list: 1, sent_at: -1, user_id: 1, status: 1, type: 1, 'tracking.last_opened_at' => -1 }, {name: 'email_page_index'})
Then I have a query with these exact fields,
selector:
{"account_id"=>BSON::ObjectId('id'), "is_private"=>false, "visible_in_list"=>{:$in=>[true, false]}, "status"=>{:$in=>["ok", "queued", "processing", "failed"]}, "sent_at"=>{"$lte"=>2021-03-22 15:29:18 UTC}, "tracking.last_opened_at"=>{"$gt"=>1921-03-22 15:29:18 UTC}, "user_id"=>BSON::ObjectId('id')}
options: {:sort=>{"tracking.last_opened_at"=>-1}}
The winningPlan is the following
"inputStage": {
"stage": "SORT_KEY_GENERATOR",
"inputStage": {
"stage": "FETCH",
"filter": {
"$and": [
{
"account_id": {
"$eq": {
"$oid": "objectid"
}
}
},
{
"is_private": {
"$eq": false
}
},
{
"sent_at": {
"$lte": "2021-03-22T14:06:10.000Z"
}
},
{
"tracking.last_opened_at": {
"$gt": "1921-03-22T14:06:10.716Z"
}
},
{
"status": {
"$in": [
"failed",
"ok",
"processing",
"queued"
]
}
},
{
"visible_in_list": {
"$in": [
false,
true
]
}
}
]
},
"inputStage": {
"stage": "IXSCAN",
"keyPattern": {
"user_id": 1
},
"indexName": "user_id_1",
"isMultiKey": false,
"multiKeyPaths": {
"user_id": []
},.....
And the rejected plan uses the compound index and takes the following form:
"rejectedPlans": [
{
"stage": "FETCH",
"inputStage": {
"stage": "SORT",
"sortPattern": {
"tracking.last_opened_at": -1
},
"inputStage": {
"stage": "SORT_KEY_GENERATOR",
"inputStage": {
"stage": "IXSCAN",
"keyPattern": {
"account_id": 1,
"is_private": 1,
"visible_in_list": 1,
"sent_at": -1,
"user_id": 1,
"status": 1,
"type": 1,
"tracking.last_opened_at": -1
},
"indexName": "email_page_index",
"isMultiKey": false,
"multiKeyPaths": {
"account_id": [],
"is_private": [],
"visible_in_list": [],
"sent_at": [],
"user_id": [],
"status": [],
"type": [],
"tracking.last_opened_at": []
},
"isUnique": false,
The problem is that the winningPlan is slow. Wouldn't it be better if Mongoid chose the compound index? Is there a way to force it?
Also, how can I see the execution time for each separate STAGE?
I am posting some information that may help resolve the performance issue and get an appropriate index used. Please note this may not be the solution (and the issue is open to discussion).
...Also, how can I see the execution time for each separate STAGE?
For this, generate the query plan using the explain with the executionStats verbosity mode.
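For example, from the mongo shell (the collection name and filter here are just placeholders standing in for the real query):
// Re-run the same find + sort with the executionStats verbosity.
db.emails.find(
  { account_id: ObjectId("..."), is_private: false, user_id: ObjectId("...") }
).sort(
  { "tracking.last_opened_at": -1 }
).explain("executionStats")
In the output, executionStats.executionStages nests one document per stage (IXSCAN, FETCH, SORT, ...), and each stage carries executionTimeMillisEstimate along with counters such as nReturned, which is usually enough to see where the time is spent.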
The problem is that the winningPlan is slow. Wouldn't it be better if Mongoid chose the compound index? Is there a way to force it?
As posted, the plans show a "stage": "SORT_KEY_GENERATOR", implying that the sort operation is being performed in memory (that is, not using an index for the sort). That would be one reason (or the main one) for the slow performance. So, how do we make the query and the sort use the index?
A single compound index can be used for a query with filter + sort operations. That would be an efficient index and query. But it requires that the compound index be defined in a certain way - some rules need to be followed. See the topic Sort and Non-prefix Subset of an Index - which is the case in this post. I quote the example from the documentation for illustration:
Suppose there is a compound index: { a: 1, b: 1, c: 1, d: 1 }
And all the fields are used in a query with filter + sort. The ideal query would have a filter + sort as follows:
db.test.find( { a: "val1", b: "val2", c: 1949 } ).sort( { d: 1 })
Note that the query filter has three fields with equality conditions (there are no $gt, $lt, etc.). Then the query's sort uses the last field d of the index. This is the ideal situation, where the index will be used for the query's filter as well as its sort operation.
In your case, this cannot be applied to the posted query as-is. So, to work towards a solution you may have to define a new index that takes advantage of the Sort and Non-prefix Subset of an Index rule.
Is it possible? It depends upon your application and the use case. I have an idea that may help. Create a compound index like the following and see how it works:
account_id: 1,
is_private: 1,
visible_in_list: 1,
status: 1,
user_id: 1,
'tracking.last_opened_at': -1
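In Mongoid syntax, mirroring the email_page_index definition above (the index name below is just a suggestion), that would be something like:
index({ account_id: 1, is_private: 1, visible_in_list: 1, status: 1, user_id: 1, 'tracking.last_opened_at' => -1 }, { name: 'email_filter_sort_index' })
After adding it to the model, create the index (for example with the db:mongoid:create_indexes rake task) and re-check the plan with explain.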
I think having the condition "tracking.last_opened_at"=>{"$gt"=>1921-03-22 15:29:18 UTC} in the query's filter may not help with the usage of the index.
Also, include some details like the version of the MongoDB server, the size of the collection, and some platform details. In general, query performance depends upon many factors, including indexes, RAM, the size and type of data, and the kind of operations on the data.
The ESR Rule:
When using a compound index for a query with multiple filter conditions and a sort, the Equality, Sort, Range rule is sometimes useful for optimizing the query. See the following post for such a scenario: MongoDB - Index not being used when sorting and limiting on ranged query
I have a situation where I need to get the taskId value of the dictionary where the taskCode is "LIFE_MAX_DAYS". How can I do this?
Dictionary
replFlag: null (Text)
taskCode: "LIFE_MAX_DAYS"
createdBy: "Administrator"
createdOn: 3/20/2020 1:54 AM EDT
lastModifiedBy: "Administrator"
lastModifiedOn: 3/20/2020 1:54 AM EDT
actionId: null (Number (Integer))
priorityId: 5
statusId: null (Number (Integer))
concatKey: null (Text)
taskId: 5980
batchId: null (Number (Integer))
id: 4
Dictionary
replFlag: null (Text)
taskCode: "LIFE_MAX_DAYS"
createdBy: "Administrator"
createdOn: 3/20/2020 1:54 AM EDT
lastModifiedBy: "Administrator"
lastModifiedOn: 3/20/2020 1:54 AM EDT
actionId: null (Number (Integer))
priorityId: 5
statusId: null (Number (Integer))
concatKey: null (Text)
taskId: 5980
batchId: null (Number (Integer))
id: 5
wherecontains finds the indices in an array where the provided value appears.
Furthermore, lists support multi-indexing (e.g. {"a", "b", "c", "d"}[{1, 3}] returns {"a", "c"}).
Finally, lists support projected indexing (e.g. {{x: 1}, {x: 2}, {x: 3}}.x returns {1, 2, 3}).
These three features allow you to do this:
with(
/* Pull out just the task codes of all the tasks */
taskCodesOfTasks: listOfTasks.taskCode,
  /* Get the indices where the task code is what we're looking for */
  indices: wherecontains("LIFE_MAX_DAYS", taskCodesOfTasks),
  /* Pull out the task data */
  selectedTasks: listOfTasks[indices],
/* Return the task IDs */
selectedTasks.taskId
)
Which can of course be spelled with much less ceremony:
listOfTasks[wherecontains("LIFE_MAX_DAYS", listOfTasks.taskCode)].taskId
I have a JSON document as shown below:
{
"list": [{
"notificationId": 123,
"userId": 444
},
{
"notificationId": 456,
"userId": 789
}
]
}
I need to write a Postgres procedure which iterates through the list and performs either an update or an insert, depending on whether the notification id is already present in the DB.
I have a notification table which has notificationid and userID as columns.
Can anyone please tell me how to perform this using Postgres JSON operators?
Try this query:
SELECT *
FROM yourTable
WHERE col->'list' @> '[{"notificationId":123}]';
You may replace the value 123 with whatever notificationId you want to search for. Follow the link below for a demo showing that this logic works:
Demo
Assuming you have a unique constraint on notificationid (e.g. because it's the primary key), there is no need for a stored function or loop:
with data (j) as (
values ('
{
"list": [{
"notificationId": 123,
"userId": 444
},
{
"notificationId": 456,
"userId": 789
}
]
}'::jsonb)
)
insert into notification (notificationid, userid)
select (e.r ->> 'notificationId')::int, (e.r ->> 'userId')::int
from data d, jsonb_array_elements(d.j -> 'list') as e(r)
on conflict (notificationid) do update
set userid = excluded.userid;
The first step in that statement is to turn the array into a list of rows; this is what
select e.*
from data d, jsonb_array_elements(d.j -> 'list') as e(r)
does. Given your sample JSON, this returns two rows with a JSON value in each:
r
--------------------------------------
{"userId": 444, "notificationId": 123}
{"userId": 789, "notificationId": 456}
This is then split into two integer columns:
select (e.r ->> 'notificationId')::int, (e.r ->> 'userId')::int
from data d, jsonb_array_elements(d.j -> 'list') as e(r)
So we get:
int4 | int4
-----+-----
123 | 444
456 | 789
And this result is used as the input for an INSERT statement.
The on conflict clause then does an insert or an update, depending on the presence of a row identified by the column notificationid, which has to have a unique index.
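If you would still rather wrap this in a function so the application can simply pass the JSON document as a parameter, here is a minimal sketch (the function name upsert_notifications and the integer column types are assumptions on my part):
CREATE OR REPLACE FUNCTION upsert_notifications(payload jsonb) RETURNS void AS
$$
    -- Same INSERT ... ON CONFLICT as above, just parameterized on the payload.
    INSERT INTO notification (notificationid, userid)
    SELECT (e.r ->> 'notificationId')::int, (e.r ->> 'userId')::int
    FROM jsonb_array_elements(payload -> 'list') AS e(r)
    ON CONFLICT (notificationid) DO UPDATE
        SET userid = excluded.userid;
$$ LANGUAGE sql;
It is then called with the whole document, e.g. select upsert_notifications('{"list": [...]}'::jsonb);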
Meanwhile, I tried this:
CREATE OR REPLACE FUNCTION insert_update_notifications(notification_ids jsonb) RETURNS void AS
$$
DECLARE
allNotificationIds text[];
indJson jsonb;
notIdCount int;
i json;
BEGIN
FOR i IN SELECT * FROM jsonb_array_elements(notification_ids)
LOOP
select into notIdCount count(notification_id) from notification_table where notification_id = i->>'notificationId' ;
IF(notIdCount = 0 ) THEN
insert into notification_table(notification_id,userid) values(i->>'notificationId',i->>'userId');
ELSE
update notification_table set userid = i->>'userId' where notification_id = i->>'notificationId';
END IF;
END LOOP;
END;
$$
language plpgsql;
select * from insert_update_notifications('[{
"notificationId": "123",
"userId": "444"
},
{
"notificationId": "456",
"userId": "789"
}
]');
It works. Please review this.
I am at a bit of a loss trying to figure out what is going on here:
I get results for this query:
SELECT value FROM "measures" WHERE time <= 1465195336002ms ORDER BY time desc
{
"results": [
{
"statement_id": 0,
"series": [
{
"name": "measures",
"columns": [
"time",
"value"
],
"values": [
[
1465195336000,
87.4
],
[
1464596862000,
86.66
],
[
1464070337000,
86.64
],
[
1463985100000,
86.77
]
]
}
]
}
]
}
All well and good, as expected.
But if I issue the following query, I get no results. Clearly this should match the same rows as above minus the first result:
SELECT value FROM "measures" WHERE time <= 1464596862000ms ORDER BY time desc
{
"results": [
{
"statement_id": 0
}
]
}
I figured it out. Although it is far from obvious, it seems that this behaviour occurs when there is more than one measurement recorded for a given time period.
I've set up a Druid cluster to ingest real-time data from Kafka.
Question
Does Druid support fetching data that's sorted by timestamp? For example, let's say I need to retrieve the latest 10 entries from a datasource X. Can I do this by using a LimitSpec (in the query JSON) that includes the timestamp field? Or is there another, better option supported by Druid?
Thanks in advance.
Get unaggregated rows
To get unaggregated rows, you can do a query with "queryType": "select".
Select queries are also useful when pagination is needed - they let you set a page size, and automatically return a paging identifier for use in future queries.
In this example, if we just want the top 10 rows, we can pass in "pagingSpec": { "pageIdentifiers": {}, "threshold": 10 }.
Order by timestamp
To order these rows by "timestamp", you can pass in "descending": "true".
Looks like most Druid query types support the descending property.
Example Query:
{
"queryType": "select",
"dataSource": "my_data_source",
"granularity": "all",
"intervals": [ "2017-01-01T00:00:00.000Z/2017-12-30T00:00:00.000Z" ],
"descending": "true",
"pagingSpec": { "pageIdentifiers": {}, "threshold": 10 }
}
Docs on "select" type queries
You can use a groupBy query to do this: group by __time with an extraction function, set granularity to all, and use the limitSpec to sort and limit - that will work. If you want to use a timeseries query instead, it is trickier to get the latest 10. One way to do it is to set the granularity to the desired one, say Hour, then set the interval to 10H starting from the most recent point in time. That is easier said than done, so I would go the first way unless you have a major performance issue.
{
"queryType": "groupBy",
"dataSource": "wikiticker",
"granularity": "all",
"dimensions": [
{
"type": "extraction",
"dimension": "__time",
"outputName": "extract_time",
"extractionFn": {
"type": "timeFormat"
}
}
],
"limitSpec": {
"type": "default",
"limit": 10,
"columns": [
{
"dimension": "extract_time",
"direction": "descending"
}
]
},
"aggregations": [
{
"type": "count",
"name": "$f2"
},
{
"type": "longMax",
"name": "$f3",
"fieldName": "added"
}
],
"intervals": [
"1900-01-01T00:00:00.000/3000-01-01T00:00:00.000"
]
}