Siddhi QL: In-memory table outer join with an input stream

I want to calculate the percentage of each protocol appearing in network traffic, continuously, so that the percentages keep being updated as new events arrive. A pie chart is generated and updated with the percentages. Since I need both new and previous data for the calculation, I decided to use an in-memory table to keep events for a longer time (say, a day).
As event tables are usable only when joined with event streams, I chose an outer join so I get the old values as well. Being interested in just the protocols and their percentages, I need only two columns, but I am unable to apply an aggregate function in an outer join. The query I have so far is:
#Import('MAINInStream:1.0.0')
define stream MAINInStream (ts string, uid string, id_orig_h string, id_orig_p int, id_resp_h string, id_resp_p int, proto string, service string, duration double, orig_bytes long, resp_bytes long, conn_state string, local_orig bool, local_resp bool, missed_bytes long, history string, orig_pkts long, orig_ip_bytes long, resp_pkts long, resp_ip_bytes long, tunnel_parents string, sensorname string);
#Export('ProtocolStream:1.0.0')
define stream ProtocolStream (protocol string, count int);
define table mem_conn_table (timestamp long, id_orig_h string, id_orig_p int, id_resp_h string, id_resp_p int, proto string);
from MAINInStream
select time:timestampInMilliseconds(time:dateAdd(str:replaceAll(ts,'T',' '), 5, 'hour',"yyyy-MM-dd HH:mm:ss"),'yyyy-MM-dd HH:mm') as timestamp, id_orig_h, id_orig_p, id_resp_h, id_resp_p, proto
insert into intermediateStream;
from MAINInStream
select time:timestampInMilliseconds(time:dateAdd(str:replaceAll(ts,'T',' '), 5, 'hour',"yyyy-MM-dd HH:mm:ss"),'yyyy-MM-dd HH:mm') as timestamp, id_orig_h, id_orig_p, id_resp_h, id_resp_p, proto
group by id_resp_p
insert into mem_conn_table;
from intermediateStream#window.externalTimeBatch(timestamp,1min, timestamp, 1min) as i right outer join mem_conn_table[time:dateDiff(time:currentTimestamp(),cast(timestamp,"string"), "yyyy-MM-dd HH:mm:ss", "yyyy-MM-dd HH:mm:ss") == 0] as mc
on i.timestamp == mc.timestamp
SELECT (ifThenElse(mc.id_resp_p == 21,'FTP', ifThenElse(mc.id_resp_p == 22,'SSH', ifThenElse(mc.id_resp_p == 25,'SMTP', ifThenElse(mc.id_resp_p == 445,'SMB','MYSQL'))))) as protocol , cast(count(mc.id_resp_p),'int') as count
insert into ProtocolStream;
I am batching with a one-minute external time window and then getting the protocols and their counts, but it isn't giving me any output.
Any suggestions?

You cannot use outer joins with in-memory event tables. If you really need that, you can periodically emit the events residing in the in-memory table into an intermediate stream and use that stream for joining (see the sketch after the query below). However, for your scenario you can use an externalTime window instead of an event table. Try something similar to the below:
#Import('MAINInStream:1.0.0')
define stream MAINInStream (ts string, uid string, id_orig_h string, id_orig_p int, id_resp_h string, id_resp_p int, proto string, service string, duration double, orig_bytes long, resp_bytes long, conn_state string, local_orig bool, local_resp bool, missed_bytes long, history string, orig_pkts long, orig_ip_bytes long, resp_pkts long, resp_ip_bytes long, tunnel_parents string, sensorname string);
#Export('ProtocolStream:1.0.0')
define stream ProtocolStream (protocol string, count long);
#Export('PercentageStream:1.0.0')
define stream PercentageStream (protocol string, count long, percentage double);
from MAINInStream
select
time:timestampInMilliseconds(time:dateAdd(str:replaceAll(ts,'T',' '), 5, 'hour',"yyyy-MM-dd HH:mm:ss"),'yyyy-MM-dd HH:mm') as timestamp,
(ifThenElse(id_resp_p == 21,'FTP', ifThenElse(id_resp_p == 22,'SSH', ifThenElse(id_resp_p == 25,'SMTP', ifThenElse(id_resp_p == 445,'SMB','MYSQL'))))) as protocol,
id_orig_h, id_orig_p, id_resp_h, id_resp_p, proto
insert into intermediateStream;
from intermediateStream#window.externalTime(timestamp, 1 day)
select timestamp, count() as totalCount
insert into totalCountStream;
from intermediateStream#window.externalTime(timestamp, 1 day)
select timestamp, protocol, count() as count
group by protocol
insert into perProtocolCountStream;
from perProtocolCountStream
select protocol, count
insert into ProtocolStream;
from totalCountStream#window.time(1 min) as tcs join perProtocolCountStream#window.time(1 min) as pcs
on tcs.timestamp == pcs.timestamp
select pcs.protocol, pcs.count as count, (pcs.count * 100.0 / tcs.totalCount) as percentage
insert into PercentageStream;
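For completeness, the table-to-stream pattern mentioned at the start would look roughly like this: a periodic trigger joins against the table and emits its rows into a stream, which can then take part in further joins. This is only a minimal sketch; the names pollerTrigger and tableSnapshotStream are hypothetical, and the polling interval would need tuning:
define trigger pollerTrigger at every 1 min;
from pollerTrigger join mem_conn_table as mc
select mc.timestamp, mc.proto
insert into tableSnapshotStream;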


I keep getting "No matching signature for operator = for argument types: STRING, INT64. Supported signature: ANY = ANY at [18:54] in Big Query

SELECT
station_id,
name,
number_of_rides AS number_of_rides_starting_at_station
FROM (
SELECT
start_station_id,
COUNT(*) number_of_rides
FROM `bigquery-public-data.new_york.citibike_trips` AS trips
GROUP BY start_station_id
) AS station_num_trips
INNER JOIN `bigquery-public-data.new_york.citibike_stations`
ON station_id = start_station_id
ORDER BY number_of_rides DESC
I keep getting
No matching signature for operator = for argument types: STRING, INT64. Supported signature: ANY = ANY at [18:54] in Big Query
I tried CAST to change the station_id to a string but it already is a string.
What am I doing wrong?
It looks like only one of the two columns is a string. BigQuery will not implicitly cast values to the most likely common type and compare them; you have to explicitly cast the values in your query:
SELECT
SAFE_CAST(station_id as INT64) as station_id,
name,
number_of_rides AS number_of_rides_starting_at_station
FROM (
SELECT
start_station_id,
COUNT(*) number_of_rides
FROM `bigquery-public-data.new_york.citibike_trips` AS trips
GROUP BY start_station_id
) AS station_num_trips
INNER JOIN `bigquery-public-data.new_york.citibike_stations`
ON SAFE_CAST(station_id AS INT64) = SAFE_CAST(start_station_id AS INT64)
ORDER BY number_of_rides DESC
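Note that SAFE_CAST returns NULL instead of raising an error when a value cannot be converted, so any station_id values that are not numeric will simply never match the join condition:
SELECT SAFE_CAST('abc' AS INT64)  -- returns NULL
SELECT CAST('abc' AS INT64)       -- raises a runtime error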

Stored procedure for inserting values from one table into another with an automatically generated identifier as FK (pgAdmin 4)

I want to write an insert procedure that copies specific values from one table into another existing table.
The difficulty is that my existing table uses automatically generated identifiers as foreign keys to another table.
When I insert rows individually, I use a look-up function, but I don't know how to put this into a procedure in pgAdmin. In Oracle I can use collections, but in pgAdmin I can't figure it out.
My tables:
CREATE TABLE public.field (field_id INT GENERATED BY DEFAULT AS IDENTITY
CONSTRAINT pk_veld_id PRIMARY KEY,
object_number INT,
experiment_number INT,
repetition INT,
plants_field INT,
stem_field INT,
plants_square_meter numeric,
stem_square_meter_start numeric,
stem_square_meter_end numeric,
date_extra_stem date,
row_number_field INT);
CREATE TABLE public.sticky_plates_fields (ID_sticky_plate INT GENERATED BY DEFAULT AS IDENTITY
CONSTRAINT pk_sticky_plate_id PRIMARY KEY,
sticky_plate_number INT,
brand_sticky_plate varchar,
version_plate numeric,
field_id INT constraint fk_sticky_plates_fields references field );
ALTER TABLE IF EXISTS public.sticky_plates_fields
ADD CONSTRAINT sticky_plates_fields_unique UNIQUE (sticky_plate_number, field_id);
DROP TABLE IF EXISTS public.make_sticky_plate_counts CASCADE;
CREATE TABLE public.make_sticky_plate_counts (
experiment_number INT,
object_number INT,
repetition INT,
sticky_plate_number INT,
brand_sticky_plate varchar,
version_plate numeric,
date_start date,
date_count date,
species varchar,
stage varchar,
count_species INT);
The look-up function:
CREATE OR REPLACE FUNCTION project.lookup_field_id
(p_objectnumber INT,
p_experimentnumber INT,
p_repetition INT)
RETURNS integer
LANGUAGE 'plpgsql'
AS
$BODY$
DECLARE
ln_field_id integer;
BEGIN
SELECT field_id INTO ln_field_id
FROM field
WHERE p_objectnumber = object_number AND p_experimentnumber = experiment_number AND p_repetition = repetition;
RETURN ln_field_id;
END;
$BODY$;
Values:
insert into public.field(object_number,experiment_number,repetition,stem_field,stem_square_meter_start,stem_square_meter_end,date_extra_stem,row_number_field)
values (1,4072022,1,20,2.5,3.3,TO_DATE('1-04-2022','DD-MM-YYYY'),10);
insert into public.field(object_number,experiment_number,repetition,stem_field,stem_square_meter_start,stem_square_meter_end,date_extra_stem,row_number_field)
values (1,4072022,2,20,2.5,3.3,TO_DATE('1-04-2022','DD-MM-YYYY'),15);
insert into public.field(object_number,experiment_number,repetition,stem_field,stem_square_meter_start,stem_square_meter_end,date_extra_stem,row_number_field)
values (1,4072022,3,20,2.5,3.3,TO_DATE('1-04-2022','DD-MM-YYYY'),20);
insert into public.field(object_number,experiment_number,repetition,stem_field,stem_square_meter_start,stem_square_meter_end,date_extra_stem,row_number_field)
values (1,4072022,4,20,2.5,3.3,TO_DATE('1-04-2022','DD-MM-YYYY'),25);
insert into public.field(object_number,experiment_number,repetition,stem_field,stem_square_meter_start,stem_square_meter_end,date_extra_stem,row_number_field)
values (2,4072022,1,20,2.5,3.3,TO_DATE('1-04-2022','DD-MM-YYYY'),10);
insert into public.field(object_number,experiment_number,repetition,stem_field,stem_square_meter_start,stem_square_meter_end,date_extra_stem,row_number_field)
values (2,4072022,2,20,2.5,3.3,TO_DATE('1-04-2022','DD-MM-YYYY'),15);
insert into public.field(object_number,experiment_number,repetition,stem_field,stem_square_meter_start,stem_square_meter_end,date_extra_stem,row_number_field)
values (2,4072022,3,20,2.5,3.3,TO_DATE('1-04-2022','DD-MM-YYYY'),20);
insert into public.sticky_plates_fields(sticky_plate_number,brand_sticky_plate,version_plate,field_id)
values(2,'BIOBEST',3,project.lookup_field_id(1,4072022,2));
insert into public.sticky_plates_fields(sticky_plate_number,brand_sticky_plate,version_plate,field_id)
values(1,'BIOBEST',3,project.lookup_field_id(1,4072022,1));
insert into public.sticky_plates_fields(sticky_plate_number,brand_sticky_plate,version_plate,field_id)
values(3,'BIOBEST',3,project.lookup_field_id(1,4072022,3));
insert into public.sticky_plates_fields(sticky_plate_number,brand_sticky_plate,version_plate,field_id)
values(4,'BIOBEST',3,project.lookup_field_id(1,4072022,4));
insert into public.make_sticky_plate_counts(experiment_number,object_number,repetition,sticky_plate_number,brand_sticky_plate,version_plate,date_start,date_count,species,stage,count_species)
values(4072022,2,1,6,'BIOBEST',2.1,TO_DATE('1-04-2022','DD-MM-YYYY'),TO_DATE('14-04-2022','DD-MM-YYYY'),'WHITE_FLY_T','ADULT',12) ;
insert into public.make_sticky_plate_counts(experiment_number,object_number,repetition,sticky_plate_number,brand_sticky_plate,version_plate,date_start,date_count,species,stage,count_species)
values(4072022,2,2,7,'BIOBEST',2.1,TO_DATE('1-04-2022','DD-MM-YYYY'),TO_DATE('14-04-2022','DD-MM-YYYY'),'WHITE_FLY_T','ADULT',12) ;
insert into public.make_sticky_plate_counts(experiment_number,object_number,repetition,sticky_plate_number,brand_sticky_plate,version_plate,date_start,date_count,species,stage,count_species)
values(4072022,2,3,8,'BIOBEST',2.1,TO_DATE('1-04-2022','DD-MM-YYYY'),TO_DATE('14-04-2022','DD-MM-YYYY'),'WHITE_FLY_T','ADULT',12) ;
insert into public.make_sticky_plate_counts(experiment_number,object_number,repetition,sticky_plate_number,brand_sticky_plate,version_plate,date_start,date_count,species,stage,count_species)
values(4072022,2,4,9,'BIOBEST',2.1,TO_DATE('1-04-2022','DD-MM-YYYY'),TO_DATE('14-04-2022','DD-MM-YYYY'),'WHITE_FLY_T','ADULT',12) ;
My attempt at a stored procedure:
Here I want some values of the table make_sticky_plate_counts to be inserted into the table sticky_plates_fields.
I don't know how to write a procedure that inserts the whole distinct set into the sticky-plates table, using the look-up function to find the related FK integer.
CREATE OR REPLACE PROCEDURE insert_records
() LANGUAGE 'plpgsql'
AS
$BODY$
DECLARE
p_object_number INT;
p_experiment_number INT;
p_r_epetitition INT;
p_sticky_plate_number INT;
p_brand_sticky_plate VARCHAR;
p_version_plate VARCHAR;
max_rownumbers_insert INT := 0;
BEGIN
max_rownumbers_insert := SELECT COUNT(*) FROM (SELECT DISTINCT object_number,experiment_number,repetition FROM make_sticky_plate_counts) as temp;
FOR i IN 1..max_rownumbers_insert
LOOP
p_object_number := SELECT object_number [i] FROM (SELECT DISTINCT object_number,experiment_number,repetition FROM make_sticky_plate_counts) as temp ;
p_experiment_number := SELECT experiment_number [i] FROM (SELECT DISTINCT object_number,experiment_number,repetition FROM make_sticky_plate_counts) as temp ;
p_repetitition:= SELECT repetitition [i] FROM (SELECT DISTINCT object_number,experiment_number,repetition FROM make_sticky_plate_counts) as temp ;
p_sticky_plate_number:=SELECT sticky_plate_number [i] FROM (SELECT DISTINCT object_number,experiment_number,repetition FROM make_sticky_plate_counts) as temp);
p_brand_sticky_plate :=SELECT brand_sticky_plate [i] FROM (SELECT DISTINCT object_number,experiment_number,repetition FROM make_sticky_plate_counts) as temp);
p_version_plate :=SELECT version_plate [i] FROM (SELECT DISTINCT object_number,experiment_number,repetition FROM make_sticky_plate_counts) as temp);
INSERT INTO sticky_plate_fields(field_id, sticky_plate_number, brand_sticky_plate,version_plate)
VALUES (project.lookup_field_id(p_object_number,p_experiment_number,p_repetition),p_sticky_plate_number,p_brand_sticky_plate,p_version_plate);
END LOOP;
I figured it out myself. Maybe it is helpful for someone else:
CREATE OR REPLACE PROCEDURE insert_records
() LANGUAGE 'plpgsql'
AS
$BODY$
DECLARE
curs cursor for select * FROM (SELECT DISTINCT object_number,experiment_number,repetition,sticky_plate_number,brand_sticky_plate,version_plate FROM make_sticky_plate_counts) as temp;
BEGIN
FOR row IN curs LOOP
INSERT INTO sticky_plates_fields(field_id, sticky_plate_number, brand_sticky_plate,version_plate)
VALUES (project.lookup_field_id(row.object_number,row.experiment_number,row.repetition),row.sticky_plate_number,row.brand_sticky_plate,row.version_plate);
END LOOP;
END ;
$BODY$;
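As a side note, the row-by-row cursor loop isn't strictly needed here; a single set-based INSERT ... SELECT that calls the look-up function per row does the same work. A minimal sketch, assuming the same tables and look-up function defined above:
INSERT INTO sticky_plates_fields (field_id, sticky_plate_number, brand_sticky_plate, version_plate)
SELECT project.lookup_field_id(object_number, experiment_number, repetition),
       sticky_plate_number,
       brand_sticky_plate,
       version_plate
FROM (SELECT DISTINCT object_number, experiment_number, repetition,
             sticky_plate_number, brand_sticky_plate, version_plate
      FROM make_sticky_plate_counts) AS temp;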

Functional construct for flattening an array into a multiple-row SQL insert query

Is there a way to generate a multiple-row SQL query (values only) using some functional constructs on arrays?
I have an array of Roles that I want to insert into an SQLite database.
struct Role {
var id: Int32
var name: String?
}
func updateByUserId(_ id: Int32, _ roles: [Role]) {
let sql = "INSERT INTO user_role(user_id, role_id) VALUES( \(id), \(roles.map..) )"
}
Expectation:
For instance, if id is 1 and roles contains the role ids [10, 11, 14, 15],
Generated SQL should be
INSERT INTO user_role(user_id, role_id) VALUES(1, 10), (1, 11), (1, 14), (1, 15)
The SQL syntax for a multiple-row insert is
INSERT INTO MyTable ( Column1, Column2 ) VALUES(Value1, Value2),
(Value1, Value2)
You can map each role to the string (id, roleId), then join the array of strings with the separator ", ":
let values = roles.map { "(\(id), \($0.id))" }.joined(separator: ", ")
let sql = "INSERT INTO user_role(user_id, role_id) VALUES \(values)"
Although for this particular scenario the SQL string computation is not problematic, it's good practice to use parametrized statements for every DB query.
Working exclusively with parametrized statements avoids vulnerabilities like SQL injection, as well as malformed queries that fail to execute (when dealing with strings instead of ints).
So I'd recommend going the parametrized route by writing something like this:
func updateByUserId(_ id: Int32, _ roles: [Role]) -> (statement: String, params: [Int32]) {
let statement = "INSERT INTO user_role(user_id, role_id) VALUES " + Array(repeating: "(?, ?)", count: roles.count).joined(separator: ", ")
let params = roles.flatMap { [id, $0.id] }
return (statement, params)
}
For your example in the question, the output would be something like this:
(statement: "INSERT INTO user_role(user_id, role_id) VALUES (?, ?), (?, ?), (?, ?), (?, ?)", params: [1, 10, 1, 11, 1, 14, 1, 15])
You can then use the SQLite functions to create the parametrized statement and bind the given values to it.
P.S. There is also the matter of validating that the array of roles is not empty; otherwise you'd get invalid SQL as output. To handle this, you can make the function return an optional, with nil signalling an empty array. Doing this also enables a small performance improvement, as you'll be able to use String(repeating:count:), which is a little faster than creating an array and joining it later on:
func updateByUserId(_ id: Int32, _ roles: [Role]) -> (statement: String, params: [Int32])? {
guard !roles.isEmpty else { return nil }
return (statement: "INSERT INTO user_role(user_id, role_id) VALUES (?, ?)" + String(repeating: ", (?, ?)", count: roles.count - 1),
params: roles.flatMap { [id, $0.id] })
}
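A hypothetical usage sketch (the Role values are illustrative, and the final prepare/bind step would go through the SQLite C API or your wrapper of choice):
let roles = [Role(id: 10, name: "admin"), Role(id: 11, name: "editor")]
if let (statement, params) = updateByUserId(1, roles) {
    // statement == "INSERT INTO user_role(user_id, role_id) VALUES (?, ?), (?, ?)"
    // params == [1, 10, 1, 11]
    // Prepare `statement`, then bind each value in `params` at its 1-based
    // placeholder index (e.g. with sqlite3_bind_int) before stepping.
    print(statement, params)
}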

Spark join hangs

I have a table with n columns that I'll call A. In this table there are three columns that I'll need:
vat -> String
tax -> String
card -> String
vat or tax can be null, but not both at the same time.
For every unique pair of vat and tax there is at least one card.
I need to alter this table, adding a column card_count in which I put a text value based on the number of cards each unique combination of tax and vat has.
So I've done this:
val cardCount = A.groupBy("tax", "vat").count
val sqlCard = udf((count: Int) => {
if (count > 1)
"MULTI"
else
"MONO"
})
val B = cardCount.withColumn(
"card_count",
sqlCard(cardCount.col("count"))
).drop("count")
In table B I now have three columns:
vat -> String
tax -> String
card_count -> String
and every operation on this DataFrame is smooth.
Now, because I wanted to bring the new column into table A, I performed the following join:
val result = A.join(B,
B.col("tax")<=>A.col("tax") and
B.col("vat")<=>A.col("vat")
).drop(B.col("tax"))
.drop(B.col("vat"))
Expecting to have the original table A with the column card_count.
The problem is that the join hangs, consuming all system resources and freezing the machine.
Additional details:
Table A has ~1.5M rows and is read from a Parquet file;
Table B has ~1.3M rows.
The system has 8 threads and 30 GB of RAM.
Let me know what I'm doing wrong.
In the end, I didn't find out what the issue was, so I changed approach:
val cardCount = A.groupBy("tax", "vat").count
val cardCountSet = cardCount.filter(cardCount.col("count") > 1)
.rdd.map(r => r(0) + " " + r(1)).collect().toSet
val udfCardCount = udf((tax: String, vat:String) => {
if (cardCountSet.contains(tax + " " + vat))
"MULTI"
else
"MONO"
})
val result = A.withColumn("card_count",
udfCardCount(A.col("tax"), A.col("vat")))
If someone knows a better approach, please let me know.
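One alternative worth trying (a sketch, untested against the original data): since B is only the distinct (tax, vat) pairs plus a flag, it should be far smaller than A after aggregation, so hinting a broadcast join avoids the shuffle. Also note that in some Spark versions the null-safe operator <=> is not recognized as an equi-join key and forces a much slower nested-loop join, which could explain the hang:
import org.apache.spark.sql.functions.broadcast

// Broadcast the small aggregated side so each executor joins locally
val result = A.join(
    broadcast(B),
    A.col("tax") <=> B.col("tax") and A.col("vat") <=> B.col("vat")
  )
  .drop(B.col("tax"))
  .drop(B.col("vat"))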

How to create a Sybase stored procedure

I'm new to Sybase and I need to create a stored procedure to gather space usage; basically, convert this into a procedure:
create proc sp_maint
#dbname varchar(30),
#segname varchar(30),
#devname varchar(30),
#devsize int,
#freesize int,
#free_percentage int
as
declare #sizeinpg float,
#perc float,
#segbit int,
#seg int,
#pagefl float
BEGIN
/* for all segments */
select #seg = segment
from syssegments
where name = #segname
select DATE=convert(char(8),getdate(),1),
DB_NAME=db,
SEGMENT_NAME=seg,
Allocated_Space=convert(int,(round(size,0))),
Free_Space=convert(int,round(MBfree,0)),
Free_Percent=convert(int,(round(((MBfree/size)*100),1))),
/* get rid of blanks */
select #dbname = ltrim(rtrim(#dbname))
select #segname = ltrim(rtrim(#segname))
