Problem while implementing a join of two datasets in Google Cloud Dataflow using Apache Beam

I was trying to run SQL over two datasets on Google Cloud Storage using Apache Beam, following the Apache Beam documentation at https://beam.apache.org/documentation/dsls/sql/walkthrough/
But I ended up with the exception below:
An exception occured while executing the Java class. org.apache.beam.sdk.transforms.MapElements
.via(Lorg/apache/beam/sdk/transforms/SimpleFunction;)Lorg/apache/beam/sdk/transforms/MapElements;
I tried changing the Beam SDK version and made other code changes, but none of them worked.
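This is a NoSuchMethodError-style signature mismatch, which usually means two Beam artifacts of different versions ended up on the classpath (for example, beam-sdks-java-extensions-sql compiled against a different beam-sdks-java-core). As a sketch of the fix, assuming Maven is used and with an illustrative version number, every Beam artifact should resolve to the same version:
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-sdks-java-core</artifactId>
<version>2.19.0</version>
</dependency>
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-sdks-java-extensions-sql</artifactId>
<version>2.19.0</version>
</dependency>
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-runners-google-cloud-dataflow-java</artifactId>
<version>2.19.0</version>
</dependency>
Running mvn dependency:tree and checking that every org.apache.beam artifact shows the same version is a quick way to confirm or rule this out. The full code of the failing pipeline follows: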
package com.nitesh.gcp.feature;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.extensions.sql.SqlTransform;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.sdk.values.TupleTag;
import java.util.stream.Collectors;
public class beamSQL1 {
public static final String EMPHEADER = "empno,ename,job,mgr,hiredate,sal,comm,deptno";
public static final String DEPTHEADER = "deptno,dname,location";
public static final Schema EMPSCHEMA = Schema.builder()
.addStringField("empno")
.addStringField("ename")
.addStringField("job")
.addStringField("mgr")
.addStringField("hiredate")
.addStringField("sal")
.addStringField("comm")
.addStringField("deptno")
.build();
public static final Schema DEPTSCHEMA = Schema.builder()
.addStringField("deptno")
.addStringField("dname")
.addStringField("location")
.build();
public static void main(String[] args) {
PipelineOptionsFactory.register(DataflowPipelineOptions.class);
DataflowPipelineOptions options = PipelineOptionsFactory
.fromArgs(args)
.withValidation()
.as(DataflowPipelineOptions.class);
Pipeline pipeline = Pipeline.create(options);
// Note: transform names must be unique within a pipeline.
PCollection<String> employee = pipeline.apply("Read Employee From GCS", TextIO.read().from("gs://amazon-test/sqlData/employee.txt"));
PCollection<String> department = pipeline.apply("Read Department From GCS", TextIO.read().from("gs://amazon-test/sqlData/department.txt"));
PCollection<Row> employeeRow = employee.apply("Transform Employee To Row", ParDo.of(new RowParDo())).setRowSchema(EMPSCHEMA);
PCollection<Row> departmentRow = department.apply("Transform Department To Row", ParDo.of(new RowParDoForDept())).setRowSchema(DEPTSCHEMA);
PCollectionTuple output = PCollectionTuple.of(new TupleTag<>("emp"), employeeRow).and(new TupleTag<>("dept"), departmentRow);
output.apply(
SqlTransform.query(
// "SELECT emp.empno,emp.ename,dept.deptno,dept.dname FROM emp JOIN dept ON emp.deptno = dept.deptno"))
"SELECT * from emp JOIN dept ON emp.deptno = dept.deptno"))
/* p2.apply("Transform Sql", SqlTransform.query(
"SELECT * " +
"FROM PCOLLECTION order by sal desc LIMIT 14")
)*/
.apply("TransForm To String", ParDo.of(new RowToString()))
.apply("Write To GCS", TextIO.write().to("gs://amazon-test/sqlData/output/outputSql.csv").withoutSharding());
pipeline.run();
}
//ParDo for String -> Row (SQL)
public static class RowParDo extends DoFn<String, Row> {
@ProcessElement
public void processElement(ProcessContext c) {
if (!c.element().equalsIgnoreCase(EMPHEADER)) {
String[] vals = c.element().split(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)");
Row appRow = Row
.withSchema(EMPSCHEMA)
.addValues(vals[0], vals[1], vals[2], vals[3], vals[4], vals[5], vals[6], vals[7])
.build();
c.output(appRow);
}
}
}
//ParDo for Row (SQL) -> String
public static class RowToString extends DoFn<Row, String> {
@ProcessElement
public void processElement(ProcessContext c) {
String line = c.element().getValues()
.stream()
.map(Object::toString)
.collect(Collectors.joining(","));
c.output(line);
}
}
//ParDo for String -> Row (SQL)
public static class RowParDoForDept extends DoFn<String, Row> {
@ProcessElement
public void processElement(ProcessContext c) {
if (!c.element().equalsIgnoreCase(DEPTHEADER)) {
String[] vals = c.element().split(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)");
Row appRow = Row
.withSchema(DEPTSCHEMA)
.addValues(vals[0], vals[1], vals[2])
.build();
c.output(appRow);
}
}
}
}

Related

SpannerIO java.lang.IllegalStateException: Sorter should be null here

I am trying to write to Spanner from a Dataflow streaming job by using
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-sdks-java-io-google-cloud-platform</artifactId>
<version>2.18.0</version>
</dependency>
After mapping the data to PCollection<Mutation>, I write them to Spanner via SpannerIO.write:
Pipeline pipeline = Pipeline.create(options);
PCollection<Mutation> mutations = pipeline.apply...
mutations.apply("WriteMutations", SpannerIO.write()
.withInstanceId(INSTANCE_ID)
.withDatabaseId(DATABASE_ID)
);
pipeline.run();
However, it throws
java.lang.IllegalStateException: Sorter should be null here
at org.apache.beam.sdk.io.gcp.spanner.SpannerIO$GatherBundleAndSortFn.startBundle (SpannerIO.java:1080)
What would be the cause of this exception?
The following pipeline reproduces the exception. I tested it with 20 workers, but it looks like it's independent of the data load.
import com.google.cloud.spanner.Mutation;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage;
import org.apache.beam.sdk.io.gcp.spanner.SpannerIO;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.Repeatedly;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.joda.time.Duration;
import java.util.UUID;
public final class TestPipeline {
private static final Duration WINDOW_DURATION = Duration.standardSeconds(1);
private static final String DATABASE_ID = "test";
private static final String INSTANCE_ID = "test-spanner";
private static final String TEST_TABLE = "test";
public static void main(String[] args) {
TestPipelineOptions options = PipelineOptionsFactory
.fromArgs(args)
.withValidation()
.as(TestPipelineOptions.class);
Pipeline pipeline = Pipeline.create(options);
pipeline
.apply("Read pubsub", PubsubIO.readMessagesWithAttributes()
.fromSubscription(options.getInputSubscription()))
.apply("Parse message", ParDo.of(new ProcessMessage()))
.apply("Windowing", Window.<Mutation>into(new GlobalWindows())
.triggering(Repeatedly.forever(
AfterProcessingTime.pastFirstElementInPane()
.plusDelayOf(WINDOW_DURATION)))
.withAllowedLateness(Duration.ZERO)
.discardingFiredPanes())
.apply("Write mutations", SpannerIO.write()
.withInstanceId(INSTANCE_ID)
.withDatabaseId(DATABASE_ID)
);
pipeline.run();
}
private static class ProcessMessage extends DoFn<PubsubMessage, Mutation> {
@ProcessElement
public void processElement(@Element final PubsubMessage message,
final OutputReceiver<Mutation> out) {
out.output(Mutation.newInsertOrUpdateBuilder(TEST_TABLE)
.set("id").to(UUID.randomUUID().toString())
.set("string").to("test")
.set("count").to(Long.MAX_VALUE)
.build()
);
}
}
interface TestPipelineOptions extends DataflowPipelineOptions {
@Description("Google Pubsub subscription id.")
String getInputSubscription();
void setInputSubscription(String inputSubscription);
}
}
Table definition:
CREATE TABLE test (id STRING(50) NOT NULL, string STRING(50) NOT NULL, count INT64) PRIMARY KEY (id);
This issue seems to occur with Apache Beam version 2.18, but not with version 2.17.
The issue with Apache Beam version 2.18 is tracked here: https://issues.apache.org/jira/browse/BEAM-9505
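Until that fix lands, the observation above suggests pinning the connector back to 2.17.0 as a workaround (a sketch, mirroring the dependency block above):
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-sdks-java-io-google-cloud-platform</artifactId>
<version>2.17.0</version>
</dependency>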

Beam: No events with SideInput in a streaming pipeline with DataflowRunner

I've tested side inputs in a streaming pipeline with DirectRunner and DataflowRunner using this code:
import com.google.api.services.bigquery.model.TableRow;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.GenerateSequence;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.TypeDescriptor;
import org.apache.beam.sdk.values.TypeDescriptors;
import org.joda.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.Random;
public class Testsideinput {
private static final Logger LOG = LoggerFactory.getLogger(Testsideinput.class);
static class RefreshCache extends DoFn<Long, String> {
private static final long serialVersionUID = 1;
private static final Random RANDOM = new Random();
@ProcessElement
public void processElement(ProcessContext c) {
c.output("A"+c.element());
c.output("B"+c.element());
c.output("C"+c.element());
c.output("D"+c.element());
c.output("E"+c.element());
c.output("F"+c.element());
}
}
public static void main(String[] args) {
PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
Pipeline pipeline = Pipeline.create(options);
final PCollectionView<List<String>> sideInput2 =
pipeline.apply("TextIO", TextIO.read().from("<Put your gs://>))
.apply("viewTags", View.asList());
final PCollectionView<List<String>> sideInput =
pipeline.apply("GenerateSequence",
GenerateSequence
.from(0)
.withRate(1, Duration.standardSeconds(1)))
.apply("Window GenerateSequence",
Window.into(FixedWindows.of(Duration.standardSeconds(5))))
.apply("Counts", Combine.globally(Sum.ofLongs()).withoutDefaults())
.apply("RefreshCache", ParDo.of(new RefreshCache()))
.apply("viewTags", View.asList());
final PubsubIO.Read<PubsubMessage> pubsubRead =
PubsubIO.readMessages()
.withIdAttribute("id")
.withTimestampAttribute("ts")
.fromTopic("<put your topic>");
// PCollection<KV<String,Long>> taxi =;
PCollection<String> taxi =
pipeline.apply("Read from", pubsubRead)
.apply("Window Fixed",
Window.into(FixedWindows.of(Duration.standardSeconds(15))))
.apply(MapElements.via(new PubSubToTableRow()))
.apply("key rides by rideid",
MapElements
.into(TypeDescriptors
.kvs(TypeDescriptors.strings(),
TypeDescriptor.of(TableRow.class)))
.via(ride -> KV.of(ride.get("ride_id").toString(), ride)))
.apply("Count Per Element", Count.perKey())
.apply(
ParDo.of(new DoFn<KV<String,Long>, String>() {
@ProcessElement
public void processElement(
@Element KV<String,Long> value,
OutputReceiver<String> out, ProcessContext c) {
// In our DoFn, access the side input.
List<String> sideinput = c.sideInput(sideInput);
List<String> sideinput2 = c.sideInput(sideInput2);
LOG.info("sideinput" + sideinput.toString());
LOG.info("sideinput2 " + sideinput2.toString());
LOG.info("value " + value);
out.output("test");
}
}).withSideInputs(sideInput,sideInput2));
pipeline.run();
}
}
I get all the values of my side inputs (list and map) with DirectRunner, but I get no values with DataflowRunner (there is no output from the View.CreatePCollectionView/ParDo(StreamingPCollectionViewWriter) step).
Do you have an idea how to solve this?
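For comparison, the Beam documentation describes a "slowly updating global window side inputs" pattern for streaming pipelines: the side input is placed in the global window with a processing-time trigger so the view keeps getting (re)emitted for streaming consumers. A sketch adapted to the RefreshCache fn above, assuming the corresponding windowing imports (GlobalWindows, Repeatedly, AfterProcessingTime); the refresh rate is illustrative:
// Periodically regenerate the side input and re-trigger the view so
// streaming consumers can read it.
final PCollectionView<List<String>> refreshingSideInput =
    pipeline
        .apply("GenerateSequence",
            GenerateSequence.from(0).withRate(1, Duration.standardSeconds(5)))
        .apply("RefreshCache", ParDo.of(new RefreshCache()))
        .apply("GlobalWindowWithTrigger",
            Window.<String>into(new GlobalWindows())
                .triggering(Repeatedly.forever(
                    AfterProcessingTime.pastFirstElementInPane()))
                .withAllowedLateness(Duration.ZERO)
                .discardingFiredPanes())
        .apply("ViewAsList", View.asList());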

How to Batch By N Elements in Streaming Pipeline With Small Bundles?

I've implemented batching by N elements as described in this answer:
Can datastore input in google dataflow pipeline be processed in a batch of N entries at a time?
package com.example.dataflow.transform;
import com.example.dataflow.event.ClickEvent;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.joda.time.Instant;
import java.util.ArrayList;
import java.util.List;
public class ClickToClicksPack extends DoFn<ClickEvent, List<ClickEvent>> {
public static final int BATCH_SIZE = 10;
private List<ClickEvent> accumulator;
@StartBundle
public void startBundle() {
accumulator = new ArrayList<>(BATCH_SIZE);
}
@ProcessElement
public void processElement(ProcessContext c) {
ClickEvent clickEvent = c.element();
accumulator.add(clickEvent);
if (accumulator.size() >= BATCH_SIZE) {
c.output(accumulator);
accumulator = new ArrayList<>(BATCH_SIZE);
}
}
@FinishBundle
public void finishBundle(FinishBundleContext c) {
if (accumulator.size() > 0) {
ClickEvent clickEvent = accumulator.get(0);
long time = clickEvent.getClickTimestamp().getTime();
c.output(accumulator, new Instant(time), GlobalWindow.INSTANCE);
}
}
}
But when I run the pipeline in streaming mode, there are a lot of batches with just 1 or 2 elements. As I understand it, that's because of the small bundle sizes. After running for a day, the average number of elements per batch is roughly 4. I really need it to be closer to 10 for better performance of the next steps.
Is there a way to control bundle size?
Or should I use the "GroupIntoBatches" transform for this purpose? In that case it's not clear to me what should be selected as the key.
UPDATE:
Is it a good idea to use the Java thread id or VM hostname as the key for the "GroupIntoBatches" transform?
I've ended up writing a composite transform with "GroupIntoBatches" inside.
The following answer contains recommendations regarding key selection:
https://stackoverflow.com/a/44956702/4888849
In my current implementation I'm using random keys to achieve parallelism, and I'm windowing events in order to emit results regularly even if there are fewer than BATCH_SIZE events for a given key.
package com.example.dataflow.transform;
import com.example.dataflow.event.ClickEvent;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.GroupIntoBatches;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;
import java.util.Random;
/**
* Batch clicks into packs of BATCH_SIZE size
*/
public class ClickToClicksPack extends PTransform<PCollection<ClickEvent>, PCollection<Iterable<ClickEvent>>> {
public static final int BATCH_SIZE = 10;
// Define window duration.
// After the window ends, elements are emitted even if there are fewer than BATCH_SIZE of them
public static final int WINDOW_DURATION_SECONDS = 1;
private static final int DEFAULT_SHARDS_NUMBER = 20;
// Determine possible parallelism level
private int shardsNumber = DEFAULT_SHARDS_NUMBER;
public ClickToClicksPack() {
super();
}
public ClickToClicksPack(int shardsNumber) {
super();
this.shardsNumber = shardsNumber;
}
@Override
public PCollection<Iterable<ClickEvent>> expand(PCollection<ClickEvent> input) {
return input
// assign keys, as "GroupIntoBatches" works only with key-value pairs
.apply(ParDo.of(new AssignRandomKeys(shardsNumber)))
.apply(Window.into(FixedWindows.of(Duration.standardSeconds(WINDOW_DURATION_SECONDS))))
.apply(GroupIntoBatches.ofSize(BATCH_SIZE))
.apply(ParDo.of(new ExtractValues()));
}
/**
* Assigns each click a random integer key between zero and shardsNumber
*/
private static class AssignRandomKeys extends DoFn<ClickEvent, KV<Integer, ClickEvent>> {
private int shardsNumber;
private Random random;
AssignRandomKeys(int shardsNumber) {
super();
this.shardsNumber = shardsNumber;
}
@Setup
public void setup() {
random = new Random();
}
@ProcessElement
public void processElement(ProcessContext c) {
ClickEvent clickEvent = c.element();
KV<Integer, ClickEvent> kv = KV.of(random.nextInt(shardsNumber), clickEvent);
c.output(kv);
}
}
/**
* Extract values from KV
*/
private static class ExtractValues extends DoFn<KV<Integer, Iterable<ClickEvent>>, Iterable<ClickEvent>> {
@ProcessElement
public void processElement(ProcessContext c) {
KV<Integer, Iterable<ClickEvent>> kv = c.element();
c.output(kv.getValue());
}
}
}
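For reference, a minimal usage sketch, assuming an upstream PCollection<ClickEvent> named clicks:
// Apply the composite transform to turn individual clicks into batches.
PCollection<Iterable<ClickEvent>> batched =
    clicks.apply("BatchClicks", new ClickToClicksPack());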

Apache beam pipeline cannot be updated on Dataflow

I am running a Beam pipeline on Google Cloud Dataflow. However, the pipeline cannot be updated, even with exactly the same code. The pipeline graph is shown in the linked pipeline overview image, and the code is as follows:
import com.google.common.collect.Iterables;
import com.google.common.primitives.Ints;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.GenerateSequence;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;
import org.joda.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class PipelineTest {
private static final Logger logger = LoggerFactory.getLogger(PipelineTest.class);
public static void main(String[] args) {
int[] squares = new int[1000];
for (int i = 0; i < squares.length; i++) {
squares[i] = i * i;
}
PipelineOptions options = PipelineOptionsFactory.create();
Pipeline pipeline = Pipeline.create(options);
PCollection<Iterable<Integer>> sideInput =
pipeline.apply("Create", Create.<Iterable<Integer>>of(Ints.asList(shit)));
PCollectionView<Iterable<Integer>> view =
sideInput.apply("CreateSideInput", View.asSingleton());
PCollection<String> done =
pipeline
.apply(
"FakeData",
GenerateSequence.from(0).to(50_000).withRate(10, Duration.standardSeconds(1)))
.apply(
"Map1",
ParDo.of(
new DoFn<Long, String>() {
@ProcessElement
public void processElement(ProcessContext ctx) {
Long element = ctx.element();
Iterable<Integer> v = ctx.sideInput(view);
String out = "element " + element + ", value " + Iterables.size(v);
logger.info("MAP1: " + out);
ctx.output(out);
}
})
.withSideInputs(view))
.apply(
"Map2",
ParDo.of(
new DoFn<String, String>() {
@ProcessElement
public void processElement(ProcessContext ctx) {
String element = ctx.element();
Iterable<Integer> v = ctx.sideInput(view);
String out = "element " + element + ", value " + Iterables.size(v);
logger.info("MAP2: " + out);
ctx.output(out);
}
})
.withSideInputs(view));
pipeline.run();
}
}
I tried providing a default value for the view, as well as using two views. However, neither of them works. If the view is used in two independent transforms, the pipeline can be updated.
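As a general note on updating Dataflow jobs: the replacement job has to be submitted with the update flag and the same job name as the running job, and a transform name mapping can be supplied if step names changed between versions. A minimal sketch of setting this programmatically; the job name is illustrative, and the same can be done with the --update and --jobName command-line flags:
// Ask Dataflow to replace the running job of the same name,
// preserving state, instead of starting a new job.
DataflowPipelineOptions options =
    PipelineOptionsFactory.fromArgs(args).as(DataflowPipelineOptions.class);
options.setJobName("my-existing-job"); // illustrative
options.setUpdate(true);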

win:length(2) is fired after first event

I made a very simple test GUI based on this brilliant article about getting started with Esper.
What surprises me is that this query evaluates to true after the very first tick event is sent, if the price is above 6.
select * from StockTick(symbol='AAPL').win:length(2) having avg(price) > 6.0
As far as I understand, win:length(2) needs TWO ticks before an event is fired, or am I wrong?
Here is a SSCCE for this question, just press the "Create Tick Event" button and you will see the StockTick Event being fired at once.
It needs the following jars, which come bundled with Esper:
esper\lib\antlr-runtime-3.2.jar
esper\lib\cglib-nodep-2.2.jar
esper\lib\commons-logging-1.1.1.jar
esper\lib\esper_3rdparties.license
esper\lib\log4j-1.2.16.jar
esper-4.11.0.jar
import javax.swing.JFrame;
import javax.swing.JSplitPane;
import javax.swing.SwingUtilities;
import java.awt.BorderLayout;
import java.awt.Dimension;
import javax.swing.JButton;
import javax.swing.JScrollPane;
import java.awt.TextArea;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.util.Date;
import java.util.Random;
import javax.swing.JRadioButton;
import javax.swing.JPanel;
import java.awt.GridLayout;
import javax.swing.JTextArea;
import javax.swing.ScrollPaneConstants;
import com.espertech.esper.client.Configuration;
import com.espertech.esper.client.EPAdministrator;
import com.espertech.esper.client.EPRuntime;
import com.espertech.esper.client.EPServiceProvider;
import com.espertech.esper.client.EPServiceProviderManager;
import com.espertech.esper.client.EPStatement;
import com.espertech.esper.client.EventBean;
import com.espertech.esper.client.UpdateListener;
import javax.swing.JTextField;
public class Tester extends JFrame {
/**
*
*/
private static final long serialVersionUID = 1L;
JButton createRandomValueEventButton;
private JPanel panel;
private JPanel southPanel;
private JPanel centerPanel;
private static JTextArea centerTextArea;
private static JTextArea southTextArea;
private static Random generator = new Random();
private EPRuntime cepRT;
private JSplitPane textSplitPane;
private JButton btnNewButton;
private static JTextField priceTextField;
public Tester() {
getContentPane().setLayout(new BorderLayout(0, 0));
JSplitPane splitPane = new JSplitPane();
createRandomValueEventButton = new JButton("Create Tick Event With Random Price");
splitPane.setLeftComponent(createRandomValueEventButton);
createRandomValueEventButton.addActionListener(new ActionListener() {
@Override
public void actionPerformed(ActionEvent e) {
createTickWithRandomPrice();
}
});
panel = new JPanel();
splitPane.setRightComponent(panel);
panel.setLayout(new GridLayout(1, 0, 0, 0));
btnNewButton = new JButton("Create Tick Event");
panel.add(btnNewButton);
btnNewButton.addActionListener(new ActionListener() {
@Override
public void actionPerformed(ActionEvent e) {
createTick();
}
});
priceTextField = new JTextField();
priceTextField.setText(Integer.toString(10));
panel.add(priceTextField);
priceTextField.setColumns(4);
getContentPane().add(splitPane, BorderLayout.NORTH);
textSplitPane = new JSplitPane();
textSplitPane.setOrientation(JSplitPane.VERTICAL_SPLIT);
getContentPane().add(textSplitPane, BorderLayout.CENTER);
centerPanel = new JPanel();
centerPanel.setLayout(new BorderLayout(0, 0));
JScrollPane centerTextScrollPane = new JScrollPane();
centerTextScrollPane.setVerticalScrollBarPolicy(ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS);
centerTextArea = new JTextArea();
centerTextArea.setRows(12);
centerTextScrollPane.setViewportView(centerTextArea);
southPanel = new JPanel();
southPanel.setLayout(new BorderLayout(0, 0));
JScrollPane southTextScrollPane = new JScrollPane();
southTextScrollPane.setVerticalScrollBarPolicy(ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS);
southTextArea = new JTextArea();
southTextArea.setRows(5);
southTextScrollPane.setViewportView(southTextArea);
textSplitPane.setRightComponent(southTextScrollPane);
textSplitPane.setLeftComponent(centerTextScrollPane);
setupCEP();
}
public static void GenerateRandomTick(EPRuntime cepRT) {
double price = (double) generator.nextInt(10);
long timeStamp = System.currentTimeMillis();
String symbol = "AAPL";
Tick tick = new Tick(symbol, price, timeStamp);
System.out.println("Sending tick:" + tick);
centerTextArea.append(new Date().toString()+" Sending tick:" + tick+"\n");
cepRT.sendEvent(tick);
}
public static void GenerateTick(EPRuntime cepRT) {
double price = Double.parseDouble(priceTextField.getText());
long timeStamp = System.currentTimeMillis();
String symbol = "AAPL";
Tick tick = new Tick(symbol, price, timeStamp);
System.out.println("Sending tick:" + tick);
centerTextArea.append(new Date().toString()+" Sending tick: " + tick+"\n");
cepRT.sendEvent(tick);
}
public static void main(String[] args){
Tester tester = new Tester();
tester.setSize(new Dimension(570,500));
tester.setVisible(true);
}
private void createTickWithRandomPrice(){
SwingUtilities.invokeLater(new Runnable() {
public void run() {
GenerateRandomTick(getEPRuntime());
}
});
}
private void createTick(){
SwingUtilities.invokeLater(new Runnable() {
public void run() {
GenerateTick(getEPRuntime());
}
});
}
private void setupCEP(){
Configuration cepConfig = new Configuration();
cepConfig.addEventType("StockTick", Tick.class.getName());
EPServiceProvider cep = EPServiceProviderManager.getProvider("myCEPEngine", cepConfig);
cepRT = cep.getEPRuntime();
EPAdministrator cepAdm = cep.getEPAdministrator();
EPStatement cepStatement = cepAdm.createEPL(
"select * from " +
"StockTick(symbol='AAPL').win:length(2) " +
"having avg(price) > 6.0");
cepStatement.addListener(new CEPListener());
//System.out.println("cepStatement.getText(): "+cepStatement.getText());
}
private EPRuntime getEPRuntime(){
return cepRT;
}
public static class Tick {
String symbol;
Double price;
Date timeStamp;
public Tick(String s, double p, long t) {
symbol = s;
price = p;
timeStamp = new Date(t);
}
public double getPrice() {return price;}
public String getSymbol() {return symbol;}
public Date getTimeStamp() {return timeStamp;}
@Override
public String toString() {
return symbol+" Price: " + price.toString();
}
}
public static class CEPListener implements UpdateListener {
@Override
public void update(EventBean[] newData, EventBean[] oldData) {
// The listener body was lost when the post was copied; this is a minimal
// reconstruction that shows the fired event (UpdateListener requires update()).
southTextArea.append(new Date().toString() + " Event received: " + newData[0].getUnderlying() + "\n");
}
}
}
Actually, aggregation and conditions are independent of how many events are in the data window. There are functions you could use to check whether a data window is "filled": "leaving", "count", or "prevcount", for example.
For anyone interested,
changing the query to this solved the problem:
select * from StockTick(symbol='AAPL').win:length_batch(2) having avg(price) > 6.0 and count(*) >= 2
Now an event is triggered for every batch of two consecutive ticks whose average price is higher than 6. For example, two ticks priced 7 and 8 fill a batch with an average of 7.5, which fires one event.
