I've tested a side input in a streaming pipeline with the DirectRunner and the DataflowRunner using this code:
public class Testsideinput {
private static final Logger LOG = LoggerFactory.getLogger(Testsideinput.class);
static class RefreshCache extends DoFn<Long, String> {
private static final long serialVersionUID = 1;
private static final Random RANDOM = new Random();
@ProcessElement
public void processElement(ProcessContext c) {
c.output("A"+c.element());
c.output("B"+c.element());
c.output("C"+c.element());
c.output("D"+c.element());
c.output("E"+c.element());
c.output("F"+c.element());
}
}
public static void main(String[] args) {
PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
Pipeline pipeline = Pipeline.create(options);
final PCollectionView<List<String>> sideInput2 =
pipeline.apply("TextIO", TextIO.read().from("<Put your gs://>))
.apply("viewTags", View.asList());
final PCollectionView<List<String>> sideInput =
pipeline.apply("GenerateSequence",
GenerateSequence
.from(0)
.withRate(1, Duration.standardSeconds(1)))
.apply("Window GenerateSequence",
Window.into(FixedWindows.of(Duration.standardSeconds(5))))
.apply("Counts", Combine.globally(Sum.ofLongs()).withoutDefaults())
.apply("RefreshCache", ParDo.of(new RefreshCache()))
.apply("viewTags", View.asList());
final PubsubIO.Read<PubsubMessage> pubsubRead =
PubsubIO.readMessages()
.withIdAttribute("id")
.withTimestampAttribute("ts")
.fromTopic("<put your topic>");
// PCollection<KV<String,Long>> taxi =;
PCollection<String> taxi =
pipeline.apply("Read from", pubsubRead)
.apply("Window Fixed",
Window.into(FixedWindows.of(Duration.standardSeconds(15))))
.apply(MapElements.via(new PubSubToTableRow()))
.apply("key rides by rideid",
MapElements
.into(TypeDescriptors
.kvs(TypeDescriptors.strings(),
TypeDescriptor.of(TableRow.class)))
.via(ride -> KV.of(ride.get("ride_id").toString(), ride)))
.apply("Count Per Element", Count.perKey())
.apply(
ParDo.of(new DoFn<KV<String,Long>, String>() {
@ProcessElement
public void processElement(
@Element KV<String,Long> value,
OutputReceiver<String> out, ProcessContext c) {
// In our DoFn, access the side input.
List<String> sideinput = c.sideInput(sideInput);
List<String> sideinput2 = c.sideInput(sideInput2);
LOG.info("sideinput" + sideinput.toString());
LOG.info("sideinput2 " + sideinput2.toString());
LOG.info("value " + value);
out.output("test");
}
}).withSideInputs(sideInput,sideInput2));
pipeline.run();
}
I get all the values of my side inputs (list and map) on the DirectRunner, but I get no values with the DataflowRunner (there is no output at the View.CreatePCollectionView/ParDo(StreamingPCollectionViewWriter) step).
Do you have an idea how to solve this?
I was trying to run SQL on two datasets in Google Cloud Storage using Apache Beam, following the Apache Beam documentation https://beam.apache.org/documentation/dsls/sql/walkthrough/
But I ended up with the exception below:
An exception occured while executing the Java class. org.apache.beam.sdk.transforms.MapElements
.via(Lorg/apache/beam/sdk/transforms/SimpleFunction;)Lorg/apache/beam/sdk/transforms/MapElements;
I tried changing the Beam SDK version and made other code changes, but none of them worked.
package com.nitesh.gcp.feature;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.extensions.sql.SqlTransform;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.sdk.values.TupleTag;
import java.util.stream.Collectors;
public class beamSQL1 {
public static final String EMPHEADER = "empno,ename,job,mgr,hiredate,sal,comm,deptno";
public static final String DEPTHEADER = "deptno,dname,location";
public static final Schema EMPSCHEMA = Schema.builder()
.addStringField("empno")
.addStringField("ename")
.addStringField("job")
.addStringField("mgr")
.addStringField("hiredate")
.addStringField("sal")
.addStringField("comm")
.addStringField("deptno")
.build();
public static final Schema DEPTSCHEMA = Schema.builder()
.addStringField("deptno")
.addStringField("dname")
.addStringField("location")
.build();
public static void main(String[] args) {
PipelineOptionsFactory.register(DataflowPipelineOptions.class);
DataflowPipelineOptions options = PipelineOptionsFactory
.fromArgs(args)
.withValidation()
.as(DataflowPipelineOptions.class);
Pipeline pipeline = Pipeline.create(options);
PCollection<String> employee = pipeline.apply("Read From GCS", TextIO.read().from("gs://amazon-test/sqlData/employee.txt"));
PCollection<String> department = pipeline.apply("Read From GCS", TextIO.read().from("gs://amazon-test/sqlData/department.txt"));
PCollection<Row> employeeRow = employee.apply("Transform To Row", ParDo.of(new RowParDo())).setRowSchema(EMPSCHEMA);
PCollection<Row> departmentRow = department.apply("Transform To Row", ParDo.of(new RowParDoForDept())).setRowSchema(DEPTSCHEMA);
PCollectionTuple output = PCollectionTuple.of(new TupleTag<>("emp"), employeeRow).and(new TupleTag<>("dept"), departmentRow);
output.apply(
SqlTransform.query(
// "SELECT emp.empno,emp.ename,dept.deptno,dept.dname FROM emp JOIN dept ON emp.deptno = dept.deptno"))
"SELECT * from emp JOIN dept ON emp.deptno = dept.deptno"))
/* p2.apply("Transform Sql", SqlTransform.query(
"SELECT * " +
"FROM PCOLLECTION order by sal desc LIMIT 14")
)*/
.apply("TransForm To String", ParDo.of(new RowToString()))
.apply("Write To GCS", TextIO.write().to("gs://amazon-test/sqlData/output/outputSql.csv").withoutSharding());
pipeline.run();
}
//ParDo for String -> Row (SQL)
public static class RowParDo extends DoFn<String, Row> {
@ProcessElement
public void processElement(ProcessContext c) {
if (!c.element().equalsIgnoreCase(EMPHEADER)) {
String[] vals = c.element().split(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)");
Row appRow = Row
.withSchema(EMPSCHEMA)
.addValues(vals[0], vals[1], vals[2], vals[3], vals[4], vals[5], vals[6], vals[7])
.build();
c.output(appRow);
}
}
}
//ParDo for Row (SQL) -> String
public static class RowToString extends DoFn<Row, String> {
@ProcessElement
public void processElement(ProcessContext c) {
String line = c.element().getValues()
.stream()
.map(Object::toString)
.collect(Collectors.joining(","));
c.output(line);
}
}
//ParDo for String -> Row (SQL)
public static class RowParDoForDept extends DoFn<String, Row> {
@ProcessElement
public void processElement(ProcessContext c) {
if (!c.element().equalsIgnoreCase(DEPTHEADER)) {
String[] vals = c.element().split(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)");
Row appRow = Row
.withSchema(DEPTSCHEMA)
.addValues(vals[0], vals[1], vals[2])
.build();
c.output(appRow);
}
}
}
}
I'm unit testing (with TestStream and PAssert) a DoFn that resets event-time timers. The test hangs forever if the DoFn resets timers, and this behavior seems specific to event-time timers.
Is this a bug in Beam's testing facilities or expected timer behavior?
Here is a toy example with which I can reproduce this behavior on the Beam 2.3 SDK.
static class KeyElements extends DoFn<String, KV<String, String>> {
@ProcessElement
public void processElement(ProcessContext context) {
final String[] parts = context.element().split(":");
if (parts.length == 2) {
context.output(KV.of(parts[0], parts[1]));
}
}
}
static class TimerDoFn extends DoFn<KV<String, String>, KV<String, String>> {
#TimerId("expiry")
private final TimerSpec timerSpec = TimerSpecs.timer(TimeDomain.EVENT_TIME);
#ProcessElement
public void processElement(ProcessContext context, #TimerId("expiry") Timer timer) {
timer.set(context.timestamp().plus(Duration.standardHours(1)));
final KV<String, String> e = context.element();
context.output(KV.of(e.getKey(), e.getValue() + "_output"));
}
@OnTimer("expiry")
public void onExpiry(OnTimerContext context) {
// do nothing
}
}
@Rule
public TestPipeline p = TestPipeline.create();
@Test
public void testTimerDoFn() {
TestStream<String> stream = TestStream
.create(StringUtf8Coder.of())
.addElements(
TimestampedValue.of("a:0", new Instant(0)),
TimestampedValue.of("a:1", new Instant(1)),
TimestampedValue.of("a:2", new Instant(2)),
TimestampedValue.of("a:3", new Instant(3)))
.advanceWatermarkToInfinity();
PCollection<KV<String, String>> result = p
.apply(stream)
.apply(ParDo.of(new KeyElements()))
.apply(ParDo.of(new TimerDoFn()));
PAssert.that(result).containsInAnyOrder(
KV.of("a", "0_output"),
KV.of("a", "1_output"),
KV.of("a", "2_output"),
KV.of("a", "3_output"));
p.run();
}
The above test would hang if the input elements were a:1, b:2, c:3, d:4 instead.
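For reference, this is what that failing variant's input looks like; only the elements (and, correspondingly, the PAssert expectations) change, while the rest of the test stays the same:
// Same pipeline as above, but with four distinct keys; per the report above,
// this variant hangs. The expected outputs would become "1_output" for key "a",
// "2_output" for key "b", and so on.
TestStream<String> multiKeyStream = TestStream
    .create(StringUtf8Coder.of())
    .addElements(
        TimestampedValue.of("a:1", new Instant(1)),
        TimestampedValue.of("b:2", new Instant(2)),
        TimestampedValue.of("c:3", new Instant(3)),
        TimestampedValue.of("d:4", new Instant(4)))
    .advanceWatermarkToInfinity();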
We found a very strange difference between Dataflow SDK 1.9 and 2.0/2.1 for a very simple pipeline.
We have a CoGroupByKey step that joins two PCollections by their keys and outputs two PCollections (via TupleTags). For instance, one PCollection may contain {"str1", "str2"} and the other may contain {"str3"}.
These two PCollections are written to GCS (at different locations), and their union (basically, the PCollection produced by applying Flatten on the two PCollections) would be used by subsequent steps in the pipeline. Using the previous example, we will store {"str1", "str2"} and {"str3"} in GCS under their respective locations, and the pipeline will further transform their union (the Flattened PCollection) {"str1", "str2", "str3"}, and so on.
In Dataflow SDK 1.9, that is exactly what is happening, and we've built our pipelines around this logic.
As we were slowly migrating to 2.0/2.1, we noticed that this behavior is no longer observed. Instead, all the steps downstream of the Flatten step run correctly and as expected, but the two PCollections being Flattened are no longer written to GCS, as if they were nonexistent. The steps do show up in the execution graph though, which is very strange to us.
We were able to reproduce this issue reliably so that we can share the data and code as an example.
We have two text files stored in GCS:
data1.txt:
k1,v1
k2,v2
data2.txt:
k2,w2
k3,w3
We will read these two files to create two PCollections, a PC for each file.
We'll parse each line to create KV<String, String> (so the keys are k1, k2, k3 in this example).
We then apply CoGroupByKey and produce PCollections to be output to GCS.
Two PCollections are produced after the CoGroupByKey step, depending on whether the number of values associated with each key is even or odd (it's a contrived example, but it is meant to demonstrate the issue we are experiencing).
So one of the PCs will contain the keys "k1" and "k3" (with some value strings appended to them, see the code below), as they have one value each, and the other will contain the single key "k2", as it has two values (one found in each file).
These two PCs are written to GCS at different locations, and the flattened PC of the two will also be written to GCS (but it could have been further transformed).
The three output files are expected to contain the following contents (rows may not be in order):
output1:
k2: [v2],(w2)
output2:
k3: (w3)
k1: [v1]
outputMerged:
k3: (w3)
k2: [v2],(w2)
k1: [v1]
This is exactly what we see (and expected) in Dataflow SDK 1.9.
In 2.0 and 2.1, however, output1 and output2 come out empty (and the TextIO steps are not even executed, as if no elements were input to them; we verified this by adding a dummy ParDo in between, and it's not invoked at all).
This makes us very curious as to why this behavior suddenly changed between 1.9 and 2.0/2.1, and what would be the best way for us to achieve what we have been doing with 1.9.
Specifically, we produce output1/2 for archiving purposes, while we flatten the two PCs to transform the data further and produce another output.
Here is Java code you can run (you will have to add the proper imports, change the bucket name, set the Options properly, etc.).
Working code for 1.9:
//Dataflow SDK 1.9 compatible.
public class TestJob {
public static void execute(Options options) {
Pipeline pipeline = Pipeline.create(options);
PCollection<KV<String, String>> data1 =
pipeline.apply(TextIO.Read.from(GcsPath.EXPERIMENT_BUCKET + "/data1.txt")).apply(ParDo.of(new doFn()));
PCollection<KV<String, String>> data2 =
pipeline.apply(TextIO.Read.from(GcsPath.EXPERIMENT_BUCKET + "/data2.txt")).apply(ParDo.of(new doFn()));
TupleTag<String> inputTag1 = new TupleTag<String>() {
private static final long serialVersionUID = 1L;
};
TupleTag<String> inputTag2 = new TupleTag<String>() {
private static final long serialVersionUID = 1L;
};
TupleTag<String> outputTag1 = new TupleTag<String>() {
private static final long serialVersionUID = 1L;
};
TupleTag<String> outputTag2 = new TupleTag<String>() {
private static final long serialVersionUID = 1L;
};
PCollectionTuple tuple = KeyedPCollectionTuple.of(inputTag1, data1).and(inputTag2, data2)
.apply(CoGroupByKey.<String>create()).apply(ParDo.of(new doFn2(inputTag1, inputTag2, outputTag2))
.withOutputTags(outputTag1, TupleTagList.of(outputTag2)));
PCollection<String> output1 = tuple.get(outputTag1);
PCollection<String> output2 = tuple.get(outputTag2);
PCollection<String> outputMerged = PCollectionList.of(output1).and(output2).apply(Flatten.<String>pCollections());
outputMerged.apply(TextIO.Write.to(GcsPath.EXPERIMENT_BUCKET + "/test-job-1.9/outputMerged").withNumShards(1));
output1.apply(TextIO.Write.to(GcsPath.EXPERIMENT_BUCKET + "/test-job-1.9/output1").withNumShards(1));
output2.apply(TextIO.Write.to(GcsPath.EXPERIMENT_BUCKET + "/test-job-1.9/output2").withNumShards(1));
pipeline.run();
}
static class doFn2 extends DoFn<KV<String, CoGbkResult>, String> {
private static final long serialVersionUID = 1L;
final TupleTag<String> inputTag1;
final TupleTag<String> inputTag2;
final TupleTag<String> outputTag2;
public doFn2(TupleTag<String> inputTag1, TupleTag<String> inputTag2, TupleTag<String> outputTag2) {
this.inputTag1 = inputTag1;
this.inputTag2 = inputTag2;
this.outputTag2 = outputTag2;
}
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
String key = c.element().getKey();
List<String> values = new ArrayList<String>();
int numValues = 0;
for (String val1 : c.element().getValue().getAll(inputTag1)) {
values.add(String.format("[%s]", val1));
numValues++;
}
for (String val2 : c.element().getValue().getAll(inputTag2)) {
values.add(String.format("(%s)", val2));
numValues++;
}
final String line = String.format("%s: %s", key, Joiner.on(",").join(values));
if (numValues % 2 == 0) {
c.output(line);
} else {
c.sideOutput(outputTag2, line);
}
}
}
static class doFn extends DoFn<String, KV<String, String>> {
private static final long serialVersionUID = 1L;
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
String[] tokens = c.element().split(",");
c.output(KV.of(tokens[0], tokens[1]));
}
}
}
Code for 2.0/2.1 (compiles and runs, but exhibits the issue described above):
// Dataflow SDK 2.0 and 2.1 compatible.
public class TestJob {
public static void execute(Options options) {
Pipeline pipeline = Pipeline.create(options);
PCollection<KV<String, String>> data1 =
pipeline.apply(TextIO.read().from(GcsPath.EXPERIMENT_BUCKET + "/data1.txt")).apply(ParDo.of(new doFn()));
PCollection<KV<String, String>> data2 =
pipeline.apply(TextIO.read().from(GcsPath.EXPERIMENT_BUCKET + "/data2.txt")).apply(ParDo.of(new doFn()));
TupleTag<String> inputTag1 = new TupleTag<String>() {
private static final long serialVersionUID = 1L;
};
TupleTag<String> inputTag2 = new TupleTag<String>() {
private static final long serialVersionUID = 1L;
};
TupleTag<String> outputTag1 = new TupleTag<String>() {
private static final long serialVersionUID = 1L;
};
TupleTag<String> outputTag2 = new TupleTag<String>() {
private static final long serialVersionUID = 1L;
};
PCollectionTuple tuple = KeyedPCollectionTuple.of(inputTag1, data1).and(inputTag2, data2)
.apply(CoGroupByKey.<String>create()).apply(ParDo.of(new doFn2(inputTag1, inputTag2, outputTag2))
.withOutputTags(outputTag1, TupleTagList.of(outputTag2)));
PCollection<String> output1 = tuple.get(outputTag1);
PCollection<String> output2 = tuple.get(outputTag2);
PCollection<String> outputMerged = PCollectionList.of(output1).and(output2).apply(Flatten.<String>pCollections());
outputMerged.apply(TextIO.write().to(GcsPath.EXPERIMENT_BUCKET + "/test-job-2.1/outputMerged").withNumShards(1));
output1.apply(TextIO.write().to(GcsPath.EXPERIMENT_BUCKET + "/test-job-2.1/output1").withNumShards(1));
output2.apply(TextIO.write().to(GcsPath.EXPERIMENT_BUCKET + "/test-job-2.1/output2").withNumShards(1));
PipelineResult pipelineResult = pipeline.run();
pipelineResult.waitUntilFinish();
}
static class doFn2 extends DoFn<KV<String, CoGbkResult>, String> {
private static final long serialVersionUID = 1L;
final TupleTag<String> inputTag1;
final TupleTag<String> inputTag2;
final TupleTag<String> outputTag2;
public doFn2(TupleTag<String> inputTag1, TupleTag<String> inputTag2, TupleTag<String> outputTag2) {
this.inputTag1 = inputTag1;
this.inputTag2 = inputTag2;
this.outputTag2 = outputTag2;
}
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
String key = c.element().getKey();
List<String> values = new ArrayList<String>();
int numValues = 0;
for (String val1 : c.element().getValue().getAll(inputTag1)) {
values.add(String.format("[%s]", val1));
numValues++;
}
for (String val2 : c.element().getValue().getAll(inputTag2)) {
values.add(String.format("(%s)", val2));
numValues++;
}
final String line = String.format("%s: %s", key, Joiner.on(",").join(values));
if (numValues % 2 == 0) {
c.output(line);
} else {
c.output(outputTag2, line);
}
}
}
static class doFn extends DoFn<String, KV<String, String>> {
private static final long serialVersionUID = 1L;
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
String[] tokens = c.element().split(",");
c.output(KV.of(tokens[0], tokens[1]));
}
}
}
Also, in case it is useful, the execution graph looks like this.
(And for Google engineers, Job IDs are also specified).
With 1.9 (job id 2017-09-29_14_35_42-15149127992051688457):
With 2.1 (job id 2017-09-29_14_31_59-991964669451027883):
TextIO.Write 2 and 3 are not producing any output under 2.0/2.1.
Flatten and its subsequent step work fine.
This is indeed a defect. A fix is in flight and should be documented as available in the Service Release Notes.
A workaround in the meantime is to use the 1.9.1 SDK, as this error only affects 2.x SDKs.
Users interested in picking up the fix early can also use the latest nightly build from Beam (recommended to unblock development, not for production, since it's a daily build). Instructions here.
I'm having trouble creating a Map PCollectionView with the DataflowRunner.
The pipeline below aggregates an unbounded CountingInput together with values from a side input (containing 10 generated values).
When running the pipeline on GCP, it gets stuck inside the View.asMap() transform.
More specifically, the ParDo(StreamingPCollectionViewWriter) does not have any output.
I tried this with Dataflow 2.0.0-beta3 as well as with beam-0.7.0-SNAPSHOT, without any result. Note that my pipeline runs without any problem when using the local DirectRunner.
Am I doing something wrong?
All help is appreciated, thanks in advance for helping me out!
public class SimpleSideInputPipeline {
private static final Logger LOG = LoggerFactory.getLogger(SimpleSideInputPipeline.class);
public interface Options extends DataflowPipelineOptions {}
public static void main(String[] args) throws IOException {
Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
Pipeline pipeline = Pipeline.create(options);
final PCollectionView<Map<Integer, String>> sideInput = pipeline
.apply(CountingInput.forSubrange(0L, 10L))
.apply("Create KV<Integer, String>",ParDo.of(new DoFn<Long, KV<Integer, String>>() {
@ProcessElement
public void processElement(ProcessContext c) {
c.output(KV.of(c.element().intValue(), "TEST"));
}
}))
.apply(View.asMap());
pipeline
.apply(CountingInput.unbounded().withRate(1, Duration.standardSeconds(5)))
.apply("Aggregate with side-input",ParDo.of(new DoFn<Long, KV<Long, String>>() {
@ProcessElement
public void processElement(ProcessContext c) {
Map<Integer, String> map = c.sideInput(sideInput);
//get first segment from map
Object[] values = map.values().toArray();
String firstVal = (String) values[0];
LOG.info("Combined: K: "+ c.element() + " V: " + firstVal + " MapSize: " + map.size());
c.output(KV.of(c.element(), firstVal));
}
}).withSideInputs(sideInput));
pipeline.run();
}
}
No need to worry that the ParDo(StreamingPCollectionViewWriterFn) does not record any output - what it does is actually write each element to an internal location.
Your code looks OK to me, and this should be investigated. I have filed BEAM-2155.
When trying to run a large transform on ~800,000 files, I get the above error message when running the pipeline.
Here is the code:
public static void main(String[] args) {
Pipeline p = Pipeline.create(
PipelineOptionsFactory.fromArgs(args).withValidation().create());
GcsUtil u = getUtil(p.getOptions());
try{
List<GcsPath> paths = u.expand(GcsPath.fromUri("gs://tlogdataflow/stage/*.zip"));
List<String> strPaths = new ArrayList<String>();
for(GcsPath pa: paths){
strPaths.add(pa.toUri().toString());
}
p.apply(Create.of(strPaths))
.apply("Unzip Files", Write.to(new ZipIO.Sink("gs://tlogdataflow/outbox")));
p.run();
}
catch(IOException io){
//
}
}
I thought that's exactly what Google Cloud Dataflow is for: handling large amounts of files/data?
Is there a way to split the load in order to make it work?
Thanks & BR
Phil
Dataflow is good at handling large amounts of data, but has limitations in terms of how large the description of the pipeline can be. Data passed to Create.of() is currently embedded in the pipeline description, so you can't pass very large amounts of data there - instead, large amounts of data should be read from external storage, and the pipeline should specify only their locations.
Think of it as the distinction between the amount of data a program can process vs. the size of the program's code itself.
You can get around this issue by making the expansion happen in a ParDo:
p.apply(Create.of("gs://tlogdataflow/stage/*.zip"))
.apply(ParDo.of(new ExpandFn()))
.apply(...fusion break (see below)...)
.apply(Write.to(new ZipIO.Sink("gs://tlogdataflow/outbox")))
where ExpandFn is something like the following:
private static class ExpandFn extends DoFn<String, String> {
@ProcessElement
public void process(ProcessContext c) throws Exception {
GcsUtil util = getUtil(c.getPipelineOptions());
// GcsUtil.expand returns GcsPaths; convert them back to string URIs.
for (GcsPath path : util.expand(GcsPath.fromUri(c.element()))) {
c.output(path.toUri().toString());
}
}
}
and by fusion break I'm referring to the following (basically, ParDo(add unique key) + GroupByKey + Values.create() + Flatten.iterables()). It's not very convenient, and there are discussions happening about adding a built-in transform to do this (see this PR and this thread).
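To make this concrete, here is a minimal sketch of such a fusion break wired into the pipeline above. The "Add random key" step and its anonymous DoFn are purely illustrative (there is no built-in transform for this), and the snippet assumes java.util.concurrent.ThreadLocalRandom is imported alongside the classes already used above:
p.apply(Create.of("gs://tlogdataflow/stage/*.zip"))
    .apply(ParDo.of(new ExpandFn()))
    // Fusion break: attach an arbitrary key, group by it, then unwrap the
    // grouped values back into individual elements.
    .apply("Add random key", ParDo.of(new DoFn<String, KV<Integer, String>>() {
      @ProcessElement
      public void processElement(ProcessContext c) {
        c.output(KV.of(ThreadLocalRandom.current().nextInt(), c.element()));
      }
    }))
    .apply(GroupByKey.<Integer, String>create())
    .apply(Values.<Iterable<String>>create())
    .apply(Flatten.<String>iterables())
    .apply("Unzip Files", Write.to(new ZipIO.Sink("gs://tlogdataflow/outbox")));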
Thank you very much! Using your input I solved it like this:
public class ZipPipeline {
private static final Logger LOG = LoggerFactory.getLogger(ZipPipeline.class);
public static void main(String[] args) {
Pipeline p = Pipeline.create(
PipelineOptionsFactory.fromArgs(args).withValidation().create());
try{
p.apply(Create.of("gs://tlogdataflow/stage/*.zip"))
.apply(ParDo.of(new ExpandFN()))
.apply(ParDo.of(new AddKeyFN()))
.apply(GroupByKey.<String,String>create())
.apply(ParDo.of(new FlattenFN()))
.apply("Unzip Files", Write.to(new ZipIO.Sink("gs://tlogdataflow/outbox")));
p.run();
}
catch(Exception e){
LOG.error(e.getMessage());
}
}
private static class FlattenFN extends DoFn<KV<String,Iterable<String>>, String>{
private static final long serialVersionUID = 1L;
@Override
public void processElement(ProcessContext c){
KV<String,Iterable<String>> kv = c.element();
for(String s: kv.getValue()){
c.output(s);
}
}
}
private static class ExpandFN extends DoFn<String,String>{
private static final long serialVersionUID = 1L;
@Override
public void processElement(ProcessContext c) throws Exception{
GcsUtil u = getUtil(c.getPipelineOptions());
for(GcsPath path : u.expand(GcsPath.fromUri(c.element()))){
c.output(path.toUri().toString());
}
}
}
private static class AddKeyFN extends DoFn<String, KV<String,String>>{
private static final long serialVersionUID = 1L;
@Override
public void processElement(ProcessContext c){
String path = c.element();
String monthKey = path.split("_")[4].substring(0, 6);
c.output(KV.of(monthKey, path));
}
}
}