Reduce hanging when trying to sum up grouped values - project-reactor

I'm trying to use a Project Reactor chain to collect and group values and finally sum them up per group. The collection happens in two stages, both of which are blocking.
In a simplified example I'm able to reproduce the problem. First, I gather some generic data in createWrappers(), which reads data from the network (blocking calls); as data is retrieved, wrapper objects are emitted. In the second step, details are gathered from a different blocking network location and that information is added to the wrapper. Then the data gets transformed into a list of details, grouped by the detail key, and finally summed up per detail key. In the end a map should be produced which looks like this (values are specific to the test case):
key value
------------------
detail-0 1000
detail-1 2000
detail-2 3000
...
As soon as I add block() to the reduce() part, everything hangs in the sample code below:
import org.junit.jupiter.api.Test;
import reactor.core.publisher.Flux;
import reactor.core.publisher.FluxSink;
import reactor.core.scheduler.Schedulers;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
public class TestBlockingIssue
{
@Test
public void testBlockingMap()
{
final Flux<Wrapper> source = Flux.create( sink -> createWrappers( 1000, sink ) );
final Map<String, BigDecimal> block = source.parallel( 10 ).runOn( Schedulers.boundedElastic() )
.map( wrapper -> enhanceWrapper( wrapper, 100 ) )
.flatMap( wrapper -> Flux.fromIterable( wrapper.detailsList ) )
.sequential()
.groupBy( details -> details.detailKey )
.cache()
.collectMap( group -> group.key(), group -> group.reduce( new BigDecimal( 0 ), ( x, y ) -> x.add( y.value ) ).block() ).block();
System.out.println( block );
}
private Wrapper enhanceWrapper( final Wrapper wrapper, final int count )
{
for ( int i = 0; i < count; i++ )
{
wrapper.detailsList.add( new Details( "detail-" + i, new BigDecimal( i +1 ) ) );
}
return wrapper;
}
private void createWrappers( final int count, final FluxSink<Wrapper> sink )
{
for ( int i = 0; i < count; i++ )
{
sink.next( new Wrapper( "Wrapper-" + i ) );
}
sink.complete();
}
private class Details
{
final String detailKey;
final BigDecimal value;
private Details( final String detailKey, final BigDecimal value )
{
this.detailKey = detailKey;
this.value = value;
}
}
private class Wrapper
{
final String lookupKey;
final List<Details> detailsList = new ArrayList<>();
private Wrapper( final String lookupKey )
{
this.lookupKey = lookupKey;
}
}
}
How can I resolve the hanging chain, or what alternatives do I have for generating the map?

This occurs when groupBy is used with too many groups and the downstream isn't fast enough to consume them. In your sample you should not block inside collectMap; instead, consume each group before collecting, like this:
final Map<String, BigDecimal> block = source.parallel( 10 ).runOn( Schedulers.boundedElastic() )
.map( wrapper -> enhanceWrapper( wrapper, 100 ) )
.flatMap( wrapper -> Flux.fromIterable( wrapper.detailsList ) )
.sequential()
.groupBy( details -> details.detailKey )
.cache()
// Tuples / Tuple2 come from reactor.util.function
.flatMap(g -> g.reduce( new BigDecimal( 0 ), ( x, y ) -> x.add( y.value ) ).map(v -> Tuples.of(g.key(), v)))
.collectMap(Tuple2::getT1, Tuple2::getT2)
.block();
Now the downstream is fast enough, but you might need to adjust the flatMap concurrency depending on the number of groups, and you should make sure the number of groups stays reasonably low.
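One way to make that adjustment is the flatMap overload that takes an explicit concurrency argument, so that every group gets subscribed promptly. A minimal sketch of the same chain, where 512 is only an assumed upper bound on the number of distinct detail keys:
final Map<String, BigDecimal> block = source.parallel( 10 ).runOn( Schedulers.boundedElastic() )
.map( wrapper -> enhanceWrapper( wrapper, 100 ) )
.flatMap( wrapper -> Flux.fromIterable( wrapper.detailsList ) )
.sequential()
.groupBy( details -> details.detailKey )
// concurrency >= expected number of groups, otherwise some groups are never subscribed and the chain hangs
.flatMap( g -> g.reduce( new BigDecimal( 0 ), ( x, y ) -> x.add( y.value ) ).map( v -> Tuples.of( g.key(), v ) ), 512 )
.collectMap( Tuple2::getT1, Tuple2::getT2 )
.block();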

Related

How to change the final type after reduction of a downstream collector in a Java 8 stream?

I have a legacy application using data structures like those in the following toy snippet, and I can't easily change these data structures.
I use a Java 8 (only) stream to compute some stats, and I failed to get the desired result type using Collectors.
package myIssueWithCollector;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.BinaryOperator;
import java.util.stream.Collectors;
public class MyIssueWithCollector {
public static Double latitude(Map<String, String> map) {
String latitude = map.get("LATITUDE");
return Double.valueOf(latitude);
}
private static int latitudeComparator(double d1, double d2) {
// get around the fact that NaN > +Infinity in Double.compare()
if (Double.isNaN(d1) && !Double.isNaN(d2)) {
return -1;
}
if (!Double.isNaN(d1) && Double.isNaN(d2)) {
return 1;
}
return Double.compare(Math.abs(d1), Math.abs(d2));
}
public static Map<String, String> createMap(String city, String country, String continent, String latitude) {
Map<String, String> map = new HashMap<>();
map.put("CITY", city);
map.put("COUNTRY", country);
map.put("CONTINENT", continent);
map.put("LATITUDE", latitude);
return map;
}
public static void main(String[] args) {
// Cities with dummy latitudes
// I cannot easily change these legacy data structures
Map<String, String> map1 = createMap("London", "UK", "Europa", "48.1");
Map<String, String> map2 = createMap("New York", "USA", "America", "42.4");
Map<String, String> map3 = createMap("Miami", "USA", "America", "39.1");
Map<String, String> map4 = createMap("Glasgow", "UK", "Europa", "49.2");
Map<String, String> map5 = createMap("Camelot", "UK", "Europa", "NaN");
List<Map<String, String>> maps = new ArrayList<>(4);
maps.add(map1);
maps.add(map2);
maps.add(map3);
maps.add(map4);
maps.add(map5);
//////////////////////////////////////////////////////////////////
// My issue starts here:
//////////////////////////////////////////////////////////////////
Map<String, Map<String, Double>> result = maps.stream()
.collect(Collectors.groupingBy(m -> m.get("CONTINENT"),
Collectors.groupingBy(m -> m.get("COUNTRY"), Collectors.reducing(Double.NaN, m -> latitude(m),
BinaryOperator.maxBy((d1, d2) -> latitudeComparator(d1, d2))))));
System.out.println(result);
}
}
I need the result type to be
Map<String, Map<String, String>> instead of Map<String, Map<String, Double>>
by converting back "LATITUDE" from Double to String (using a custom format, not Double.toString() ).
I failed to achieve this with Collectors methods like andThen or collectingAndThen,...
I am currently stuck with Java 8.
Is there a way to get a Map<String, Map<String, String>> result using the same stream ?
Instead of using Collectors.reducing(…) with BinaryOperator.maxBy(…) you can also use Collectors.maxBy. Since this collector doesn’t support an identity value, it requires a finisher function to extract the value from an Optional, but your task requires a finisher anyway, to format the value.
Map<String, Map<String,String>> result = maps.stream()
.collect(Collectors.groupingBy(m -> m.get("CONTINENT"),
Collectors.groupingBy(m -> m.get("COUNTRY"),
Collectors.mapping(MyIssueWithCollector::latitude,
Collectors.collectingAndThen(
Collectors.maxBy(MyIssueWithCollector::latitudeComparator),
o -> format(o.get()))))));
This assumes format to be your custom format function like
private static String format(double d) {
return String.format("%.2f", d);
}
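With the sample data from the question, this yields a nested map along the lines of {America={USA=42.40}, Europa={UK=49.20}} (assuming the %.2f format above and a locale that uses '.' as the decimal separator; map iteration order may vary).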
But sometimes, it might be worthwhile to implement your own collector instead of combining multiple built-in collectors.
Map<String, Map<String,String>> result = maps.stream()
.collect(Collectors.groupingBy(m -> m.get("CONTINENT"),
Collectors.groupingBy(m -> m.get("COUNTRY"),
Collector.of(
() -> new double[]{Double.NEGATIVE_INFINITY},
(a, m) -> {
double d = latitude(m);
if(!Double.isNaN(d)) a[0] = Double.max(a[0], d);
},
(a, b) -> a[0] >= b[0]? a: b,
a -> format(a[0])))));
A collector maintains its state in a mutable container; this custom collector uses an array of length one so it can hold a double value (which eliminates the need to box it into Double objects). Instead of implementing a special comparator to treat NaN specially, it uses a conditional so that NaN never gets into the array in the first place. That's why the combiner doesn't need to care about NaN; it can simply return the larger of the two values.
The finisher function just invokes the custom format function with the double value.
You can use Collectors.collectingAndThen to convert the reduced double value to a corresponding String:
Map<String, Map<String, String>> result = maps.stream().collect(
Collectors.groupingBy(
m -> m.get("CONTINENT"),
Collectors.groupingBy(
m -> m.get("COUNTRY"),
Collectors.collectingAndThen(
Collectors.reducing(
Double.NaN,
m -> latitude(m),
BinaryOperator.maxBy(
(d1, d2) -> latitudeComparator(d1, d2)
)
),
MyIssueWithCollector::myToString
)
)
)
);
Here, myToString is some method in the MyIssueWithCollector class to return String from double with your custom format, for example,
public static String myToString(double d) {
return "[latitude=" + d + "]";
}
Using Collectors.reducing, you can keep the latitude as a String in the identity so that the downstream collector returns a String.
Map<String, Map<String, String>> result = maps.stream()
.collect(
Collectors.groupingBy(m -> m.get("CONTINENT"),
Collectors.groupingBy(m -> m.get("COUNTRY"),
Collectors.reducing("NaN", m -> m.get("LATITUDE"),
BinaryOperator.maxBy((s1, s2) -> latitudeComparator(Double.valueOf(s1), Double.valueOf(s2)))))));

How to combine different mono and use the combined result with error handling?

I have a scenario where I need to use several different Monos, each of which could return an error, and I want to set the corresponding map value to null if an error is returned.
Ex:
Mono<A> a=Some api call;
Mono<A> b=Some api giving error;
Mono<A> c=Some api call;
Now I want to put the resulting responses into a map:
Map<String,A> m=new HashMap<>();
m.put("a",a);
m.put("b",null);
m.put("c",c);
Can anyone help with how to do all of this in a reactive, non-blocking way?
I tried zip, but it will not execute if any of the APIs returns an error, or if I use onErrorReturn(null).
Thanks in advance
To solve this, you will have to use a few tricks. The problems are that:
- An empty Mono, or a Mono that ends in error, cancels the zip operation (source: Mono#zip javadoc).
- Reactive streams do not allow null values (source: Reactive Streams spec, table 2: Subscribers, rule 13).
Also, note that putting a null value into a HashMap simply overwrites any previous value associated with the key (which matters in case you're updating an existing map).
To work around this, you can add an abstraction layer and wrap your values in domain objects.
You can have one object that represents a query, another that represents a valid result, and a last one that mirrors an error.
With that, you can design publishers that always succeed with non-null values.
That's a technique used a lot in functional programming: common errors become part of the (one possible) result value.
Now, let's look at an example that creates a new Map from multiple Monos:
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;
import java.time.Duration;
import java.util.Map;
public class BypassMonoError {
/**
* An object identified by a key. It serves to track which key to associate to computed values
* @param <K> Type of the key
*/
static class Identified<K> {
protected final K id;
Identified(K id) {
this.id = id;
}
public K getId() {
return id;
}
}
/**
* Describe the result value of an operation, along with the key associated to it.
*
* @param <K> Type of the identifier of the result
* @param <V> Value type
*/
static abstract class Result<K, V> extends Identified<K> {
Result(K id) {
super(id);
}
/**
*
* @return Computed value on success, or null if the operation has failed. Note that here we cannot tell
* a success that returned null apart from an error.
*/
abstract V getOrNull();
}
static final class Success<K, V> extends Result<K, V> {
private final V value;
Success(K id, V value) {
super(id);
this.value = value;
}
@Override
V getOrNull() {
return value;
}
}
static final class Error<K, V> extends Result<K, V> {
private final Exception error;
Error(K id, Exception error) {
super(id);
this.error = error;
}
@Override
V getOrNull() {
return null;
}
public Exception getError() {
return error;
}
}
/**
* A request that can asynchronously generate a result for the associated identifier.
*/
static class Query<K, V> extends Identified<K> {
private final Mono<V> worker;
Query(K id, Mono<V> worker) {
super(id);
this.worker = worker;
}
/**
* @return The operator that computes the result value. Note that any error is silently wrapped in an
* {@link Error empty result with error metadata}.
*/
public Mono<Result<K, V>> runCatching() {
return worker.<Result<K, V>>map(success -> new Success<>(id, success))
.onErrorResume(Exception.class, error -> Mono.just(new Error<K, V>(id, error)));
}
}
public static void main(String[] args) {
final Flux<Query<String, String>> queries = Flux.just(
new Query("a", Mono.just("A")),
new Query("b", Mono.error(new Exception("B"))),
new Query("c", Mono.delay(Duration.ofSeconds(1)).map(v -> "C"))
);
final Flux<Result<String, String>> results = queries.flatMap(query -> query.runCatching());
final Map<String, String> myMap = results.collectMap(Result::getId, Result::getOrNull)
.block();
for (Map.Entry<String, String> entry : myMap.entrySet()) {
System.out.printf("%s -> %s%n", entry.getKey(), entry.getValue());
}
}
}
Note: in the above example, we silently ignore any error that occurs. However, when consuming the flux, you can test whether a result is an error, and if it is, you are free to design your own error handling (log, fail fast, send to another flux, etc.); a small sketch of that follows the example output below.
This outputs:
a -> A
b -> null
c -> C
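As a hedged sketch of that error handling (reusing the results flux and the Error class from the example above, inside the same main method), you could log failed queries instead of silently ignoring them:
// Sketch only: inspect the failed queries instead of discarding the error information
results.ofType(Error.class)
.subscribe(err -> System.err.println("Query " + err.getId() + " failed: " + err.getError()));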

How to Batch By N Elements in Streaming Pipeline With Small Bundles?

I've implemented batching by N elements as described in this answer:
Can datastore input in google dataflow pipeline be processed in a batch of N entries at a time?
package com.example.dataflow.transform;
import com.example.dataflow.event.ClickEvent;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.joda.time.Instant;
import java.util.ArrayList;
import java.util.List;
public class ClickToClicksPack extends DoFn<ClickEvent, List<ClickEvent>> {
public static final int BATCH_SIZE = 10;
private List<ClickEvent> accumulator;
@StartBundle
public void startBundle() {
accumulator = new ArrayList<>(BATCH_SIZE);
}
@ProcessElement
public void processElement(ProcessContext c) {
ClickEvent clickEvent = c.element();
accumulator.add(clickEvent);
if (accumulator.size() >= BATCH_SIZE) {
c.output(accumulator);
accumulator = new ArrayList<>(BATCH_SIZE);
}
}
@FinishBundle
public void finishBundle(FinishBundleContext c) {
if (accumulator.size() > 0) {
ClickEvent clickEvent = accumulator.get(0);
long time = clickEvent.getClickTimestamp().getTime();
c.output(accumulator, new Instant(time), GlobalWindow.INSTANCE);
}
}
}
But when I run the pipeline in streaming mode there are a lot of batches with just 1 or 2 elements. As I understand it, that's because of the small bundle sizes. After running for a day, the average number of elements per batch is roughly 4. I really need it to be closer to 10 for better performance of the next steps.
Is there a way to control bundles size?
Or should I use "GroupIntoBatches" transform for this purpose. In this case it's not clear for me, what should be selected as a key.
UPDATE:
Is it a good idea to use the Java thread id or the VM hostname as the key for the "GroupIntoBatches" transform?
I ended up writing a composite transform with "GroupIntoBatches" inside.
The following answer contains recommendations regarding key selection:
https://stackoverflow.com/a/44956702/4888849
In my current implementation I'm using random keys to achieve parallelism, and I'm windowing events in order to emit results regularly even if there are fewer than BATCH_SIZE events for a single key.
package com.example.dataflow.transform;
import com.example.dataflow.event.ClickEvent;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.GroupIntoBatches;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;
import java.util.Random;
/**
* Batch clicks into packs of BATCH_SIZE size
*/
public class ClickToClicksPack extends PTransform<PCollection<ClickEvent>, PCollection<Iterable<ClickEvent>>> {
public static final int BATCH_SIZE = 10;
// Define window duration.
// After window's end - elements are emitted even if there are less then BATCH_SIZE elements
public static final int WINDOW_DURATION_SECONDS = 1;
private static final int DEFAULT_SHARDS_NUMBER = 20;
// Determine possible parallelism level
private int shardsNumber = DEFAULT_SHARDS_NUMBER;
public ClickToClicksPack() {
super();
}
public ClickToClicksPack(int shardsNumber) {
super();
this.shardsNumber = shardsNumber;
}
@Override
public PCollection<Iterable<ClickEvent>> expand(PCollection<ClickEvent> input) {
return input
// assign keys, as "GroupIntoBatches" works only with key-value pairs
.apply(ParDo.of(new AssignRandomKeys(shardsNumber)))
.apply(Window.into(FixedWindows.of(Duration.standardSeconds(WINDOW_DURATION_SECONDS))))
.apply(GroupIntoBatches.ofSize(BATCH_SIZE))
.apply(ParDo.of(new ExtractValues()));
}
/**
* Assigns a random integer between zero and shardsNumber to each click
*/
private static class AssignRandomKeys extends DoFn<ClickEvent, KV<Integer, ClickEvent>> {
private int shardsNumber;
private Random random;
AssignRandomKeys(int shardsNumber) {
super();
this.shardsNumber = shardsNumber;
}
@Setup
public void setup() {
random = new Random();
}
@ProcessElement
public void processElement(ProcessContext c) {
ClickEvent clickEvent = c.element();
KV<Integer, ClickEvent> kv = KV.of(random.nextInt(shardsNumber), clickEvent);
c.output(kv);
}
}
/**
* Extract values from KV
*/
private static class ExtractValues extends DoFn<KV<Integer, Iterable<ClickEvent>>, Iterable<ClickEvent>> {
@ProcessElement
public void processElement(ProcessContext c) {
KV<Integer, Iterable<ClickEvent>> kv = c.element();
c.output(kv.getValue());
}
}
}
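For completeness, a minimal usage sketch of this composite transform (the clicks collection and the step name are only assumptions for illustration):
// "clicks" is assumed to be an existing PCollection<ClickEvent>
PCollection<Iterable<ClickEvent>> batches = clicks.apply("BatchClicks", new ClickToClicksPack());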

Recommendation Engine using Apache Spark MLlib showing zero recommendations after processing all operations

I am a newbie when it comes to implementing ML algorithms. I wanted to build a recommendation engine, and after a little experimenting learned that collaborative filtering can be used for this. I am using Apache Spark for it. I got help from one of the blogs and tried to implement the same locally. Below is the code that I tried out. Every time I execute it, the count of recommendations that gets printed is always zero. I don't see any evident error as such. Could someone please help me understand this? Also, please feel free to provide any other reference that can be consulted in this regard.
package mllib.example;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.mllib.recommendation.ALS;
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
import org.apache.spark.mllib.recommendation.Rating;
import scala.Tuple2;
public class RecommendationEngine {
public static void main(String[] args) {
// Create Java spark context
SparkConf conf = new SparkConf().setAppName("Recommendation System Example").setMaster("local[2]").set("spark.executor.memory","1g");
JavaSparkContext sc = new JavaSparkContext(conf);
// Read user-item rating file. format - userId,itemId,rating
JavaRDD<String> userItemRatingsFile = sc.textFile(args[0]);
System.out.println("Count is "+userItemRatingsFile.count());
// Read item description file. format - itemId, itemName, Other Fields,..
JavaRDD<String> itemDescritpionFile = sc.textFile(args[1]);
System.out.println("itemDescritpionFile Count is "+itemDescritpionFile.count());
// Map file to Ratings(user,item,rating) tuples
JavaRDD<Rating> ratings = userItemRatingsFile.map(new Function<String, Rating>() {
public Rating call(String s) {
String[] sarray = s.split(",");
return new Rating(Integer.parseInt(sarray[0]), Integer
.parseInt(sarray[1]), Double.parseDouble(sarray[2]));
}
});
System.out.println("Ratings RDD Object"+ratings.first().toString());
// Create tuples(itemId,ItemDescription), will be used later to get names of item from itemId
JavaPairRDD<Integer,String> itemDescritpion = itemDescritpionFile.mapToPair(
new PairFunction<String, Integer, String>() {
@Override
public Tuple2<Integer, String> call(String t) throws Exception {
String[] s = t.split(",");
return new Tuple2<Integer,String>(Integer.parseInt(s[0]), s[1]);
}
});
System.out.println("itemDescritpion RDD Object"+ratings.first().toString());
// Build the recommendation model using ALS
int rank = 10; // 10 latent factors
int numIterations = Integer.parseInt(args[2]); // number of iterations
MatrixFactorizationModel model = ALS.trainImplicit(JavaRDD.toRDD(ratings),
rank, numIterations);
//ALS.trainImplicit(arg0, arg1, arg2)
// Create user-item tuples from ratings
JavaRDD<Tuple2<Object, Object>> userProducts = ratings
.map(new Function<Rating, Tuple2<Object, Object>>() {
public Tuple2<Object, Object> call(Rating r) {
return new Tuple2<Object, Object>(r.user(), r.product());
}
});
// Calculate the itemIds not rated by a particular user, say user with userId = 1
JavaRDD<Integer> notRatedByUser = userProducts.filter(new Function<Tuple2<Object,Object>, Boolean>() {
@Override
public Boolean call(Tuple2<Object, Object> v1) throws Exception {
if (((Integer) v1._1).intValue() != 0) {
return true;
}
return false;
}
}).map(new Function<Tuple2<Object,Object>, Integer>() {
@Override
public Integer call(Tuple2<Object, Object> v1) throws Exception {
return (Integer) v1._2;
}
});
// Create user-item tuples for the items that are not rated by user, with user id 1
JavaRDD<Tuple2<Object, Object>> itemsNotRatedByUser = notRatedByUser
.map(new Function<Integer, Tuple2<Object, Object>>() {
public Tuple2<Object, Object> call(Integer r) {
return new Tuple2<Object, Object>(0, r);
}
});
// Predict the ratings of the items not rated by user for the user
JavaRDD<Rating> recomondations = model.predict(itemsNotRatedByUser.rdd()).toJavaRDD().distinct();
// Sort the recommendations by rating in descending order
recomondations = recomondations.sortBy(new Function<Rating,Double>(){
@Override
public Double call(Rating v1) throws Exception {
return v1.rating();
}
}, false, 1);
System.out.println("recomondations Total is "+recomondations.count());
// Get top 10 recommendations
JavaRDD<Rating> topRecomondations = sc.parallelize(recomondations.take(10));
// Join top 10 recommendations with item descriptions
JavaRDD<Tuple2<Rating, String>> recommendedItems = topRecomondations.mapToPair(
new PairFunction<Rating, Integer, Rating>() {
@Override
public Tuple2<Integer, Rating> call(Rating t) throws Exception {
return new Tuple2<Integer,Rating>(t.product(),t);
}
}).join(itemDescritpion).values();
System.out.println("recommendedItems count is "+recommendedItems.count());
//Print the top recommendations for user 1.
recommendedItems.foreach(new VoidFunction<Tuple2<Rating,String>>() {
@Override
public void call(Tuple2<Rating, String> t) throws Exception {
System.out.println(t._1.product() + "\t" + t._1.rating() + "\t" + t._2);
}
});
Also, I see that this job runs for a really long time and builds the model every time. Is there a way I can create the model once, persist it, and load it for subsequent predictions? Can we improve the execution speed of this job in any way?
Thanks in Advance
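On the persistence part of the question: MatrixFactorizationModel supports save and load, so the model can be trained once and reused for later predictions. A minimal sketch, assuming a writable path such as "/tmp/als-model" (the path is only an illustration):
// persist the trained model once ...
model.save(sc.sc(), "/tmp/als-model");
// ... and load it back later instead of retraining
MatrixFactorizationModel savedModel = MatrixFactorizationModel.load(sc.sc(), "/tmp/als-model");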

Stateful ParDo not working on Dataflow Runner

Based on the Javadocs and the blog post at https://beam.apache.org/blog/2017/02/13/stateful-processing.html, I tried a simple de-duplication example using the 2.0.0-beta-2 SDK, which reads a file from GCS (containing a list of JSONs, each with a user_id field) and then runs it through the pipeline explained below.
The input data contains about 146K events, of which only 50 are unique. The entire input is about 50MB, which should be processable in considerably less time than the 2-minute fixed window. I only placed a window there to make sure the per-key-per-window semantics hold without using a GlobalWindow. I run the windowed data through 3 parallel stages to compare the results, each of which is explained below.
1. Just copies the contents into a new file on GCS - this ensures all the events are processed as expected, and I verified the contents are exactly the same as the input.
2. Combine.PerKey on the user_id, picking only the first element from the Iterable - this essentially should deduplicate the data, and it works as expected. The resulting file has the exact number of unique items from the original list of events - 50 elements.
3. Stateful ParDo which checks whether the key has been seen already and emits an output only when it hasn't. Ideally, the result from this should match the deduplicated data from [2], but all I am seeing is 3 unique events. These 3 unique events always point to the same 3 user_ids across the few runs I did.
Interestingly, when I switch from the DataflowRunner to the DirectRunner and run the whole process locally, I see that the output from [3] matches [2], with exactly 50 unique elements as expected. So I suspect there is an issue with the DataflowRunner for the stateful ParDo.
public class StatefulParDoSample {
private static Logger logger = LoggerFactory.getLogger(StatefulParDoSample.class.getName());
static class StatefulDoFn extends DoFn<KV<String, String>, String> {
final Aggregator<Long, Long> processedElements = createAggregator("processed", Sum.ofLongs());
final Aggregator<Long, Long> skippedElements = createAggregator("skipped", Sum.ofLongs());
#StateId("keyTracker")
private final StateSpec<Object, ValueState<Integer>> keyTrackerSpec =
StateSpecs.value(VarIntCoder.of());
@ProcessElement
public void processElement(
ProcessContext context,
#StateId("keyTracker") ValueState<Integer> keyTracker) {
processedElements.addValue(1l);
final String userId = context.element().getKey();
int wasSeen = firstNonNull(keyTracker.read(), 0);
if (wasSeen == 0) {
keyTracker.write( 1);
context.output(context.element().getValue());
} else {
keyTracker.write(wasSeen + 1);
skippedElements.addValue(1l);
}
}
}
public static void main(String[] args) {
DataflowPipelineOptions pipelineOptions = PipelineOptionsFactory.create().as(DataflowPipelineOptions.class);
pipelineOptions.setRunner(DataflowRunner.class);
pipelineOptions.setProject("project-name");
pipelineOptions.setStagingLocation(GCS_STAGING_LOCATION);
pipelineOptions.setStreaming(false);
pipelineOptions.setAppName("deduper");
Pipeline p = Pipeline.create(pipelineOptions);
final ObjectMapper mapper = new ObjectMapper();
PCollection<KV<String, String>> keyedEvents =
p
.apply(TextIO.Read.from(GCS_SAMPLE_INPUT_FILE_PATH))
.apply(WithKeys.of(new SerializableFunction<String, String>() {
@Override
public String apply(String input) {
try {
Map<String, Object> eventJson =
mapper.readValue(input, Map.class);
return (String) eventJson.get("user_id");
} catch (Exception e) {
}
return "";
}
}))
.apply(
Window.into(
FixedWindows.of(Duration.standardMinutes(2))
)
);
keyedEvents
.apply(ParDo.of(new StatefulDoFn()))
.apply(TextIO.Write.to(GCS_SAMPLE_OUTPUT_FILE_PATH).withNumShards(1));
keyedEvents
.apply(Values.create())
.apply(TextIO.Write.to(GCS_SAMPLE_COPY_FILE_PATH).withNumShards(1));
keyedEvents
.apply(Combine.perKey(new SerializableFunction<Iterable<String>, String>() {
@Override
public String apply(Iterable<String> input) {
return !input.iterator().hasNext() ? "empty" : input.iterator().next();
}
}))
.apply(Values.create())
.apply(TextIO.Write.to(GCS_SAMPLE_COMBINE_FILE_PATH).withNumShards(1));
PipelineResult result = p.run();
result.waitUntilFinish();
}
}
This was a bug in the Dataflow service in batch mode, fixed in the upcoming 0.6.0 Beam release (or HEAD if you track the bleeding edge).
Thank you for bringing it to my attention! For reference, or if anything else comes up, this was tracked by BEAM-1611.
