I am trying to write to Spanner from a Dataflow streaming job using
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-sdks-java-io-google-cloud-platform</artifactId>
<version>2.18.0</version>
</dependency>
After mapping the data to a PCollection<Mutation>, I write the mutations to Spanner via SpannerIO.write:
Pipeline pipeline = Pipeline.create(options);
PCollection<Mutation> mutations = pipeline.apply...
mutations.apply("WriteMutations", SpannerIO.write()
.withInstanceId(INSTANCE_ID)
.withDatabaseId(DATABASE_ID)
);
pipeline.run();
However, it throws
java.lang.IllegalStateException: Sorter should be null here
at org.apache.beam.sdk.io.gcp.spanner.SpannerIO$GatherBundleAndSortFn.startBundle (SpannerIO.java:1080)
What would be the cause of this exception?
The following pipeline reproduces the exception. I tested it with 20 workers, but the issue appears to be independent of the data load.
import com.google.cloud.spanner.Mutation;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage;
import org.apache.beam.sdk.io.gcp.spanner.SpannerIO;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.Repeatedly;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.joda.time.Duration;
import java.util.UUID;
public final class TestPipeline {
private static final Duration WINDOW_DURATION = Duration.standardSeconds(1);
private static final String DATABASE_ID = "test";
private static final String INSTANCE_ID = "test-spanner";
private static final String TEST_TABLE = "test";
public static void main(String[] args) {
TestPipelineOptions options = PipelineOptionsFactory
.fromArgs(args)
.withValidation()
.as(TestPipelineOptions.class);
Pipeline pipeline = Pipeline.create(options);
pipeline
.apply("Read pubsub", PubsubIO.readMessagesWithAttributes()
.fromSubscription(options.getInputSubscription()))
.apply("Parse message", ParDo.of(new ProcessMessage()))
.apply("Windowing", Window.<Mutation>into(new GlobalWindows())
.triggering(Repeatedly.forever(
AfterProcessingTime.pastFirstElementInPane()
.plusDelayOf(WINDOW_DURATION)))
.withAllowedLateness(Duration.ZERO)
.discardingFiredPanes())
.apply("Write mutations", SpannerIO.write()
.withInstanceId(INSTANCE_ID)
.withDatabaseId(DATABASE_ID)
);
pipeline.run();
}
private static class ProcessMessage extends DoFn<PubsubMessage, Mutation> {
@ProcessElement
public void processElement(@Element final PubsubMessage message,
final OutputReceiver<Mutation> out) {
out.output(Mutation.newInsertOrUpdateBuilder(TEST_TABLE)
.set("id").to(UUID.randomUUID().toString())
.set("string").to("test")
.set("count").to(Long.MAX_VALUE)
.build()
);
}
}
interface TestPipelineOptions extends DataflowPipelineOptions {
void setInputSubscription(String inputSubscription);
@Description("Google Pubsub subscription id.")
String getInputSubscription();
}
}
Table definition:
CREATE TABLE test (id STRING(50) NOT NULL, string STRING(50) NOT NULL, count INT64) PRIMARY KEY (id);
This issue seems to occur with Apache Beam version 2.18 but not with version 2.17.
The issue in Beam 2.18 is tracked here: https://issues.apache.org/jira/browse/BEAM-9505
I'm implementing the CombinePerKeyExample using a subclass of CombineFn instead of using an implementation of SerializableFunction.
package me.examples;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.transforms.Combine.CombineFn;
import java.util.HashSet;
import java.util.Set;
public class ConcatWordsCombineFn extends CombineFn<String, ConcatWordsCombineFn.Accumulator, String> {
@DefaultCoder(AvroCoder.class)
public static class Accumulator{
HashSet<String> plays;
}
@Override
public Accumulator createAccumulator(){
Accumulator accumulator = new Accumulator();
accumulator.plays = new HashSet<>();
return accumulator;
}
@Override
public Accumulator addInput(Accumulator accumulator, String input){
accumulator.plays.add(input);
return accumulator;
}
@Override
public Accumulator mergeAccumulators(Iterable<Accumulator> accumulators){
Accumulator mergeAccumulator = new Accumulator();
mergeAccumulator.plays = new HashSet<>();
for(Accumulator accumulator: accumulators){
mergeAccumulator.plays.addAll(accumulator.plays);
}
return mergeAccumulator;
}
@Override
public String extractOutput(Accumulator accumulator){
return String.join(",", accumulator.plays);
}
}
The pipeline is composed of ReadFromBigQuery, ExtractAllPlaysOfWords (code below), and WriteToBigQuery.
package me.examples;
import com.google.api.services.bigquery.model.TableRow;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
public class PlaysForWord extends PTransform<PCollection<TableRow>, PCollection<TableRow>> {
@Override
public PCollection<TableRow> expand(PCollection<TableRow> input) {
PCollection<KV<String, String>> largeWords = input.apply("ExtractLargeWords", ParDo.of(new ExtractLargeWordsFn()));
//PCollection<KV<String, String>> wordNPlays = largeWords.apply("CombinePlays", Combine.perKey(new ConcatWordsCombineFunction()));
//using CombineFn instead
PCollection<KV<String, String>> wordNPlays = largeWords.apply("CombinePlays",Combine.perKey(new ConcatWordsCombineFn()));
wordNPlays.setCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()));
PCollection<TableRow> rows = wordNPlays.apply("FormatToRow", ParDo.of(new FormatShakespeareOutputFn()));
return rows;
}
}
If I don't add this line in the code above
wordNPlays.setCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()));
I get the following exception:
Exception in thread "main" java.lang.IllegalStateException: Unable to return a default Coder for ExtractAllPlaysOfWords/CombinePlays/Combine.GroupedValues/ParDo(Anonymous)/ParMultiDo(Anonymous).output [PCollection]. Correct one of the following root causes:
No Coder has been manually specified; you may do so using .setCoder().
Inferring a Coder from the CoderRegistry failed: Cannot provide coder for parameterized type org.apache.beam.sdk.values.KV<K, OutputT>: Unable to provide a Coder for K.
Building a Coder using a registered CoderProvider failed.
See suppressed exceptions for detailed failures.
Using the default output Coder from the producing PTransform failed: PTransform.getOutputCoder called.
at org.apache.beam.vendor.guava.v20_0.com.google.common.base.Preconditions.checkState(Preconditions.java:444)
at org.apache.beam.sdk.values.PCollection.getCoder(PCollection.java:278)
at org.apache.beam.sdk.values.PCollection.finishSpecifying(PCollection.java:115)
at org.apache.beam.sdk.runners.TransformHierarchy.finishSpecifyingInput(TransformHierarchy.java:191)
at org.apache.beam.sdk.Pipeline.applyInternal(Pipeline.java:536)
at org.apache.beam.sdk.Pipeline.applyTransform(Pipeline.java:488)
at org.apache.beam.sdk.values.PCollection.apply(PCollection.java:370)
at me.examples.PlaysForWord.expand(PlaysForWord.java:21)
at me.examples.PlaysForWord.expand(PlaysForWord.java:10)
at org.apache.beam.sdk.Pipeline.applyInternal(Pipeline.java:537)
at org.apache.beam.sdk.Pipeline.applyTransform(Pipeline.java:488)
at org.apache.beam.sdk.values.PCollection.apply(PCollection.java:370)
at me.examples.Main.main(Main.java:41)
From the stack trace, I think the pipeline is not able to get a coder for the String types of the KV object. Why is that? Isn't String supposed to be a "known" type for Apache Beam? And why does it work without specifying the coder when using a subclass of SerializableFunction in Combine.perKey?
In addition, when I try to get the default coder for String from the coder registry, I do get StringUtf8Coder:
Coder coder = null;
try {
coder = pipeline.getCoderRegistry().getCoder(String.class);
logger.info("coder is " + coder);
} catch (Exception e){
logger.info("exception "+ e.getMessage() +"\n coder is " + coder );
}
/*result
INFO: coder is StringUtf8Coder
*/
I used Apache Beam 2.12.0 and ran the pipeline on Google Dataflow.
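As an aside on the coder question: Beam's CombineFn also exposes a hook for declaring the output coder on the function itself, so the explicit setCoder call on the combined PCollection is not the only option. Below is a minimal, hedged sketch of that variant (the class name ConcatWordsWithCoderFn is made up for illustration); it does not by itself explain why inference fails for the key side of the KV.
package me.examples;
import java.util.HashSet;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.CannotProvideCoderException;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.CoderRegistry;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.transforms.Combine.CombineFn;
public class ConcatWordsWithCoderFn extends CombineFn<String, ConcatWordsWithCoderFn.Accumulator, String> {
    @DefaultCoder(AvroCoder.class)
    public static class Accumulator {
        HashSet<String> plays = new HashSet<>();
    }
    @Override
    public Accumulator createAccumulator() {
        return new Accumulator();
    }
    @Override
    public Accumulator addInput(Accumulator accumulator, String input) {
        accumulator.plays.add(input);
        return accumulator;
    }
    @Override
    public Accumulator mergeAccumulators(Iterable<Accumulator> accumulators) {
        // Union all partial sets into a fresh accumulator
        Accumulator merged = new Accumulator();
        for (Accumulator accumulator : accumulators) {
            merged.plays.addAll(accumulator.plays);
        }
        return merged;
    }
    @Override
    public String extractOutput(Accumulator accumulator) {
        return String.join(",", accumulator.plays);
    }
    // Declaring the output coder here means the values produced by
    // Combine.perKey(new ConcatWordsWithCoderFn()) get their coder from
    // the CombineFn instead of relying on inference.
    @Override
    public Coder<String> getDefaultOutputCoder(CoderRegistry registry, Coder<String> inputCoder)
            throws CannotProvideCoderException {
        return StringUtf8Coder.of();
    }
}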
I was trying to run SQL on two datasets in Google Cloud Storage using Apache Beam, following the Apache Beam SQL walkthrough: https://beam.apache.org/documentation/dsls/sql/walkthrough/
But I ended up with the exception below:
An exception occured while executing the Java class. org.apache.beam.sdk.transforms.MapElements
.via(Lorg/apache/beam/sdk/transforms/SimpleFunction;)Lorg/apache/beam/sdk/transforms/MapElements;
I tried changing the Beam SDK version and made other code changes, but none of them worked.
package com.nitesh.gcp.feature;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.extensions.sql.SqlTransform;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.sdk.values.TupleTag;
import java.util.stream.Collectors;
public class beamSQL1 {
public static final String EMPHEADER = "empno,ename,job,mgr,hiredate,sal,comm,deptno";
public static final String DEPTHEADER = "deptno,dname,location";
public static final Schema EMPSCHEMA = Schema.builder()
.addStringField("empno")
.addStringField("ename")
.addStringField("job")
.addStringField("mgr")
.addStringField("hiredate")
.addStringField("sal")
.addStringField("comm")
.addStringField("deptno")
.build();
public static final Schema DEPTSCHEMA = Schema.builder()
.addStringField("deptno")
.addStringField("dname")
.addStringField("location")
.build();
public static void main(String[] args) {
PipelineOptionsFactory.register(DataflowPipelineOptions.class);
DataflowPipelineOptions options = PipelineOptionsFactory
.fromArgs(args)
.withValidation()
.as(DataflowPipelineOptions.class);
Pipeline pipeline = Pipeline.create(options);
PCollection<String> employee = pipeline.apply("Read From GCS", TextIO.read().from("gs://amazon-test/sqlData/employee.txt"));
PCollection<String> department = pipeline.apply("Read From GCS", TextIO.read().from("gs://amazon-test/sqlData/department.txt"));
PCollection<Row> employeeRow = employee.apply("Transform To Row", ParDo.of(new RowParDo())).setRowSchema(EMPSCHEMA);
PCollection<Row> departmentRow = department.apply("Transform To Row", ParDo.of(new RowParDoForDept())).setRowSchema(DEPTSCHEMA);
PCollectionTuple output = PCollectionTuple.of(new TupleTag<>("emp"), employeeRow).and(new TupleTag<>("dept"), departmentRow);
output.apply(
SqlTransform.query(
// "SELECT emp.empno,emp.ename,dept.deptno,dept.dname FROM emp JOIN dept ON emp.deptno = dept.deptno"))
"SELECT * from emp JOIN dept ON emp.deptno = dept.deptno"))
/* p2.apply("Transform Sql", SqlTransform.query(
"SELECT * " +
"FROM PCOLLECTION order by sal desc LIMIT 14")
)*/
.apply("TransForm To String", ParDo.of(new RowToString()))
.apply("Write To GCS", TextIO.write().to("gs://amazon-test/sqlData/output/outputSql.csv").withoutSharding());
pipeline.run();
}
//ParDo for String -> Row (SQL)
public static class RowParDo extends DoFn<String, Row> {
@ProcessElement
public void processElement(ProcessContext c) {
if (!c.element().equalsIgnoreCase(EMPHEADER)) {
String[] vals = c.element().split(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)");
Row appRow = Row
.withSchema(EMPSCHEMA)
.addValues(vals[0], vals[1], vals[2], vals[3], vals[4], vals[5], vals[6], vals[7])
.build();
c.output(appRow);
}
}
}
//ParDo for Row (SQL) -> String
public static class RowToString extends DoFn<Row, String> {
@ProcessElement
public void processElement(ProcessContext c) {
String line = c.element().getValues()
.stream()
.map(Object::toString)
.collect(Collectors.joining(","));
c.output(line);
}
}
//ParDo for String -> Row (SQL)
public static class RowParDoForDept extends DoFn<String, Row> {
@ProcessElement
public void processElement(ProcessContext c) {
if (!c.element().equalsIgnoreCase(DEPTHEADER)) {
String[] vals = c.element().split(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)");
Row appRow = Row
.withSchema(DEPTSCHEMA)
.addValues(vals[0], vals[1], vals[2])
.build();
c.output(appRow);
}
}
}
}
I am running a Beam pipeline on Google Cloud Dataflow. However, the pipeline cannot be updated with exactly the same code. The code is as follows:
import com.google.common.collect.Iterables;
import com.google.common.primitives.Ints;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.GenerateSequence;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;
import org.joda.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class PipelineTest {
private static final Logger logger = LoggerFactory.getLogger(PipelineTest.class);
public static void main(String[] args) {
int[] data = new int[1000];
for (int i = 0; i < data.length; i++) {
data[i] = i * i;
}
PipelineOptions options = PipelineOptionsFactory.create();
Pipeline pipeline = Pipeline.create(options);
PCollection<Iterable<Integer>> sideInput =
pipeline.apply("Create", Create.<Iterable<Integer>>of(Ints.asList(data)));
PCollectionView<Iterable<Integer>> view =
sideInput.apply("CreateSideInput", View.asSingleton());
PCollection<String> done =
pipeline
.apply(
"FakeData",
GenerateSequence.from(0).to(50_000).withRate(10, Duration.standardSeconds(1)))
.apply(
"Map1",
ParDo.of(
new DoFn<Long, String>() {
@ProcessElement
public void processElement(ProcessContext ctx) {
Long element = ctx.element();
Iterable<Integer> v = ctx.sideInput(view);
String out = "element " + element + ", value " + Iterables.size(v);
logger.info("MAP1: " + out);
ctx.output(out);
}
})
.withSideInputs(view))
.apply(
"Map2",
ParDo.of(
new DoFn<String, String>() {
@ProcessElement
public void processElement(ProcessContext ctx) {
String element = ctx.element();
Iterable<Integer> v = ctx.sideInput(view);
String out = "element " + element + ", value " + Iterables.size(v);
logger.info("MAP2: " + out);
ctx.output(out);
}
})
.withSideInputs(view));
pipeline.run();
}
}
I tried providing a default value for the view (roughly as sketched below) as well as using two views. However, neither of these works. If the view is used in two independent transforms, the pipeline can be updated.
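For clarity, this is roughly what the "provide a default value" variant looks like; it is a sketch of what was attempted, reusing the sideInput collection and transform name from the code above:
// Hedged sketch of the "default value" attempt mentioned above. The default
// (an empty list here) is only returned when the side-input window contains
// no element; the rest of the pipeline stays exactly as in the code above.
PCollectionView<Iterable<Integer>> view =
    sideInput.apply(
        "CreateSideInput",
        View.<Iterable<Integer>>asSingleton()
            .withDefaultValue(Ints.asList()));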
I've been working on streaming Twitter feed data through Kafka.
I'm following the sample from this link:
http://www.hahaskills.com/tutorials/kafka/Twitter_doc.html
The producer code works fine: I'm able to get the Twitter feed and send it to the Kafka producer.
I'm not able to use the consumer code, since it throws deprecation errors for many of the APIs.
Here is the Consumer code:
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import kafka.consumer.Consumer;
//import kafka.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import kafka.consumer.ConsumerIterator;
//import kafka.consumer.KafkaStream;
//import kafka.consumer.KafkaStream;
import kafka.javaapi.consumer.ConsumerConnector;
//import org.apache.kafka.clients.producer.KafkaProducer;
public class KafkaConsumer {
private final ConsumerConnector consumer;
private final String topic;
public KafkaConsumer(String zookeeper, String groupId, String topic) {
Properties props = new Properties();
props.put("zookeeper.connect", zookeeper);
props.put("group.id", groupId);
props.put("zookeeper.session.timeout.ms", "500");
props.put("zookeeper.sync.time.ms", "250");
props.put("auto.commit.interval.ms", "1000");
consumer = Consumer.createJavaConsumerConnector(new ConsumerConfig(props));
this.topic = topic;
}
public void testConsumer() {
System.out.println("Test Con called");
Map<String, Integer> topicCount = new HashMap<>();
topicCount.put(topic, 1);
Map<String, List<KafkaStream<byte[], byte[]>>> consumerStreams = consumer.createMessageStreams(topicCount);
List<KafkaStream<byte[], byte[]>> streams = consumerStreams.get(topic);
System.out.println("For");
for (final KafkaStream stream : streams) {
ConsumerIterator<byte[], byte[]> it = stream.iterator();
System.out.println("Size"+it.length());
while (it.hasNext()) {
System.out.println("Stream");
System.out.println("Message from Single Topic: " + new String(it.next().message()));
}
}
if (consumer != null) {
consumer.shutdown();
}
}
public static void main(String[] args) {
System.out.println("Started");
String topic="twittertopic";
KafkaConsumer simpleTWConsumer = new KafkaConsumer("localhost:XXXX", "testgroup", topic);
simpleTWConsumer.testConsumer();
System.out.println("End");
}
}
It throws errors: ConsumerConnector, ConsumerIterator, and KafkaStream are deprecated, and ConsumerConfig is not visible.
Is there a fixed version of this sample code (a Kafka consumer for Twitter)?
The tutorial you are following is very old, and it uses the legacy Scala Kafka clients that have been deprecated; see http://kafka.apache.org/documentation/#legacyapis
The deprecated classes are:
kafka.consumer.* and kafka.javaapi.consumer: use the newer Java consumer under org.apache.kafka.clients.consumer.* instead.
kafka.producer.* and kafka.javaapi.producer: use the newer Java producer under org.apache.kafka.clients.producer.* instead.
Apart from using deprecated classes, your code was mostly correct; I only had to fix a few imports. Below is a fixed version. Using it, I was able to consume messages I was producing to a topic called twittertopic.
package example;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import kafka.consumer.Consumer;
import kafka.consumer.ConsumerConfig;
import kafka.consumer.ConsumerIterator;
import kafka.consumer.KafkaStream;
import kafka.javaapi.consumer.ConsumerConnector;
public class MyConsumer {
private final ConsumerConnector consumer;
private final String topic;
public MyConsumer(String zookeeper, String groupId, String topic) {
Properties props = new Properties();
props.put("zookeeper.connect", zookeeper);
props.put("group.id", groupId);
props.put("zookeeper.session.timeout.ms", "500");
props.put("zookeeper.sync.time.ms", "250");
props.put("auto.commit.interval.ms", "1000");
consumer = Consumer.createJavaConsumerConnector(new ConsumerConfig(props));
this.topic = topic;
}
public void testConsumer() {
Map<String, Integer> topicCount = new HashMap<>();
topicCount.put(topic, 1);
Map<String, List<KafkaStream<byte[], byte[]>>> consumerStreams = consumer.createMessageStreams(topicCount);
List<KafkaStream<byte[], byte[]>> streams = consumerStreams.get(topic);
for (final KafkaStream stream : streams) {
ConsumerIterator<byte[], byte[]> it = stream.iterator();
while (it.hasNext()) {
System.out.println("Message from Single Topic: " + new String(it.next().message()));
}
}
if (consumer != null) {
consumer.shutdown();
}
}
public static void main(String[] args) {
System.out.println("Started");
String topic = "twittertopic";
MyConsumer simpleTWConsumer = new MyConsumer("localhost:2181", "testgroup", topic);
simpleTWConsumer.testConsumer();
System.out.println("End");
}
}
While the code above can be used, the next major Kafka release is likely to remove the classes that are currently deprecated, so you should not write new logic using them.
Instead, you should get started with the Java clients; you can use the examples provided on GitHub: https://github.com/apache/kafka/tree/trunk/examples/src/main/java/kafka/examples
Using the new Java Consumer, your logic would look like:
import java.time.Duration;
import java.util.Arrays;
import java.util.Properties;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
public class MyConsumer {
static final String TOPIC = "twittertopic";
static final String GROUP = "testgroup";
public static void main(String[] args) {
System.out.println("Started");
Properties props = new Properties();
props.put("bootstrap.servers", "localhost:9092");
props.put("group.id", GROUP);
props.put("auto.commit.interval.ms", "1000");
props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);) {
consumer.subscribe(Arrays.asList(TOPIC));
for (int i = 0; i < 1000; i++) {
ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(1L));
System.out.println("Size: " + records.count());
for (ConsumerRecord<String, String> record : records) {
System.out.println("Received a message: " + record.key() + " " + record.value());
}
}
}
System.out.println("End");
}
}
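For completeness, since the messages come from a Twitter producer, the matching producer on the new Java client would look roughly like the sketch below. This is an assumption-laden example (local broker on localhost:9092, plain String payloads standing in for the real tweet data), not the original tutorial's producer:
import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
public class MyProducer {
    static final String TOPIC = "twittertopic";
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        // try-with-resources closes the producer and flushes pending records
        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            for (int i = 0; i < 10; i++) {
                // In the real application the value would be the tweet text/JSON
                producer.send(new ProducerRecord<>(TOPIC, "key-" + i, "test message " + i));
            }
        }
    }
}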
I downloaded the Jester example code in Mahout and tried to run it on the Jester dataset to see the evaluation results. The run completes successfully, but the console only shows:
log4j:WARN No appenders could be found for logger (org.apache.mahout.cf.taste.impl.model.file.FileDataModel).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
I expect to see an evaluation score in the range 0 to 10. Can anyone help me figure out how to get the score?
I am using mahout-core-0.6.jar, and the following is the code:
JesterDataModel.java:
package Jester;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.regex.Pattern;
import com.google.common.collect.Lists;
import org.apache.mahout.cf.taste.example.grouplens.GroupLensDataModel;
import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
import org.apache.mahout.cf.taste.impl.model.GenericPreference;
import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.model.Preference;
import org.apache.mahout.common.iterator.FileLineIterator;
//import org.apache.mahout.cf.taste.impl.common.FileLineIterable;
public final class JesterDataModel extends FileDataModel {
private static final Pattern COMMA_PATTERN = Pattern.compile(",");
private long userBeingRead;
public JesterDataModel() throws IOException {
this(GroupLensDataModel.readResourceToTempFile("\\jester-data-1.csv"));
}
public JesterDataModel(File ratingsFile) throws IOException {
super(ratingsFile);
}
@Override
public void reload() {
userBeingRead = 0;
super.reload();
}
@Override
protected DataModel buildModel() throws IOException {
FastByIDMap<Collection<Preference>> data = new FastByIDMap<Collection<Preference>> ();
FileLineIterator iterator = new FileLineIterator(getDataFile(), false);
FastByIDMap<FastByIDMap<Long>> timestamps = new FastByIDMap<FastByIDMap<Long>>();
processFile(iterator, data, timestamps, false);
return new GenericDataModel(GenericDataModel.toDataMap(data, true));
}
@Override
protected void processLine(String line,
FastByIDMap<?> rawData,
FastByIDMap<FastByIDMap<Long>> timestamps,
boolean fromPriorData) {
FastByIDMap<Collection<Preference>> data = (FastByIDMap<Collection<Preference>>) rawData;
String[] jokePrefs = COMMA_PATTERN.split(line);
int count = Integer.parseInt(jokePrefs[0]);
Collection<Preference> prefs = Lists.newArrayListWithCapacity(count);
for (int itemID = 1; itemID < jokePrefs.length; itemID++) { // yes skip first one, just a count
String jokePref = jokePrefs[itemID];
if (!"99".equals(jokePref)) {
float jokePrefValue = Float.parseFloat(jokePref);
prefs.add(new GenericPreference(userBeingRead, itemID, jokePrefValue));
}
}
data.put(userBeingRead, prefs);
userBeingRead++;
}
}
JesterRecommenderEvaluatorRunner.java
package Jester;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.eval.RecommenderEvaluator;
import org.apache.mahout.cf.taste.impl.eval.AverageAbsoluteDifferenceRecommenderEvaluator;
import org.apache.mahout.cf.taste.model.DataModel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
public final class JesterRecommenderEvaluatorRunner {
private static final Logger log = LoggerFactory.getLogger(JesterRecommenderEvaluatorRunner.class);
private JesterRecommenderEvaluatorRunner() {
// do nothing
}
public static void main(String... args) throws IOException, TasteException {
RecommenderEvaluator evaluator = new AverageAbsoluteDifferenceRecommenderEvaluator();
DataModel model = new JesterDataModel();
double evaluation = evaluator.evaluate(new JesterRecommenderBuilder(),
null,
model,
0.9,
1.0);
log.info(String.valueOf(evaluation));
}
}
Mahout 0.7 is old, and 0.6 is very old. Use at least 0.7, or better, a more recent build from SVN.
I think the problem is exactly what you identified: you don't have any slf4j bindings on your classpath. If you use the ".job" files in Mahout, you will have all dependencies packaged, and then you will actually see output.
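If adding an slf4j binding is not convenient right away, a quick workaround (a sketch against the runner code above) is to print the score to stdout instead of going through the logger:
// In JesterRecommenderEvaluatorRunner.main, print the score directly so it
// shows up even without an slf4j/log4j binding on the classpath.
double evaluation = evaluator.evaluate(new JesterRecommenderBuilder(),
                                       null,
                                       model,
                                       0.9,
                                       1.0);
System.out.println("Average absolute difference: " + evaluation);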