Custom Batch filter in weka - machine-learning

I am trying to build a custom batch filter that extends SimpleBatchFilter. However, I am experiencing the problem of running it second time to get an inverted output. Here is the relevant code and the error I am getting after both runs are completed:
Exception in thread "main" java.lang.IndexOutOfBoundsException: Index: 79, Size: 79
at java.util.ArrayList.rangeCheck(ArrayList.java:653)
at java.util.ArrayList.get(ArrayList.java:429)
at weka.core.Attribute.addStringValue(Attribute.java:994)
at weka.core.StringLocator.copyStringValues(StringLocator.java:155)
at weka.core.StringLocator.copyStringValues(StringLocator.java:91)
at weka.filters.Filter.copyValues(Filter.java:373)
at weka.filters.Filter.push(Filter.java:290)
at weka.filters.SimpleBatchFilter.batchFinished(SimpleBatchFilter.java:266)
at weka.filters.Filter.useFilter(Filter.java:667)
at likeability.Main.main(Main.java:30)
And here is the relevant code:
public class TestFilter extends SimpleBatchFilter {
private Attribute a;
private Attribute b;
private int sampleSizePercent = 15;
private boolean invert = false;
private int seed = 1;
#Override
protected Instances process(Instances inst) throws Exception {
ArrayList<Instances> partitionsA = partition(inst, a);
ArrayList<Instances> partitions = new ArrayList<Instances>();
for(Instances data: partitionsA) {
partitions.addAll(partition(data, b));
}
return getTestSet(partitions);
}
/*
* Partitions the data so that there's only one nominal value of the
* attribute a in one partition.
*/
private ArrayList<Instances> partition(Instances data, Attribute att) throws Exception {
ArrayList<Instances> instances = new ArrayList<Instances>();
for (int i = 0; i < att.numValues(); i++){
RemoveWithValues rm = new RemoveWithValues();
rm.setAttributeIndex(Integer.toString(att.index()+1));
rm.setInvertSelection(true);
rm.setNominalIndices(Integer.toString(i+1));
rm.setInputFormat(data);
instances.add(Filter.useFilter(data, rm));
}
return instances;
}
private Instances getTestSet(List<Instances> insts) throws Exception {
Instances output = new Instances(insts.get(0), 0);
for(Instances inst: insts) {
Resample filter = new Resample();
filter.setRandomSeed(seed);
filter.setNoReplacement(true);
filter.setInvertSelection(invert);
filter.setSampleSizePercent(sampleSizePercent);
filter.setInputFormat(inst);
Instances curr = Filter.useFilter(inst, filter);
System.out.println(inst.size() + " " + curr.size());
output.addAll(curr);
}
return output;
}
#Override
protected Instances determineOutputFormat(Instances arg) throws Exception {
return new Instances(arg, 0);
}
#Override
public String globalInfo() {
return "A filter which partitions the data so that each partition contains"
+ " only instances with one value of attribute a and b, then takes "
+ "a random subset of values from each partition and merges them to"
+ " produce the final set.";
}
public Capabilities getCapabilities() {
Capabilities result = super.getCapabilities();
result.enableAllAttributes();
result.enableAllClasses();
result.enable(Capability.NO_CLASS); // filter doesn't need class to be set
return result;
}
//Main and getters and setters
}
And this is how I call it:
TestFilter filter = new TestFilter();
filter.setA(data.attribute("gender"));
filter.setB(data.attribute("age"));
filter.setInputFormat(data);
Instances test = Filter.useFilter(data, filter);
filter.setInvert(true);
filter.setInputFormat(data);
Instances train = Filter.useFilter(data, filter);
It seems to me quite stupid that I would need to use those two lines between the calls. I suspect I should use isBatchFinished(), does it mean I have to implement it extending BatchFilter rather then SimpleBatchFilter? It would be also helpful to see some successful implementations, since the only ones I could find where the ones in the WEKA manual.

I solved it by extending a Filter instead and changing the process function to batchFinished(). I am posting this answer as I have not found a custom filter example anywhere else.
#Override
public boolean batchFinished() throws Exception {
if(isFirstBatchDone()) {
invert = true;
}
if (getInputFormat() == null)
throw new NullPointerException("No input instance format defined");
Instances inst = getInputFormat();
ArrayList<Instances> partitionsA = partition(inst, a);
ArrayList<Instances> partitions = new ArrayList<Instances>();
for(Instances data: partitionsA) {
partitions.addAll(partition(data, b));
}
private void getTestSet(List<Instances> insts) throws Exception {
for(Instances inst: insts) {
Resample filter = new Resample();
filter.setRandomSeed(seed);
filter.setNoReplacement(true);
filter.setInvertSelection(invert);
filter.setSampleSizePercent(sampleSizePercent);
filter.setInputFormat(inst);
Instances curr = Filter.useFilter(inst, filter);
System.out.println(inst.size() + " " + curr.size());
curr.forEach((i) -> push(i));
}
}
#Override
public boolean setInputFormat(Instances arg) throws Exception {
super.setInputFormat(arg);
Instances outputFormat = new Instances(arg, 0);
setOutputFormat(outputFormat);
return true;
}

Related

Recommendation Engine using Apache Spark MLIB showing up Zero recommendations after processing all operations

I am a newbie when it comes to Implementation of ML Algorithms. I wanted to implement a recommendation Engine and Got to know after little experimenting that collaborative-filtering can be used for the same. I am using Apache Spark for the same. I got help from one of the blogs and tried to implement the same in my local. PFB Code that I tried out. Every time I execute this the Count of Recommendations that is getting printed is always zero. I don see any Evident Error as such. Could someone please help me understand this. Also, please feel free to provide any other reference that can be referred in this regard.
package mllib.example;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.mllib.recommendation.ALS;
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
import org.apache.spark.mllib.recommendation.Rating;
import scala.Tuple2;
public class RecommendationEngine {
public static void main(String[] args) {
// Create Java spark context
SparkConf conf = new SparkConf().setAppName("Recommendation System Example").setMaster("local[2]").set("spark.executor.memory","1g");
JavaSparkContext sc = new JavaSparkContext(conf);
// Read user-item rating file. format - userId,itemId,rating
JavaRDD<String> userItemRatingsFile = sc.textFile(args[0]);
System.out.println("Count is "+userItemRatingsFile.count());
// Read item description file. format - itemId, itemName, Other Fields,..
JavaRDD<String> itemDescritpionFile = sc.textFile(args[1]);
System.out.println("itemDescritpionFile Count is "+itemDescritpionFile.count());
// Map file to Ratings(user,item,rating) tuples
JavaRDD<Rating> ratings = userItemRatingsFile.map(new Function<String, Rating>() {
public Rating call(String s) {
String[] sarray = s.split(",");
return new Rating(Integer.parseInt(sarray[0]), Integer
.parseInt(sarray[1]), Double.parseDouble(sarray[2]));
}
});
System.out.println("Ratings RDD Object"+ratings.first().toString());
// Create tuples(itemId,ItemDescription), will be used later to get names of item from itemId
JavaPairRDD<Integer,String> itemDescritpion = itemDescritpionFile.mapToPair(
new PairFunction<String, Integer, String>() {
#Override
public Tuple2<Integer, String> call(String t) throws Exception {
String[] s = t.split(",");
return new Tuple2<Integer,String>(Integer.parseInt(s[0]), s[1]);
}
});
System.out.println("itemDescritpion RDD Object"+ratings.first().toString());
// Build the recommendation model using ALS
int rank = 10; // 10 latent factors
int numIterations = Integer.parseInt(args[2]); // number of iterations
MatrixFactorizationModel model = ALS.trainImplicit(JavaRDD.toRDD(ratings),
rank, numIterations);
//ALS.trainImplicit(arg0, arg1, arg2)
// Create user-item tuples from ratings
JavaRDD<Tuple2<Object, Object>> userProducts = ratings
.map(new Function<Rating, Tuple2<Object, Object>>() {
public Tuple2<Object, Object> call(Rating r) {
return new Tuple2<Object, Object>(r.user(), r.product());
}
});
// Calculate the itemIds not rated by a particular user, say user with userId = 1
JavaRDD<Integer> notRatedByUser = userProducts.filter(new Function<Tuple2<Object,Object>, Boolean>() {
#Override
public Boolean call(Tuple2<Object, Object> v1) throws Exception {
if (((Integer) v1._1).intValue() != 0) {
return true;
}
return false;
}
}).map(new Function<Tuple2<Object,Object>, Integer>() {
#Override
public Integer call(Tuple2<Object, Object> v1) throws Exception {
return (Integer) v1._2;
}
});
// Create user-item tuples for the items that are not rated by user, with user id 1
JavaRDD<Tuple2<Object, Object>> itemsNotRatedByUser = notRatedByUser
.map(new Function<Integer, Tuple2<Object, Object>>() {
public Tuple2<Object, Object> call(Integer r) {
return new Tuple2<Object, Object>(0, r);
}
});
// Predict the ratings of the items not rated by user for the user
JavaRDD<Rating> recomondations = model.predict(itemsNotRatedByUser.rdd()).toJavaRDD().distinct();
// Sort the recommendations by rating in descending order
recomondations = recomondations.sortBy(new Function<Rating,Double>(){
#Override
public Double call(Rating v1) throws Exception {
return v1.rating();
}
}, false, 1);
System.out.println("recomondations Total is "+recomondations.count());
// Get top 10 recommendations
JavaRDD<Rating> topRecomondations = sc.parallelize(recomondations.take(10));
// Join top 10 recommendations with item descriptions
JavaRDD<Tuple2<Rating, String>> recommendedItems = topRecomondations.mapToPair(
new PairFunction<Rating, Integer, Rating>() {
#Override
public Tuple2<Integer, Rating> call(Rating t) throws Exception {
return new Tuple2<Integer,Rating>(t.product(),t);
}
}).join(itemDescritpion).values();
System.out.println("recommendedItems count is "+recommendedItems.count());
//Print the top recommendations for user 1.
recommendedItems.foreach(new VoidFunction<Tuple2<Rating,String>>() {
#Override
public void call(Tuple2<Rating, String> t) throws Exception {
System.out.println(t._1.product() + "\t" + t._1.rating() + "\t" + t._2);
}
});
Also, I see that this job is Running for real Long time. Every time it creates a model.Is there a way I can Create the Model once, persist it and Load the same for consecutive Predictions. Can we by any chance improve the Speed of execution of this job
Thanks in Advance

Dataflow DynamicDestinations unable to serialize org.apache.beam.sdk.io.gcp.bigquery.PrepareWrite

I am trying to use DynamicDestinations to write to a partitioned table in BigQuery where the partition name is mytable$yyyyMMdd. If I bypass dynamicdestinations and supply a hardcoded table name in .to(), it works; however, with dynamicdestinations I get the following exception:
java.lang.IllegalArgumentException: unable to serialize org.apache.beam.sdk.io.gcp.bigquery.PrepareWrite$1#6fff253c
at org.apache.beam.sdk.util.SerializableUtils.serializeToByteArray(SerializableUtils.java:53)
at org.apache.beam.sdk.util.SerializableUtils.clone(SerializableUtils.java:90)
at org.apache.beam.sdk.transforms.ParDo$SingleOutput.<init>(ParDo.java:591)
at org.apache.beam.sdk.transforms.ParDo.of(ParDo.java:435)
at org.apache.beam.sdk.io.gcp.bigquery.PrepareWrite.expand(PrepareWrite.java:51)
at org.apache.beam.sdk.io.gcp.bigquery.PrepareWrite.expand(PrepareWrite.java:36)
at org.apache.beam.sdk.Pipeline.applyInternal(Pipeline.java:514)
at org.apache.beam.sdk.Pipeline.applyTransform(Pipeline.java:473)
at org.apache.beam.sdk.values.PCollection.apply(PCollection.java:297)
at org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO$Write.expandTyped(BigQueryIO.java:987)
at org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO$Write.expand(BigQueryIO.java:972)
at org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO$Write.expand(BigQueryIO.java:659)
at org.apache.beam.sdk.Pipeline.applyInternal(Pipeline.java:514)
at org.apache.beam.sdk.Pipeline.applyTransform(Pipeline.java:454)
at org.apache.beam.sdk.values.PCollection.apply(PCollection.java:284)
at com.homedepot.payments.monitoring.eventprocessor.MetricsAggregator.main(MetricsAggregator.java:82)
Caused by: java.io.NotSerializableException: com.google.api.services.bigquery.model.TableReference
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1184)
And here is the code:
PCollection<Event> rawEvents = pipeline
.apply("ReadFromPubSub",
PubsubIO.readProtos(EventOuterClass.Event.class)
.fromSubscription(OPTIONS.getSubscription())
)
.apply("Parse", ParDo.of(new ParseFn()))
.apply("ExtractAttributes", ParDo.of(new ExtractAttributesFn()));
EventTable table = new EventTable(OPTIONS.getProjectId(), OPTIONS.getMetricsDatasetId(), OPTIONS.getRawEventsTable());
rawEvents.apply(BigQueryIO.<Event>write()
.to(new DynamicDestinations<Event, String>() {
private static final long serialVersionUID = 1L;
#Override
public TableSchema getSchema(String destination) {
return table.schema();
}
#Override
public TableDestination getTable(String destination) {
return new TableDestination(table.reference(), null);
}
#Override
public String getDestination(ValueInSingleWindow<Event> element) {
String dayString = DateTimeFormat.forPattern("yyyyMMdd").withZone(DateTimeZone.UTC).toString();
return table.reference().getTableId() + "$" + dayString;
}
})
.withFormatFunction(new SerializableFunction<Event, TableRow>() {
public TableRow apply(Event event) {
TableRow row = new TableRow();
Event evnt = (Event) event;
row.set(EventTable.Field.VERSION.getName(), evnt.getVersion());
row.set(EventTable.Field.TIMESTAMP.getName(), evnt.getTimestamp() / 1000);
row.set(EventTable.Field.EVENT_TYPE_ID.getName(), evnt.getEventTypeId());
row.set(EventTable.Field.EVENT_ID.getName(), evnt.getId());
row.set(EventTable.Field.LOCATION.getName(), evnt.getLocation());
row.set(EventTable.Field.SERVICE.getName(), evnt.getService());
row.set(EventTable.Field.HOST.getName(), evnt.getHost());
row.set(EventTable.Field.BODY.getName(), evnt.getBody());
return row;
}
})
.withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
);
Any pointers in the correct direction would be greatly appreciated.
Thanks!
From inspecting the exception message and the code above, it seems that the EventTable field used within your anonymous DynamicDestinations class contains a TableReference field which is not serializable.
One workaround would be to convert the anonymous DynamicDestinations to a static inner class and define a constructor which stores only the serializable pieces of the EventTable needed to implement the interface.
For example:
private static class EventDestinations extends DynamicDestinations<Event, String> {
private final TableSchema schema;
private final TableDestination destination;
private final String tableId;
private EventDestinations(EventTable table) {
this.schema = table.schema();
this.destination = new TableDestination(table.reference(), null);
this.tableId = table.reference().getTableId();
}
// ..
}
Looks like you're trying to fill a specific partition based on the event. Why not use:
SerializableFunction<ValueInSingleWindow<Event>, TableDestination>?

handleURI for http://AAA.BBB.CCC.DDD:8080/myapp/ uri: '' returns ambigious result (Vaadin 6)

In my Vaadin 6 application I sometimes get the following error:
SEVERE: Terminal error:
java.lang.RuntimeException: handleURI for http://AAA.BBB.CCC.DDD:8080/myapp/ uri: '' returns ambigious result.
at com.vaadin.ui.Window.handleURI(Window.java:432)
at com.vaadin.terminal.gwt.server.AbstractCommunicationManager.handleURI(AbstractCommunicationManager.java:2291)
at com.vaadin.terminal.gwt.server.CommunicationManager.handleURI(CommunicationManager.java:370)
at com.vaadin.terminal.gwt.server.AbstractApplicationServlet.handleURI(AbstractApplicationServlet.java:1099)
at com.vaadin.terminal.gwt.server.AbstractApplicationServlet.service(AbstractApplicationServlet.java:535)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:728)
Accrording to Vaadin source it occurs in the following method:
public DownloadStream handleURI(URL context, String relativeUri) {
DownloadStream result = null;
if (uriHandlerList != null) {
Object[] handlers;
synchronized (uriHandlerList) {
handlers = uriHandlerList.toArray();
}
for (int i = 0; i < handlers.length; i++) {
final DownloadStream ds = ((URIHandler) handlers[i]).handleURI(
context, relativeUri);
if (ds != null) {
if (result != null) {
throw new RuntimeException("handleURI for " + context
+ " uri: '" + relativeUri
+ "' returns ambigious result.");
}
result = ds;
}
}
}
return result;
}
I actually create a DownloadStream in a column generator (in order to display images in a table):
public class ImageColumnGenerator implements Table.ColumnGenerator {
private static final Logger LOGGER = LoggerFactory.getLogger(ImageColumnGenerator.class);
public final static String IMAGE_FIELD = "image";
public Object generateCell(final Table aTable, final Object aItemId, final Object aColumnId) {
if (!IMAGE_FIELD.equals(aColumnId)) {
return null;
}
final BeanItem<UserProductImageBean> beanItem = (BeanItem<UserProductImageBean>)
aTable.getItem(aItemId);
final StreamResource streamResource = new StreamResource(new StreamResource.StreamSource() {
public InputStream getStream() {
return new ByteArrayInputStream(beanItem.getBean().getImageData());
}
},
beanItem.getBean().getFileName(),
MyApplication.getInstance());
LOGGER.debug("imageResource: " + streamResource);
final Embedded embedded = new Embedded("", streamResource);
return embedded;
}
}
beanItem.getBean().getImageData() is a byte array (byte[]) with image data, which I get from a web service.
MyApplication.getInstance() is defined as follows:
public class MyApplication extends Application implements ApplicationContext.TransactionListener
{
private static ThreadLocal<MyApplication> currentApplication =
new ThreadLocal<MyApplication> ();
public static MyApplication getInstance()
{
return currentApplication.get ();
}
}
What can I do in order to fix the aforementioned (severe) error?
As soon as nobody answer. I'm not at all expert in what hell it is above, but - try to find out on what kind of urls this error arise on, and do with them something before feed them to DownloadStream

Accessing a Service from within an XNA Content Pipeline Extension

I need to allow my content pipeline extension to use a pattern similar to a factory. I start with a dictionary type:
public delegate T Mapper<T>(MapFactory<T> mf, XElement d);
public class MapFactory<T>
{
Dictionary<string, Mapper<T>> map = new Dictionary<string, Mapper<T>>();
public void Add(string s, Mapper<T> m)
{
map.Add(s, m);
}
public T Get(XElement xe)
{
if (xe == null) throw new ArgumentNullException(
"Invalid document");
var key = xe.Name.ToString();
if (!map.ContainsKey(key)) throw new ArgumentException(
key + " is not a valid key.");
return map[key](this, xe);
}
public IEnumerable<T> GetAll(XElement xe)
{
if (xe == null) throw new ArgumentNullException(
"Invalid document");
foreach (var e in xe.Elements())
{
var val = e.Name.ToString();
if (map.ContainsKey(val))
yield return map[val](this, e);
}
}
}
Here is one type of object I want to store:
public partial class TestContent
{
// Test type
public string title;
// Once test if true
public bool once;
// Parameters
public Dictionary<string, object> args;
public TestContent()
{
title = string.Empty;
args = new Dictionary<string, object>();
}
public TestContent(XElement xe)
{
title = xe.Name.ToString();
args = new Dictionary<string, object>();
xe.ParseAttribute("once", once);
}
}
XElement.ParseAttribute is an extension method that works as one might expect. It returns a boolean that is true if successful.
The issue is that I have many different types of tests, each of which populates the object in a way unique to the specific test. The element name is the key to MapFactory's dictionary. This type of test, while atypical, illustrates my problem.
public class LogicTest : TestBase
{
string opkey;
List<TestBase> items;
public override bool Test(BehaviorArgs args)
{
if (items == null) return false;
if (items.Count == 0) return false;
bool result = items[0].Test(args);
for (int i = 1; i < items.Count; i++)
{
bool other = items[i].Test(args);
switch (opkey)
{
case "And":
result &= other;
if (!result) return false;
break;
case "Or":
result |= other;
if (result) return true;
break;
case "Xor":
result ^= other;
break;
case "Nand":
result = !(result & other);
break;
case "Nor":
result = !(result | other);
break;
default:
result = false;
break;
}
}
return result;
}
public static TestContent Build(MapFactory<TestContent> mf, XElement xe)
{
var result = new TestContent(xe);
string key = "Or";
xe.GetAttribute("op", key);
result.args.Add("key", key);
var names = mf.GetAll(xe).ToList();
if (names.Count() < 2) throw new ArgumentException(
"LogicTest requires at least two entries.");
result.args.Add("items", names);
return result;
}
}
My actual code is more involved as the factory has two dictionaries, one that turns an XElement into a content type to write and another used by the reader to create the actual game objects.
I need to build these factories in code because they map strings to delegates. I have a service that contains several of these factories. The mission is to make these factory classes available to a content processor. Neither the processor itself nor the context it uses as a parameter have any known hooks to attach an IServiceProvider or equivalent.
Any ideas?
I needed to create a data structure essentially on demand without access to the underlying classes as they came from a third party, in this case XNA Game Studio. There is only one way to do this I know of... statically.
public class TestMap : Dictionary<string, string>
{
private static readonly TestMap map = new TestMap();
private TestMap()
{
Add("Logic", "LogicProcessor");
Add("Sequence", "SequenceProcessor");
Add("Key", "KeyProcessor");
Add("KeyVector", "KeyVectorProcessor");
Add("Mouse", "MouseProcessor");
Add("Pad", "PadProcessor");
Add("PadVector", "PadVectorProcessor");
}
public static TestMap Map
{
get { return map; }
}
public IEnumerable<TestContent> Collect(XElement xe, ContentProcessorContext cpc)
{
foreach(var e in xe.Elements().Where(e => ContainsKey(e.Name.ToString())))
{
yield return cpc.Convert<XElement, TestContent>(
e, this[e.Name.ToString()]);
}
}
}
I took this a step further and created content processors for each type of TestBase:
/// <summary>
/// Turns an imported XElement into a TestContent used for a LogicTest
/// </summary>
[ContentProcessor(DisplayName = "LogicProcessor")]
public class LogicProcessor : ContentProcessor<XElement, TestContent>
{
public override TestContent Process(XElement input, ContentProcessorContext context)
{
var result = new TestContent(input);
string key = "Or";
input.GetAttribute("op", key);
result.args.Add("key", key);
var items = TestMap.Map.Collect(input, context);
if (items.Count() < 2) throw new ArgumentNullException(
"LogicProcessor requires at least two items.");
result.args.Add("items", items);
return result;
}
}
Any attempt to reference or access the class such as calling TestMap.Collect will generate the underlying static class if needed. I basically moved the code from LogicTest.Build to the processor. I also carry out any needed validation in the processor.
When I get to reading these classes I will have the ContentService to help.

Method to create and store method chain at runtime

The problem I have is that I need to do about 40+ conversions to convert loosely typed info into strongly typed info stored in db, xml file, etc.
I'm plan to tag each type with a tuple i.e. a transformational form like this:
host.name.string:host.dotquad.string
which will offer a conversion from the input to an output form. For example, the name stored in the host field of type string, the input is converted into a dotquad notation of type string and stored back into host field. More complex conversions may need several steps, with each step being accomplished by a method call, hence method chaining.
Examining further the example above, the tuple 'host.name.string' with the field host of name www.domain.com. A DNS lookup is done to covert domain name to IP address. Another method is applied to change the type returned by the DNS lookup into the internal type of dotquad of type string. For this transformation, there is 4 seperate methods called to convert from one tuple into another. Some other conversions may require more steps.
Ideally I would like an small example of how method chains are constructed at runtime. Development time method chaining is relatively trivial, but would require pages and pages of code to cover all possibilites, with 40+ conversions.
One way I thought of doing is, is parsing the tuples at startup, and writing the chains out to an assembly, compiling it, then using reflection to load/access. Its would be really ugly and negate the performance increases i'm hoping to gain.
I'm using Mono, so no C# 4.0
Any help would be appreciated.
Bob.
Here is a quick and dirty solution using LINQ Expressions. You have indicated that you want C# 2.0, this is 3.5, but it does run on Mono 2.6. The method chaining is a bit hacky as i didn't exactly know how your version works, so you might need to tweak the expression code to suit.
The real magic really happens in the Chainer class, which takes a collection of strings, which represent the MethodChain subclass. Take a collection like this:
{
"string",
"string",
"int"
}
This will generate a chain like this:
new StringChain(new StringChain(new IntChain()));
Chainer.CreateChain will return a lambda that calls MethodChain.Execute(). Because Chainer.CreateChain uses a bit of reflection, it's slow, but it only needs to run once for each expression chain. The execution of the lambda is nearly as fast as calling actual code.
Hope you can fit this into your architecture.
public abstract class MethodChain {
private MethodChain[] m_methods;
private object m_Result;
public MethodChain(params MethodChain[] methods) {
m_methods = methods;
}
public MethodChain Execute(object expression) {
if(m_methods != null) {
foreach(var method in m_methods) {
expression = method.Execute(expression).GetResult<object>();
}
}
m_Result = ExecuteInternal(expression);
return this;
}
protected abstract object ExecuteInternal(object expression);
public T GetResult<T>() {
return (T)m_Result;
}
}
public class IntChain : MethodChain {
public IntChain(params MethodChain[] methods)
: base(methods) {
}
protected override object ExecuteInternal(object expression) {
return int.Parse(expression as string);
}
}
public class StringChain : MethodChain {
public StringChain(params MethodChain[] methods):base(methods) {
}
protected override object ExecuteInternal(object expression) {
return (expression as string).Trim();
}
}
public class Chainer {
/// <summary>
/// methods are executed from back to front, so methods[1] will call method[0].Execute before executing itself
/// </summary>
/// <param name="methods"></param>
/// <returns></returns>
public Func<object, MethodChain> CreateChain(IEnumerable<string> methods) {
Expression expr = null;
foreach(var methodName in methods.Reverse()) {
ConstructorInfo cInfo= null;
switch(methodName.ToLower()) {
case "string":
cInfo = typeof(StringChain).GetConstructor(new []{typeof(MethodChain[])});
break;
case "int":
cInfo = typeof(IntChain).GetConstructor(new[] { typeof(MethodChain[]) });
break;
}
if(cInfo == null)
continue;
if(expr != null)
expr = Expression.New(cInfo, Expression.NewArrayInit( typeof(MethodChain), Expression.Convert(expr, typeof(MethodChain))));
else
expr = Expression.New(cInfo, Expression.Constant(null, typeof(MethodChain[])));
}
var objParam = Expression.Parameter(typeof(object));
var methodExpr = Expression.Call(expr, typeof(MethodChain).GetMethod("Execute"), objParam);
Func<object, MethodChain> lambda = Expression.Lambda<Func<object, MethodChain>>(methodExpr, objParam).Compile();
return lambda;
}
[TestMethod]
public void ExprTest() {
Chainer chainer = new Chainer();
var lambda = chainer.CreateChain(new[] { "int", "string" });
var result = lambda(" 34 ").GetResult<int>();
Assert.AreEqual(34, result);
}
}
The command pattern would fit here. What you could do is queue up commands as you need different operations performed on the different data types. Those messages could then all be processed and call the appropriate methods when you're ready later on.
This pattern can be implemented in .NET 2.0.
Do you really need to do this at execution time? Can't you create the combination of operations using code generation?
Let me elaborate:
Assuming you have a class called Conversions which contains all the 40+ convertions you mentioned like this:
//just pseudo code..
class conversions{
string host_name(string input){}
string host_dotquad(string input){}
int type_convert(string input){}
float type_convert(string input){}
float increment_float(float input){}
}
Write a simple console app or something similar which uses reflection to generate code for methods like this:
execute_host_name(string input, Queue<string> conversionQueue)
{
string ouput = conversions.host_name(input);
if(conversionQueue.Count == 0)
return output;
switch(conversionQueue.dequeue())
{
// generate case statements only for methods that take in
// a string as parameter because the host_name method returns a string.
case "host.dotquad": return execute_host_dotquad(output,conversionQueue);
case "type.convert": return execute_type_convert(output, conversionQueue);
default: // exception...
}
}
Wrap all this in a Nice little execute method like this:
object execute(string input, string [] conversions)
{
Queue<string> conversionQueue = //create the queue..
case(conversionQueue.dequeue())
{
case "host.name": return execute_host_name(output,conversionQueue);
case "host.dotquad": return execute_host_dotquad(output,conversionQueue);
case "type.convert": return execute_type_convert(output, conversionQueue);
default: // exception...
}
}
This code generation application need to be executed only when your method signatures changes or when you decide to add new transformations.
Main advantages:
No runtime overhead
Easy to add/delete/change the conversions (code generator will take care of the code changes :) )
What do you think?
I apologize for the long code dump and the fact that it is in Java, rather than C#, but I found your problem quite interesting and I do not have much C# experience. Hopefully you will be able to adapt this solution without difficulty.
One approach to solving your problem is to create a cost for each conversion -- usually this is related to the accuracy of the conversion -- and then perform a search to find the best possible conversion sequence to get from one type to another.
The reason for needing a cost function is to choose among multiple conversion paths. For example, converting from an integer to a string is lossless, but there is no guarantee that every string can be represented by an integer. So, if you had two conversion chains
string -> integer -> float -> decimal
string -> float -> decimal
You would want to select the second one because it will reduce the chance of a conversion failure.
The Java code below implements such a scheme and performs a best-first search to find an optimal conversion sequence. I hope you find it useful. Running the code produces the following output:
> No conversion possible from string to integer
> The optimal conversion sequence from string to host.dotquad.string is:
> string to host.name.string, cost = -1.609438
> host.name.string to host.dns, cost = -1.609438 *PERFECT*
> host.dns to host.dotquad, cost = -1.832581
> host.dotquad to host.dotquad.string, cost = -1.832581 *PERFECT*
Here is the Java code.
/**
* Use best-first search to find an optimal sequence of operations for
* performing a type conversion with maximum fidelity.
*/
import java.util.*;
public class TypeConversion {
/**
* Define a type-conversion interface. It converts between to
* user-defined types and provides a measure of fidelity (accuracy)
* of the conversion.
*/
interface ITypeConverter<T, F> {
public T convert(F from);
public double fidelity();
// Could use reflection instead of handling this explicitly
public String getSourceType();
public String getTargetType();
}
/**
* Create a set of user-defined types.
*/
class HostName {
public String hostName;
public HostName(String hostName) {
this.hostName = hostName;
}
}
class DnsLookup {
public String ipAddress;
public DnsLookup(HostName hostName) {
this.ipAddress = doDNSLookup(hostName);
}
private String doDNSLookup(HostName hostName) {
return "127.0.0.1";
}
}
class DottedQuad {
public int[] quad = new int[4];
public DottedQuad(DnsLookup lookup) {
String[] split = lookup.ipAddress.split(".");
for ( int i = 0; i < 4; i++ )
quad[i] = Integer.parseInt( split[i] );
}
}
/**
* Define a set of conversion operations between the types. We only
* implement a minimal number for brevity, but this could be expanded.
*
* We start by creating some broad classes to differentiate among
* perfect, good and bad conversions.
*/
abstract class PerfectTypeConversion<T, F> implements ITypeConverter<T, F> {
public abstract T convert(F from);
public double fidelity() { return 1.0; }
}
abstract class GoodTypeConversion<T, F> implements ITypeConverter<T, F> {
public abstract T convert(F from);
public double fidelity() { return 0.8; }
}
abstract class BadTypeConversion<T, F> implements ITypeConverter<T, F> {
public abstract T convert(F from);
public double fidelity() { return 0.2; }
}
/**
* Concrete classes that do the actual conversions.
*/
class StringToHostName extends BadTypeConversion<HostName, String> {
public HostName convert(String from) { return new HostName(from); }
public String getSourceType() { return "string"; }
public String getTargetType() { return "host.name.string"; }
}
class HostNameToDnsLookup extends PerfectTypeConversion<DnsLookup, HostName> {
public DnsLookup convert(HostName from) { return new DnsLookup(from); }
public String getSourceType() { return "host.name.string"; }
public String getTargetType() { return "host.dns"; }
}
class DnsLookupToDottedQuad extends GoodTypeConversion<DottedQuad, DnsLookup> {
public DottedQuad convert(DnsLookup from) { return new DottedQuad(from); }
public String getSourceType() { return "host.dns"; }
public String getTargetType() { return "host.dotquad"; }
}
class DottedQuadToString extends PerfectTypeConversion<String, DottedQuad> {
public String convert(DottedQuad f) {
return f.quad[0] + "." + f.quad[1] + "." + f.quad[2] + "." + f.quad[3];
}
public String getSourceType() { return "host.dotquad"; }
public String getTargetType() { return "host.dotquad.string"; }
}
/**
* To find the best conversion sequence, we need to instantiate
* a list of converters.
*/
ITypeConverter<?,?> converters[] =
{
new StringToHostName(),
new HostNameToDnsLookup(),
new DnsLookupToDottedQuad(),
new DottedQuadToString()
};
Map<String, List<ITypeConverter<?,?>>> fromMap =
new HashMap<String, List<ITypeConverter<?,?>>>();
public void buildConversionMap()
{
for ( ITypeConverter<?,?> converter : converters )
{
String type = converter.getSourceType();
if ( !fromMap.containsKey( type )) {
fromMap.put( type, new ArrayList<ITypeConverter<?,?>>());
}
fromMap.get(type).add(converter);
}
}
public class Tuple implements Comparable<Tuple>
{
public String type;
public double cost;
public Tuple parent;
public Tuple(String type, double cost, Tuple parent) {
this.type = type;
this.cost = cost;
this.parent = parent;
}
public int compareTo(Tuple o) {
return Double.compare( cost, o.cost );
}
}
public Tuple findOptimalConversionSequence(String from, String target)
{
PriorityQueue<Tuple> queue = new PriorityQueue<Tuple>();
// Add a dummy start node to the queue
queue.add( new Tuple( from, 0.0, null ));
// Perform the search
while ( !queue.isEmpty() )
{
// Pop the most promising candidate from the list
Tuple tuple = queue.remove();
// If the type matches the target type, return
if ( tuple.type == target )
return tuple;
// If we have reached a dead-end, backtrack
if ( !fromMap.containsKey( tuple.type ))
continue;
// Otherwise get all of the possible conversions to
// perform next and add their costs
for ( ITypeConverter<?,?> converter : fromMap.get( tuple.type ))
{
String type = converter.getTargetType();
double cost = tuple.cost + Math.log( converter.fidelity() );
queue.add( new Tuple( type, cost, tuple ));
}
}
// No solution
return null;
}
public static void convert(String from, String target)
{
TypeConversion tc = new TypeConversion();
// Build a conversion lookup table
tc.buildConversionMap();
// Find the tail of the optimal conversion chain.
Tuple tail = tc.findOptimalConversionSequence( from, target );
if ( tail == null ) {
System.out.println( "No conversion possible from " + from + " to " + target );
return;
}
// Reconstruct the conversion path (skip dummy node)
List<Tuple> solution = new ArrayList<Tuple>();
for ( ; tail.parent != null ; tail = tail.parent )
solution.add( tail );
Collections.reverse( solution );
StringBuilder sb = new StringBuilder();
Formatter formatter = new Formatter(sb);
sb.append( "The optimal conversion sequence from " + from + " to " + target + " is:\n" );
for ( Tuple tuple : solution ) {
formatter.format( "%20s to %20s, cost = %f", tuple.parent.type, tuple.type, tuple.cost );
if ( tuple.cost == tuple.parent.cost )
sb.append( " *PERFECT*");
sb.append( "\n" );
}
System.out.println( sb.toString() );
}
public static void main(String[] args)
{
// Run two tests
convert( "string", "integer" );
convert( "string", "host.dotquad.string" );
}
}

Resources