Deeplearning4j LSTM output size

In my case, the input is a List<List<Float>> (a list of word-representation vectors), and the output is a single Double per sequence.
So I build the following structure (first index: example number, second: sentence item number, third: word-vector element number): http://pastebin.com/KGdjwnki
And the output: http://pastebin.com/fY8zrxEL
But when I pass one of these (http://pastebin.com/wvFFC4Hw) to model.output, I get the vector [0.25, 0.24, 0.25, 0.25] instead of a single value.
What could be wrong? The code (in Kotlin) is attached below; classCount is one.
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork
import org.deeplearning4j.nn.conf.NeuralNetConfiguration.Builder
import org.deeplearning4j.nn.api.OptimizationAlgorithm
import org.deeplearning4j.nn.conf.Updater
import org.deeplearning4j.nn.weights.WeightInit
import org.deeplearning4j.nn.conf.layers.GravesLSTM
import org.deeplearning4j.nn.conf.layers.RnnOutputLayer
import org.deeplearning4j.nn.conf.BackpropType
import org.nd4j.linalg.api.ndarray.INDArray
import org.nd4j.linalg.cpu.nativecpu.NDArray
import org.nd4j.linalg.indexing.NDArrayIndex
import org.nd4j.linalg.factory.Nd4j
import org.nd4j.linalg.lossfunctions.LossFunctions
import java.util.*
class ClassifierNetwork(wordVectorSize: Int, classCount: Int) {
data class Dimension(val x: Array<Int>, val y: Array<Int>)
val model: MultiLayerNetwork
val optimization = OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT
val iterations = 1
val learningRate = 0.1
val rmsDecay = 0.95
val seed = 12345
val l2 = 0.001
val weightInit = WeightInit.XAVIER
val updater = Updater.RMSPROP
val backtropType = BackpropType.TruncatedBPTT
val tbpttLength = 50
val epochs = 50
var dimensions = Dimension(intArrayOf(0).toTypedArray(), intArrayOf(0).toTypedArray())
init {
val baseConfiguration = Builder().optimizationAlgo(optimization)
.iterations(iterations).learningRate(learningRate).rmsDecay(rmsDecay).seed(seed).regularization(true).l2(l2)
.weightInit(weightInit).updater(updater)
.list()
baseConfiguration.layer(0, GravesLSTM.Builder().nIn(wordVectorSize).nOut(64).activation("tanh").build())
baseConfiguration.layer(1, GravesLSTM.Builder().nIn(64).nOut(32).activation("tanh").build())
baseConfiguration.layer(2, GravesLSTM.Builder().nIn(32).nOut(16).activation("tanh").build())
baseConfiguration.layer(3, RnnOutputLayer.Builder().lossFunction(LossFunctions.LossFunction.MCXENT)
.activation("softmax").weightInit(WeightInit.XAVIER).nIn(16).nOut(classCount).build())
val cfg = baseConfiguration.build()!!
cfg.backpropType = backtropType
cfg.tbpttBackLength = tbpttLength
cfg.tbpttFwdLength = tbpttLength
cfg.isPretrain = false
cfg.isBackprop = true
model = MultiLayerNetwork(cfg)
}
private fun dataDimensions(x: List<List<Array<Double>>>, y: List<Array<Double>>): Dimension {
assert(x.size == y.size)
val exampleCount = x.size
assert(x.size > 0)
val sentenceLength = x[0].size
assert(sentenceLength > 0)
val wordVectorLength = x[0][0].size
assert(wordVectorLength > 0)
val classCount = y[0].size
assert(classCount > 0)
return Dimension(
intArrayOf(exampleCount, wordVectorLength, sentenceLength).toTypedArray(),
intArrayOf(exampleCount, classCount).toTypedArray()
)
}
data class Fits(val x: INDArray, val y: INDArray)
private fun fitConversion(x: List<List<Array<Double>>>, y: List<Array<Double>>): Fits {
val dim = dataDimensions(x, y)
val xItems = ArrayList<INDArray>()
for (i in 0..dim.x[0]-1) {
val itemList = ArrayList<DoubleArray>();
for (j in 0..dim.x[1]-1) {
var rowList = ArrayList<Double>()
for (k in 0..dim.x[2]-1) {
rowList.add(x[i][k][j])
}
itemList.add(rowList.toTypedArray().toDoubleArray())
}
xItems.add(Nd4j.create(itemList.toTypedArray()))
}
val xFits = Nd4j.create(xItems, dim.x.toIntArray(), 'c')
val yItems = ArrayList<DoubleArray>();
for (i in 0..y.size-1) {
yItems.add(y[i].toDoubleArray())
}
val yFits = Nd4j.create(yItems.toTypedArray())
return Fits(xFits, yFits)
}
private fun error(epoch: Int, x: List<List<Array<Double>>>, y: List<Array<Double>>) {
var totalDiff = 0.0
for (i in 0..x.size-1) {
val source = x[i]
val result = y[i]
val realResult = predict(source)
var diff = 0.0
for (j in 0..result.size-1) {
val elementDiff = result[j] - realResult[j]
diff += Math.pow(elementDiff, 2.0)
}
diff = Math.sqrt(diff)
totalDiff += Math.pow(diff, 2.0)
}
totalDiff = Math.sqrt(totalDiff)
print("Epoch ")
print(epoch)
print(", diff ")
println(totalDiff)
}
fun train(x: List<List<Array<Double>>>, y: List<Array<Double>>) {
dimensions = dataDimensions(x, y)
val(xFit, yFit) = fitConversion(x, y)
for (i in 0..epochs-1) {
model.input = xFit
model.labels = yFit
model.fit()
error(i+1, x, y)
}
}
fun predict(x: List<Array<Double>>): Array<Double> {
val xList = ArrayList<DoubleArray>();
for (i in 0..dimensions.x[1]-1) {
var row = ArrayList<Double>()
for (j in 0..dimensions.x[2]-1) {
row.add(x[j][i])
}
xList.add(row.toDoubleArray())
}
val xItem = Nd4j.create(xList.toTypedArray())
val y = model.output(xItem)
val result = ArrayList<Double>()
return result.toTypedArray()
}
}
Update: the following example seems to address a similar task, so I'll check it later and post a solution: https://github.com/deeplearning4j/dl4j-0.4-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/word2vecsentiment/Word2VecSentimentRNN.java

LSTM input/output can only be rank 3; see http://deeplearning4j.org/usingrnns
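For reference, here is a minimal sketch (my own, not taken from the question) of what rank-3 RNN features and labels look like in ND4J; the dimension order [miniBatchSize, inputSize, timeSeriesLength] follows the linked documentation, and all concrete sizes below are placeholders.
import org.nd4j.linalg.factory.Nd4j

fun main() {
    val miniBatchSize = 2    // number of examples
    val wordVectorSize = 5   // inputs per time step
    val sentenceLength = 7   // time steps per example
    val classCount = 1       // outputs per time step

    // features: [miniBatchSize, inputSize, timeSeriesLength]
    val features = Nd4j.zeros(miniBatchSize, wordVectorSize, sentenceLength)
    // labels for an RnnOutputLayer: [miniBatchSize, nOut, timeSeriesLength]
    val labels = Nd4j.zeros(miniBatchSize, classCount, sentenceLength)

    // set the value of input 3 of example 0 at time step 6
    features.putScalar(intArrayOf(0, 3, 6), 0.42)

    println(features.shape().joinToString())  // 2, 5, 7 -- rank 3
    println(labels.shape().joinToString())    // 2, 1, 7 -- rank 3
}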

In addition to the recommendation to post this in the very active Gitter channel and Adam's hint to check out the great documentation, which explains how to set up rank-3 inputs and outputs, I want to point out a few other things in your code, as I was struggling with similar problems:
Check out the basic example in examples/recurrent/basic/BasicRNNExample.java; there you can see that for an RNN you don't use model.output(xItem) but model.rnnTimeStep(xItem) (a minimal sketch follows below).
With a class count of one you seem to be performing regression. For that, also check out the regression example at examples/feedforward/regression/RegressionSum.java and the documentation here; there you can see that you should use "identity" as the activation function. "softmax" actually normalizes the output to sum to one (see the glossary), so if you have just one output it will always output 1 (at least it did for my problem).
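To illustrate the rnnTimeStep point, here is a rough sketch (my own, with illustrative names, not taken from the examples) of stepping a trained MultiLayerNetwork through one sentence one word at a time:
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork
import org.nd4j.linalg.api.ndarray.INDArray
import org.nd4j.linalg.factory.Nd4j

// wordVectors holds one sentence; each element is one word vector.
fun predictSequence(model: MultiLayerNetwork, wordVectors: List<DoubleArray>): INDArray? {
    model.rnnClearPreviousState()          // reset the stored RNN state before a new sequence
    var lastOutput: INDArray? = null
    for (vector in wordVectors) {
        // one time step: features of shape [miniBatchSize = 1, inputSize, timeSteps = 1]
        val step = Nd4j.zeros(1, vector.size, 1)
        for (i in vector.indices) {
            step.putScalar(intArrayOf(0, i, 0), vector[i])
        }
        lastOutput = model.rnnTimeStep(step)  // output shape [1, nOut, 1]
    }
    return lastOutput                         // prediction after the last word of the sentence
}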

I'm not sure if I understand your requirements correctly, but if you want a single output (that is, to predict a number, i.e. regression), you usually go with the identity activation and the MSE loss function. You've used softmax, which is usually used in classification.
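As a sketch of that suggestion, using the same builder API as the question's code (the layer sizes are placeholders, not a verified fix), the output layer for a single-value regression could look roughly like this:
import org.deeplearning4j.nn.conf.layers.RnnOutputLayer
import org.nd4j.linalg.lossfunctions.LossFunctions

// Regression-style output layer: identity activation plus MSE loss,
// instead of softmax plus MCXENT; nIn(16)/nOut(1) are placeholders.
val regressionOutputLayer = RnnOutputLayer.Builder()
    .lossFunction(LossFunctions.LossFunction.MSE)
    .activation("identity")
    .nIn(16)
    .nOut(1)
    .build()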

Related

NEAT - population number varies every generation

I'm trying to write my own NEAT implementation and I can't get myself to understand how speciation works.
I tried my best to follow the pseudocode I found in this paper (start of page 13), but I think I'm doing it really wrong and I don't understand the right way to do it. Here is my code.
The speciate function that splits the population into species:
function speciate(population, species=[]) {
let newSpecies = [...species];
for(const net of population) {
let placed = false;
for(const s of newSpecies) {
for(const member of s) {
if(sh(net, member)) {
s.push(net);
placed = true;
break;
}
}
if(placed) break;
}
if(!placed) {
newSpecies.push([net]);
}
}
return newSpecies;
}
The repopulation function that generates a new population using the offspring counts:
function repopulate(popCount, species) {
let globalAvg = 0;
species.forEach(s => {
globalAvg += s.reduce((P, net) => P + net.genome.fitness, 0) / s.length;
});
let newPop = [];
for(const s of species) {
let N = popCount;
let sAvg = s.reduce((P, net) => P + net.genome.fitness, 0) / s.length;
let offspringCount = (sAvg / globalAvg) * N;
for(let i = 0; i < offspringCount; i++) {
let parent1 = wheelSelect(s);
let parent2 = wheelSelect(s);
let child = parent1.genome.crossover(parent2.genome);
child.mutateAddNeuron(0.01);
child.mutateAddConnection(0.01);
child.mutateWeight(0.01);
child.mutateEnabledToggle(0.01);
child.layerNeurons();
let net = new NeuralNetwork();
net.wireUsingGenome(child);
newPop.push(net);
}
}
return newPop;
}
The problem I'm facing is that the population size changes every generation: sometimes it goes up and sometimes it goes down. I'm guessing I'm calculating the offspring count wrong, or my speciation isn't working correctly, but I can't figure it out.
Any help is appreciated!

How to fix NPE when transforming RasterFrameLayer into Raster?

I'm trying to convert a predicted RasterFrameLayer in RasterFrames into a GeoTiff file after training a machine learning model.
When using the demo data Elkton-VA from RasterFrames, it works fine.
But when using a cropped Sentinel-2A tif with an NDVI index (normalized from -1000 to 1000), it fails with a NullPointerException in the toRaster step.
It feels like it's due to the nodata value outside the ROI.
The test data is here: geojson and log.
GeoTrellis version: 3.3.0
RasterFrames version: 0.9.0
import geotrellis.proj4.LatLng
import geotrellis.raster._
import geotrellis.raster.io.geotiff.{MultibandGeoTiff, SinglebandGeoTiff}
import geotrellis.raster.io.geotiff.reader.GeoTiffReader
import geotrellis.raster.render.{ColorRamps, Png}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql._
import org.locationtech.rasterframes._
import org.locationtech.rasterframes.ml.{NoDataFilter, TileExploder}
object ClassificiationRaster extends App {
def readTiff(name: String) = GeoTiffReader.readSingleband(getClass.getResource(s"/$name").getPath)
def readMtbTiff(name: String): MultibandGeoTiff = GeoTiffReader.readMultiband(getClass.getResource(s"/$name").getPath)
implicit val spark = SparkSession.builder()
.master("local[*]")
.appName(getClass.getName)
.withKryoSerialization
.getOrCreate()
.withRasterFrames
import spark.implicits._
val filenamePattern = "xiangfuqu_202003_mask_%s.tif"
val bandNumbers = "ndvi".split(",").toSeq
val bandColNames = bandNumbers.map(b ⇒ s"band_$b").toArray
val tileSize = 256
val joinedRF: RasterFrameLayer = bandNumbers
.map { b ⇒ (b, filenamePattern.format(b)) }
.map { case (b, f) ⇒ (b, readTiff(f)) }
.map { case (b, t) ⇒ t.projectedRaster.toLayer(tileSize, tileSize, s"band_$b") }
.reduce(_ spatialJoin _)
.withCRS()
.withExtent()
val tlm = joinedRF.tileLayerMetadata.left.get
// println(tlm.totalDimensions.cols)
// println(tlm.totalDimensions.rows)
joinedRF.printSchema()
val targetCol = "label"
val geojsonPath = "/Users/ethan/work/data/L2a10m4326/zds/test.geojson"
spark.sparkContext.addFile(geojsonPath)
import org.locationtech.rasterframes.datasource.geojson._
val jsonDF: DataFrame = spark.read.geojson.load(geojsonPath)
val label_df: DataFrame = jsonDF
.select($"CLASS_ID", st_reproject($"geometry",LatLng,LatLng).alias("geometry"))
.hint("broadcast")
val df_joined = joinedRF.join(label_df, st_intersects(st_geometry($"extent"), $"geometry"))
.withColumn("dims",rf_dimensions($"band_ndvi"))
val df_labeled: DataFrame = df_joined.withColumn(
"label",
rf_rasterize($"geometry", st_geometry($"extent"), $"CLASS_ID", $"dims.cols", $"dims.rows")
)
df_labeled.printSchema()
val tmp = df_labeled.filter(rf_tile_sum($"label") > 0).cache()
val exploder = new TileExploder()
val noDataFilter = new NoDataFilter().setInputCols(bandColNames :+ targetCol)
val assembler = new VectorAssembler()
.setInputCols(bandColNames)
.setOutputCol("features")
val classifier = new DecisionTreeClassifier()
.setLabelCol(targetCol)
.setFeaturesCol(assembler.getOutputCol)
val pipeline = new Pipeline()
.setStages(Array(exploder, noDataFilter, assembler, classifier))
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol(targetCol)
.setPredictionCol("prediction")
.setMetricName("f1")
val paramGrid = new ParamGridBuilder()
//.addGrid(classifier.maxDepth, Array(1, 2, 3, 4))
.build()
val trainer = new CrossValidator()
.setEstimator(pipeline)
.setEvaluator(evaluator)
.setEstimatorParamMaps(paramGrid)
.setNumFolds(4)
val model = trainer.fit(tmp)
val metrics = model.getEstimatorParamMaps
.map(_.toSeq.map(p ⇒ s"${p.param.name} = ${p.value}"))
.map(_.mkString(", "))
.zip(model.avgMetrics)
metrics.toSeq.toDF("params", "metric").show(false)
val scored = model.bestModel.transform(joinedRF)
scored.groupBy($"prediction" as "class").count().show
scored.show(20)
val retiled: DataFrame = scored.groupBy($"crs", $"extent").agg(
rf_assemble_tile(
$"column_index", $"row_index", $"prediction",
tlm.tileCols, tlm.tileRows, IntConstantNoDataCellType
)
)
val rf: RasterFrameLayer = retiled.toLayer(tlm)
val raster: ProjectedRaster[Tile] = rf.toRaster($"prediction", 5848, 4189)
SinglebandGeoTiff(raster.tile,tlm.extent, tlm.crs).write("/Users/ethan/project/IdeaProjects/learn/spark_ml_learn.git/src/main/resources/easy_b1.tif")
val clusterColors = ColorRamp(
ColorRamps.Viridis.toColorMap((0 until 1).toArray).colors
)
// val pngBytes = retiled.select(rf_render_png($"prediction", clusterColors)).first //It can output the png.
// retiled.tile.renderPng(clusterColors).write("/Users/ethan/project/IdeaProjects/learn/spark_ml_learn.git/src/main/resources/classified2.png")
// Png(pngBytes).write("/Users/ethan/project/IdeaProjects/learn/spark_ml_learn.git/src/main/resources/classified2.png")
spark.stop()
}
I suspect there is a bug in the way the toLayer extension method is working. I will follow up with a bug report to the RasterFrames project; that will take a little more effort, I suspect.
Here is a possible workaround that is a little lower level. In this case it results in 25 non-overlapping GeoTiffs written out.
import geotrellis.store.hadoop.{SerializableConfiguration, _}
import geotrellis.spark.Implicits._
import org.apache.hadoop.fs.Path
// Need this to write local files from spark
val hconf = SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
ContextRDD(
rf.toTileLayerRDD($"prediction")
.left.get
.filter{
case (_: SpatialKey, null) ⇒ false // remove any null Tiles
case _ ⇒ true
},
tlm)
.regrid(1024) //Regrid the Tiles so that they are 1024 x 1024
.toGeoTiffs()
.foreach{ case (sk: SpatialKey, gt: SinglebandGeoTiff) ⇒
val path = new Path(new Path("file:///tmp/output"), s"${sk.col}_${sk.row}.tif")
gt.write(path, hconf.value)
}

How to do BigInt arithmetic in Dart 2.x, specifically division?

Dart documentation says that BigInt division returns a value of type 'double'. This is a problem. To illustrate, here are two implementations of an algorithm involving division. The first is in Kotlin, the second is in Dart. The Dart version runs accurately for small numbers but loses precision for larger numbers.
Kotlin
import java.math.BigInteger
fun height(n: BigInteger, m: BigInteger): BigInteger {
var m1 = m
var s = BigInteger("1")
var b = BigInteger("1")
var ans = BigInteger("0")
var i = 0
while (i < n.toInt()) {
s *= m1--
s /= b++
ans += s
i++
}
return ans
}
Dart
BigInt height(int n, int m) {
var m1 = m; // new BigInt.from(m);
var s = 1.0; // new BigInt.from(1);
var b = 1.0; // new BigInt.from(1);
var ans = new BigInt.from(0);
var i = 0;
while (i < n) {
s *= m1--;
s /= b++;
ans += BigInt.from(s);
i++;
}
return ans;
}
As you can see from the commented out Dart code, I have tried various ways to use BigInt.
Here is an example input with answer. The erroneous Dart answer is given below.
height(13, 550),
equals(BigInt.parse('60113767426276772744951355')));
The erroneous Dart answer is --> 60113767426276764034189615
Can someone show me the best way to do the job in Dart v2.x?
The following code works.
BigInt height(int n, int m) {
var m1 = new BigInt.from(m);
var s = new BigInt.from(1);
var b = new BigInt.from(1);
var ans = new BigInt.from(0);
var i = 0;
while (i < n) {
s *= m1;
m1 -= new BigInt.from(1);
s = s ~/ b;
b += new BigInt.from(1);
ans += s;
i++;
}
return ans;
}
Changes:
x++ and x-- are equivalent to x = x + 1 and x = x - 1 but BigInt.+ and BigInt.- only accept BigInt values... so there's a compiler error.
BigInt./ returns a double and this is not what you want here. You need to use the BigInt.~/ operator instead.

ANN regression, linear function approximation

I have built a regular ANN–BP setup with one unit on the input and output layers and 4 nodes in the hidden layer with sigmoid activation. I'm giving it a simple task: approximate the linear function f(n) = n, with n in the range 0-100.
PROBLEM: Regardless of the number of layers, the number of units in the hidden layer, or whether or not I use bias in the node values, it learns to approximate f(n) = Average(dataset), like so:
The code is written in JavaScript as a proof of concept. I have defined three classes: Net, Layer and Connection, where Layer is an array of input, bias and output values, and Connection is a 2D array of weights and delta weights. Here is the Layer code, where all the important calculations happen:
Ann.Layer = function(nId, oNet, oConfig, bUseBias, aInitBiases) {
var _oThis = this;
var _initialize = function() {
_oThis.id = nId;
_oThis.length = oConfig.nodes;
_oThis.outputs = new Array(oConfig.nodes);
_oThis.inputs = new Array(oConfig.nodes);
_oThis.gradients = new Array(oConfig.nodes);
_oThis.biases = new Array(oConfig.nodes);
_oThis.outputs.fill(0);
_oThis.inputs.fill(0);
_oThis.biases.fill(0);
if (bUseBias) {
for (var n=0; n<oConfig.nodes; n++) {
_oThis.biases[n] = Ann.random(aInitBiases[0], aInitBiases[1]);
}
}
};
/****************** PUBLIC ******************/
this.id;
this.length;
this.inputs;
this.outputs;
this.gradients;
this.biases;
this.next;
this.previous;
this.inConnection;
this.outConnection;
this.isInput = function() { return !this.previous; }
this.isOutput = function() { return !this.next; }
this.calculateGradients = function(aTarget) {
var n, n1, nOutputError,
fDerivative = Ann.Activation.Derivative[oConfig.activation];
if (this.isOutput()) {
for (n=0; n<oConfig.nodes; n++) {
nOutputError = this.outputs[n] - aTarget[n];
this.gradients[n] = nOutputError * fDerivative(this.outputs[n]);
}
} else {
for (n=0; n<oConfig.nodes; n++) {
nOutputError = 0.0;
for (n1=0; n1<this.outConnection.weights[n].length; n1++) {
nOutputError += this.outConnection.weights[n][n1] * this.next.gradients[n1];
}
// console.log(this.id, nOutputError, this.outputs[n], fDerivative(this.outputs[n]));
this.gradients[n] = nOutputError * fDerivative(this.outputs[n]);
}
}
}
this.updateInputWeights = function() {
if (!this.isInput()) {
var nY,
nX,
nOldDeltaWeight,
nNewDeltaWeight;
for (nX=0; nX<this.previous.length; nX++) {
for (nY=0; nY<this.length; nY++) {
nOldDeltaWeight = this.inConnection.deltaWeights[nX][nY];
nNewDeltaWeight =
- oNet.learningRate
* this.previous.outputs[nX]
* this.gradients[nY]
// Add momentum, a fraction of old delta weight
+ oNet.learningMomentum
* nOldDeltaWeight;
if (nNewDeltaWeight == 0 && nOldDeltaWeight != 0) {
console.log('Double overflow');
}
this.inConnection.deltaWeights[nX][nY] = nNewDeltaWeight;
this.inConnection.weights[nX][nY] += nNewDeltaWeight;
}
}
}
}
this.updateInputBiases = function() {
if (bUseBias && !this.isInput()) {
var n,
nNewDeltaBias;
for (n=0; n<this.length; n++) {
nNewDeltaBias =
- oNet.learningRate
* this.gradients[n];
this.biases[n] += nNewDeltaBias;
}
}
}
this.feedForward = function(a) {
var fActivation = Ann.Activation[oConfig.activation];
this.inputs = a;
if (this.isInput()) {
this.outputs = this.inputs;
} else {
for (var n=0; n<a.length; n++) {
this.outputs[n] = fActivation(a[n] + this.biases[n]);
}
}
if (!this.isOutput()) {
this.outConnection.feedForward(this.outputs);
}
}
_initialize();
}
The main feedForward and backProp functions are defined like so:
this.feedForward = function(a) {
this.layers[0].feedForward(a);
this.netError = 0;
}
this.backPropagate = function(aExample, aTarget) {
this.target = aTarget;
if (aExample.length != this.getInputCount()) { throw "Wrong input count in training data"; }
if (aTarget.length != this.getOutputCount()) { throw "Wrong output count in training data"; }
this.feedForward(aExample);
_calculateNetError(aTarget);
var oLayer = null,
nLast = this.layers.length-1,
n;
for (n=nLast; n>0; n--) {
if (n === nLast) {
this.layers[n].calculateGradients(aTarget);
} else {
this.layers[n].calculateGradients();
}
}
for (n=nLast; n>0; n--) {
this.layers[n].updateInputWeights();
this.layers[n].updateInputBiases();
}
}
Connection code is rather simple:
Ann.Connection = function(oNet, oConfig, aInitWeights) {
var _oThis = this;
var _initialize = function() {
var nX, nY, nIn, nOut;
_oThis.from = oNet.layers[oConfig.from];
_oThis.to = oNet.layers[oConfig.to];
nIn = _oThis.from.length;
nOut = _oThis.to.length;
_oThis.weights = new Array(nIn);
_oThis.deltaWeights = new Array(nIn);
for (nX=0; nX<nIn; nX++) {
_oThis.weights[nX] = new Array(nOut);
_oThis.deltaWeights[nX] = new Array(nOut);
_oThis.deltaWeights[nX].fill(0);
for (nY=0; nY<nOut; nY++) {
_oThis.weights[nX][nY] = Ann.random(aInitWeights[0], aInitWeights[1]);
}
}
};
/****************** PUBLIC ******************/
this.weights;
this.deltaWeights;
this.from;
this.to;
this.feedForward = function(a) {
var n, nX, nY, aOut = new Array(this.to.length);
for (nY=0; nY<this.to.length; nY++) {
n = 0;
for (nX=0; nX<this.from.length; nX++) {
n += a[nX] * this.weights[nX][nY];
}
aOut[nY] = n;
}
this.to.feedForward(aOut);
}
_initialize();
}
And my activation functions and derivatives are defined like so:
Ann.Activation = {
linear : function(n) { return n; },
sigma : function(n) { return 1.0 / (1.0 + Math.exp(-n)); },
tanh : function(n) { return Math.tanh(n); }
}
Ann.Activation.Derivative = {
linear : function(n) { return 1.0; },
sigma : function(n) { return n * (1.0 - n); },
tanh : function(n) { return 1.0 - n * n; }
}
And configuration JSON for the network is as follows:
var Config = {
id : "Config1",
learning_rate : 0.01,
learning_momentum : 0,
init_weight : [-1, 1],
init_bias : [-1, 1],
use_bias : false,
layers: [
{nodes : 1},
{nodes : 4, activation : "sigma"},
{nodes : 1, activation : "linear"}
],
connections: [
{from : 0, to : 1},
{from : 1, to : 2}
]
}
Perhaps your experienced eye can spot the problem with my calculations?
See example in JSFiddle
I did not look extensively at the code (it is a lot of code to look at, I would need to take more time for that later, and I am not 100% familiar with JavaScript). Either way, I believe Stephen introduced some changes in how the weights are calculated, and his code seems to give correct results, so I'd recommend looking at that.
Here are a few points, though, that are not necessarily about the correctness of the computations but may still help:
How many examples are you showing the network for training? Are you showing the same input multiple times? You should show every example that you have (inputs) multiple times; showing every example only one time is not sufficient for algorithms based on gradient descent to learn, since they only move a little bit in the correct direction every time. It is possible that all of your code is correct, but you simply have to give it a bit more time to train.
Introducing more hidden layers like Stephen did may help to speed up training, or it may be detrimental. This is typically something you'd want to experiment with for your specific case; it definitely shouldn't be necessary for this simple problem, though. I suspect a more important difference between your configuration and Stephen's is the activation function used in the hidden layer(s). You used a sigmoid, which means that all of the input values get squashed to lie below 1.0 in the hidden layer, and then you need very large weights to transform these numbers back up to the desired output (which can be as large as 100). Stephen used linear activation functions for all layers, which in this specific case is likely to make training much easier, because you are actually trying to learn a linear function. In many other cases it would be desirable to introduce non-linearities though.
It may be beneficial to transform (normalize) both your input and your desired output to lie in [0, 1] instead of [0, 100]. This would make it more likely for your sigmoid layer to produce good results (though I'm still not sure if it would be enough, because you're still introducing a nonlinearity in a case where you intend to learn a linear function, and you may need more hidden nodes to correct for that). In "real-world" cases, where you have multiple different input variables, this is also typically done, because it ensures that all input variables are treated as equally important initially. You could always do a preprocessing step where you normalize the input to [0, 1], give that as input to the network, train it to produce output in [0, 1], and then add a postprocessing step where you transform the output back to the original range.
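To make the normalization suggestion concrete, here is a small, language-agnostic sketch (written in Kotlin rather than JavaScript; the function names are mine, not from the code above): scale the inputs and targets into [0, 1] before training and map the network's output back afterwards.
// Min-max scaling sketch for the f(n) = n task with n in 0..100.
const val RANGE_MIN = 0.0
const val RANGE_MAX = 100.0

fun normalize(x: Double): Double = (x - RANGE_MIN) / (RANGE_MAX - RANGE_MIN)
fun denormalize(y: Double): Double = y * (RANGE_MAX - RANGE_MIN) + RANGE_MIN

fun main() {
    val rawInput = 42.0
    val netInput = normalize(rawInput)   // 0.42 is what the network would see
    val netOutput = 0.42                 // pretend the trained network returned this
    println(netInput)
    println(denormalize(netOutput))      // 42.0, back in the original 0..100 range
}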
First... I really like this code. I know very little about NNs (just getting started), so pardon any gaps in my understanding.
Here is a summary of the changes I made:
//updateInputWeights has this in the middle now:
nNewDeltaWeight =
oNet.learningRate
* this.gradients[nY]
/ this.previous.outputs[nX]
// Add momentum, a fraction of old delta weight
+ oNet.learningMomentum
* nOldDeltaWeight;
//updateInputWeights has this at the bottom now:
this.inConnection.deltaWeights[nX][nY] += nNewDeltaWeight; // += added
this.inConnection.weights[nX][nY] += nNewDeltaWeight;
// I modified the following:
_calculateNetError2 = function(aTarget) {
var oOutputLayer = _oThis.getOutputLayer(),
nOutputCount = oOutputLayer.length,
nError = 0.0,
nDelta = 0.0,
n;
for (n=0; n<nOutputCount; n++) {
nDelta = aTarget[n] - oOutputLayer.outputs[n];
nError += nDelta;
}
_oThis.netError = nError;
};
The config section looks like this now:
var Config = {
id : "Config1",
learning_rate : 0.001,
learning_momentum : 0.001,
init_weight : [-1.0, 1.0],
init_bias : [-1.0, 1.0],
use_bias : false,
/*
layers: [
{nodes : 1, activation : "linear"},
{nodes : 5, activation : "linear"},
{nodes : 1, activation : "linear"}
],
connections: [
{from : 0, to : 1}
,{from : 1, to : 2}
]
*/
layers: [
{nodes : 1, activation : "linear"},
{nodes : 2, activation : "linear"},
{nodes : 2, activation : "linear"},
{nodes : 2, activation : "linear"},
{nodes : 2, activation : "linear"},
{nodes : 1, activation : "linear"}
],
connections: [
{from : 0, to : 1}
,{from : 1, to : 2}
,{from : 2, to : 3}
,{from : 3, to : 4}
,{from : 4, to : 5}
]
}

help in coding decision tree in python

I am not sure if this is the right place to post this, but I have been trying to code up a simple decision tree class for a while and keep getting lost at various points.
Specifically, I'm not sure what kind of data structure would represent a recursive tree that uses (feature, value) pairs as nodes.
from collections import Counter
import numpy as np

class DecisionTree():
def entropy(self, data):
# if there's nothing in this region, entropy is 1
if len(data) <= 1:
return 1
target_col = data.ix[:,-1]
size = float(len(target_col))
classes = Counter(target_col)
# if there's only one class, entropy is 1
if len(classes) == 1:
return 1
else:
probs = [i / size for i in classes.values()]
entropy = np.sum([-probs[i]*np.log(probs[i]) for i in range(len(probs))])
return entropy
def what_to_split_on(self, data):
split_feature = -1
best_entropy = 0.0
base_entropy = self.entropy(data)
for f, feature in enumerate(data.T):
unique_vals = list(set(feature))
for val in unique_vals:
left, right = self.split(data, f, val)
prop_left = float(len(left)) / (len(left) + len(right))
prop_right = 1 - prop_left
e_1 = prop_left * self.entropy(left)
e_2 = prop_right * self.entropy(right)
entropy_change = base_entropy - e_1 - e_2
if entropy_change > best_entropy:
best_entropy = entropy_change
split_feature = f; split_val = val
if split_feature != -1:
return split_feature, split_val
def split(self, data, f, val):
left = np.array([row for row in data if row[f] == val])
right = np.array([row for row in data if row[f] != val])
return left, right
def create_tree(self, data):
if self.entropy(data) == 1:
return
feature, value = self.what_to_split_on(data)
dt = Tree(feature, value)
left_child = np.array([row for row in data if row[feature] == value])
right_child = np.array([row for row in data if row[feature] != value])
feature, value = self.what_to_split_on(left_child)
sub_left = self.create_tree(left_child)
dt.insert_left(sub_left)
feature, value = self.what_to_split_on(right_child)
sub_right = self.create_tree(right_child)
dt.insert_right(sub_right)
return dt
