How to interpret YOLOv4 output - iOS
I'm trying to interpret the YOLOv4 output using TensorFlow Lite on iOS. I have read a little about the concept of cells and anchors used in the output tensor, and I would like to implement a parser for it. The output of my model consists of 2 tensors:
float32[1,13,13,255]
float32[1,26,26,255]
The first thing I'm wondering about: what is in the second output? From what I have read, the first one should contain all the information. Is the second one just a more detailed result, or something else?
OK, but let's start with the first output. I have written a simple function that should extract all cells, and then all anchor data from those cells. This is how it looks:
let output0 = try localModel.output(at: 0)
guard let output0Floats = [Float](bytes: output0.data) else { return nil }
let numberOfCells = 13                  // 13x13 grid in the first output
let numberOfAnchors = 3                 // anchors per cell (3 * 85 = 255)
let numberOfClasses = 80                // COCO class count
let anchorSize = (numberOfClasses + 5)  // 5 box/confidence values + 80 class scores
func cellAt(x: Int, y: Int) -> [Float] {
let cellSize = anchorSize * numberOfAnchors
let position = (y * numberOfCells + x) * cellSize
return [Float](output0Floats[position..<position + cellSize])
}
func anchors(in cell: [Float]) -> [[Float]] {
(0..<numberOfAnchors).map { [Float](cell[$0 * anchorSize..<$0 * anchorSize + anchorSize]) }
}
for y in 0..<numberOfCells {
for x in 0..<numberOfCells {
let cell = cellAt(x: x, y: y)
print("Cell: \(x),\(y) contains anchors:")
print(anchors(in: cell))
}
}
...
private extension Array {
    // Reinterprets a Data buffer as an array of Element; fails if the byte
    // count is not a multiple of the element stride.
    init?(bytes: Data) {
        guard bytes.count % MemoryLayout<Element>.stride == 0 else { return nil }
        self = bytes.withUnsafeBytes { .init($0.bindMemory(to: Element.self)) }
    }
}
And this is the example result I'm getting for a single cell:
Cell: 7,12 contains anchors:
[[0.693655, -1.1966848, -0.007975042, -0.3327814, -9.583811, 0.3976263, -6.0192285, -6.329881, -5.8644676, -10.2914715, -9.632221, -8.071436, -6.399925, -5.240812, -8.791572, -5.6437893, -9.8603115, -10.492198, -1.9372412, -7.0640965, -2.6936512, -5.112247, -7.131972, -7.1825066, -7.4413238, -10.401382, -7.5643044, -8.608834, -8.239082, -6.799241, -8.035741, -5.7502255, -8.881622, -7.3571744, -9.315964, -7.925786, -7.7857537, -4.8930154, -8.529579, -7.633353, -8.817726, -7.47082, -8.291334, -4.683982, -4.170734, -6.193165, -7.8437185, -9.854808, -9.490823, -8.272433, -8.434413, -7.765057, -7.149798, -11.194118, -6.5116143, -11.112444, -9.999684, -10.689343, -9.942104, -9.520727, -7.440444, -2.531265, -3.7234814, -7.5839844, -4.550161, -3.031804, -4.616852, -8.832014, -6.0279136, -9.482858, -6.750441, -8.450063, -10.222086, -7.6301804, -7.559189, -10.234117, -6.999834, -7.1350074, -5.308107, -6.2450233, -8.8833885, -9.381562, -3.8812854, -8.868278, -9.988986], [0.4351927, -1.3958519, 0.46428338, -0.39240548, -8.170114, 0.7084342, -7.709829, -5.9856057, -6.808081, -10.644019, -9.912677, -7.3293757, -7.548369, -5.533275, -10.072926, -7.316476, -9.945337, -11.118561, -3.2463353, -10.561513, -5.067392, -7.312641, -8.729989, -9.5539055, -7.58917, -9.886164, -6.5404315, -8.553915, -9.023286, -9.580754, -6.7592535, -8.380334, -8.182065, -7.2239976, -9.276712, -7.5086412, -7.2454534, -7.139829, -8.614485, -7.8158274, -9.850543, -9.123642, -6.8081083, -6.936388, -7.997142, -8.845028, -11.322939, -10.713314, -9.629859, -10.820017, -10.480835, -9.071951, -7.9244685, -12.562474, -7.1654305, -13.456438, -10.116255, -12.255847, -11.530319, -10.3949375, -10.665162, -5.6975913, -4.050809, -10.665826, -2.638548, -3.5531735, -7.0320325, -10.047072, -7.678191, -10.290669, -7.438999, -7.531754, -9.817409, -8.428637, -9.502961, -10.955662, -8.6340065, -5.0168147, -8.593948, -9.412493, -10.816083, -10.903126, -8.81499, -10.449745, -9.069517], [0.025469145, -1.7808459, -0.18256505, -0.70104045, -10.450736, -0.67288893, -5.771856, -5.448979, -6.4159226, -8.777289, -7.960696, -5.3555217, -4.798117, -2.8378687, -7.9489646, -8.255625, -8.968552, -8.036578, -2.46956, -8.458385, -4.8979797, -6.5746903, -7.2408285, -8.574903, -6.8356185, -6.4320874, -6.037178, -7.56021, -7.275848, -8.808907, -3.9019513, -8.835796, -6.360187, -6.5461373, -7.1117754, -6.6027184, -7.280362, -7.1671834, -7.292713, -7.1488175, -7.1398635, -8.180893, -5.797153, -6.3417816, -6.9332256, -8.371075, -9.2042055, -8.602686, -8.072069, -8.1690035, -8.0164175, -6.61691, -6.3536263, -9.318304, -4.5542707, -10.049933, -7.8087454, -9.497473, -9.07455, -8.406244, -7.078502, -5.5775504, -2.3586287, -8.409487, -1.6716739, -3.8225765, -6.9020715, -6.6682305, -5.784493, -8.40492, -7.2747784, -6.392035, -6.4958863, -7.629692, -7.4995623, -8.4432125, -6.7565637, -3.113231, -7.3596015, -8.573539, -8.829562, -8.523581, -8.571439, -8.087017, -7.958835]]
So a single anchor looks like this:
[0.693655, -1.1966848, -0.007975042, -0.3327814, -9.583811, 0.3976263, -6.0192285, -6.329881, -5.8644676, -10.2914715, -9.632221, -8.071436, -6.399925, -5.240812, -8.791572, -5.6437893, -9.8603115, -10.492198, -1.9372412, -7.0640965, -2.6936512, -5.112247, -7.131972, -7.1825066, -7.4413238, -10.401382, -7.5643044, -8.608834, -8.239082, -6.799241, -8.035741, -5.7502255, -8.881622, -7.3571744, -9.315964, -7.925786, -7.7857537, -4.8930154, -8.529579, -7.633353, -8.817726, -7.47082, -8.291334, -4.683982, -4.170734, -6.193165, -7.8437185, -9.854808, -9.490823, -8.272433, -8.434413, -7.765057, -7.149798, -11.194118, -6.5116143, -11.112444, -9.999684, -10.689343, -9.942104, -9.520727, -7.440444, -2.531265, -3.7234814, -7.5839844, -4.550161, -3.031804, -4.616852, -8.832014, -6.0279136, -9.482858, -6.750441, -8.450063, -10.222086, -7.6301804, -7.559189, -10.234117, -6.999834, -7.1350074, -5.308107, -6.2450233, -8.8833885, -9.381562, -3.8812854, -8.868278, -9.988986]
Now I can't understand these numbers. From what I read, the first 5 numbers should be:
Confidence, BBoxX, BBoxY, BBoxWidth, BBoxHeight
and the rest of the values are probabilities of each class in a labelMap.
But these numbers look totally incorrect to me. Shouldn't the confidence be between 0 and 1? And shouldn't the probabilities be between 0 and 1 as well? What could I be doing wrong to get these results? The code I'm using before parsing these results is well tested with other kinds of tflite files, so it should be fine. Could this be due to incorrect imageMean and imageStd values used when preparing the input pixel buffer? I'm not sure which values were used to build this model, so I'm using 127.5 for both.
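For reference, here is how I would post-process a single anchor if it turns out the model emits raw logits. This is just a sketch based on my reading of YOLO, assuming the common (tx, ty, tw, th, objectness, class scores...) per-anchor layout; the actual order, and whether activations were already baked into the graph, depend on how this particular model was exported:

import Foundation

func sigmoid(_ x: Float) -> Float { 1 / (1 + exp(-x)) }

// Hypothetical decoding of one 85-float anchor, assuming raw logits in
// (tx, ty, tw, th, objectness, 80 class scores) order.
func decode(anchor: [Float]) -> (objectness: Float, bestClass: Int, score: Float) {
    let objectness = sigmoid(anchor[4])
    let classProbs = anchor[5...].map(sigmoid)
    let best = classProbs.enumerated().max { $0.element < $1.element }!
    return (objectness, best.offset, objectness * best.element)
}

If that layout is right, sigmoid(-9.583811) ≈ 0.00007 for the objectness would simply mean this anchor sees no object, which would at least explain all the large negative numbers.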
Related
Sum All Odd Fibonacci Numbers
So I am supposed to add all the odd Fibonacci numbers and return the sum. Here is my code, but I am not getting it right. Sorry, I am new at this, so help me out.

function sumFibs(num) {
  let secpre = 0
  let pre = 1
  let current = 0;
  let arr = [pre]
  let y = []
  let sum
  for (let i = 1; i <= num; i++) {
    if (num == 1) { return 1 }
    else if ((secpre + pre) < num) {
      current = secpre + pre;
      secpre = pre;
      pre = current;
      arr.push(current)
    }
  }
  arr.map(x => {
    if (x % 2 !== 0) {
      return y.push(x)
    }
  })
  y.reduce((a, b) => {
    sum = 0;
    sum += a + b;
    return sum
  })
  console.log(y)
  console.log(sum)
  return sum
}
sumFibs(75025)

For the value of sumFibs(75025) it should be 135721, but I am getting 60696.
You've had a nice idea, well done! Although, here is a solution which I think is simpler to understand:

function sumFib(num) {
  let odd_sum = 1;
  let nums = [0, 1];
  while (odd_sum < num) {
    let new_num = nums[nums.length - 2] + nums[nums.length - 1];
    nums.push(new_num);
    if (new_num % 2 == 1) odd_sum += new_num;
  }
  return odd_sum;
}
console.log(sumFib(75025));

(As for why your version prints 60696: the check (secpre + pre) < num never lets a Fibonacci number equal to num itself into the array, so 75025 is skipped, and 135721 - 75025 is exactly the 60696 you are seeing.) Feel free to ask if you are confused with something.
Establish ranges in array of Doubles
I am looking to establish the various ranges that may exist in an array of Double values. This is best explained with an example. Say I have the following set of numbers:

[1.5, 1.6, 1.7, 1.8, 2.9, 3.0, 3.1, 4.0]

I would like to be able to determine, with a given granularity (in this case 0.1), that the ranges in this set are: 1.5-1.8, 2.9-3.1, 4.0. Any ideas?

Example data set for a granularity of 0.01:

[407.46, 407.47, 407.48, 407.49, 407.5, 407.51, 407.52, 407.53, 407.54, 407.55, 407.56, 407.57, 407.58, 407.59, 407.6, 407.61, 407.62, 407.63, 407.64, 407.65, 407.66, 407.67, 407.68, 407.69, 407.7, 407.71, 407.72, 407.73, 407.74, 407.75, 407.76, 407.77, 407.78, 407.79, 407.8, 407.81, 407.82, 407.83, 407.84, 407.85, 407.86, 407.87, 407.88, 407.89, 407.9, 407.91, 440.27, 440.28, 440.29, 440.3, 440.31, 440.32, 440.33, 440.34, 440.35, 440.36, 440.37, 440.38, 440.39, 440.4, 440.41, 440.42, 440.43, 440.44, 440.45, 440.46, 440.47, 440.48, 440.49, 440.5, 440.51, 440.52, 440.53, 440.54, 440.55, 440.56, 440.57, 440.58, 440.59, 440.6, 440.61, 440.62, 440.63, 440.64, 440.65, 440.66, 440.67, 440.68, 440.69, 440.7, 440.71, 440.72, 440.73, 440.74]
A longer and simpler implementation:

func getRanges(from values: [Double], with granularity: Double) -> [ClosedRange<Double>] {
    if values.count == 1 {
        return [values[0]...values[0]]
    }
    var ranges = [ClosedRange<Double>]()
    var lowerBound: Double = 0
    var upperBound: Double = 0
    for (i, value) in values.enumerated() {
        if i == 0 {
            lowerBound = value
            upperBound = value
            continue
        }
        let multiplier: Double = (1 / granularity).rounded()
        let multipliedGranularity = granularity * multiplier
        if (value * multiplier - (upperBound * multiplier + multipliedGranularity)).isLess(than: multipliedGranularity) {
            upperBound = value
        } else {
            ranges.append(lowerBound...upperBound)
            lowerBound = value
            upperBound = value
        }
        if i == values.count - 1 {
            ranges.append(lowerBound...upperBound)
        }
    }
    return ranges
}

Using your sample the result is:

ClosedRange(407.46...407.91)
ClosedRange(440.27...440.74)

This code works even with smaller granularities. For example, with granularity 0.0000000001 and the following values:

[407.9999999991, 407.9999999992, 407.9999999994, 407.9999999995]

result:

ClosedRange(407.9999999991...407.9999999992)
ClosedRange(407.9999999994...407.9999999995)
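For the smaller example at the top of the question, calling it like this should produce the three ranges listed there (my own usage sketch, not part of the original answer):

let ranges = getRanges(from: [1.5, 1.6, 1.7, 1.8, 2.9, 3.0, 3.1, 4.0], with: 0.1)
// Expected: [1.5...1.8, 2.9...3.1, 4.0...4.0]
print(ranges)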
MPSCNN Weight Ordering
The Metal Performance Shaders framework provides support for building your own convolutional neural nets. When creating, for instance, an MPSCNNConvolution, it requires a 4D weight tensor as an init parameter that is represented as a 1D float pointer:

init(device: MTLDevice,
     convolutionDescriptor: MPSCNNConvolutionDescriptor,
     kernelWeights: UnsafePointer<Float>,
     biasTerms: UnsafePointer<Float>?,
     flags: MPSCNNConvolutionFlags)

The documentation has this to say about the 4D tensor:

The layout of the filter weight is arranged so that it can be reinterpreted as a 4D tensor (array): weight[outputChannels][kernelHeight][kernelWidth][inputChannels/groups]

Unfortunately that information doesn't really tell me how to arrange a 4D array into a one-dimensional Float pointer. I tried ordering the weights the way the BNNS counterpart requires, but without luck. How do I properly represent the 4D tensor (array) as a 1D Float pointer (array)?

PS: I tried arranging it like a C array and getting the pointer to the flat array, but it didn't work.

UPDATE

@RhythmicFistman: That's how I stored it in a plain array, which I can convert to an UnsafePointer<Float> (but doesn't work):

var output = Array<Float>(repeating: 0, count: weights.count)
for o in 0..<outputChannels {
    for ky in 0..<kernelHeight {
        for kx in 0..<kernelWidth {
            for i in 0..<inputChannels {
                let offset = ((o * kernelHeight + ky) * kernelWidth + kx) * inputChannels + i
                output[offset] = ...
            }
        }
    }
}
Ok, so I figured it out. Here are the 2 Python functions I use to reshape my convolutions and fully connected matrices:

# shape required for MPSCNN [oC kH kW iC]
# tensorflow order is [kH kW iC oC]
def convshape(a):
    a = np.swapaxes(a, 2, 3)
    a = np.swapaxes(a, 1, 2)
    a = np.swapaxes(a, 0, 1)
    return a

# fully connected only requires a x/y swap
def fullshape(a):
    a = np.swapaxes(a, 0, 1)
    return a

(The three swapaxes calls in convshape are equivalent to a single np.transpose(a, (3, 0, 1, 2)).)
This is something I recently had to do for Caffe weights, so I can provide the Swift implementation for how I reordered those. The following function takes in a Float array of Caffe weights for a convolution (in [c_o][c_i][h][w] order) and reorders those to what Metal expects ([c_o][h][w][c_i] order):

public func convertCaffeWeightsToMPS(_ weights: [Float], kernelSize: (width: Int, height: Int), inputChannels: Int, outputChannels: Int, groups: Int) -> [Float] {
    var weightArray: [Float] = Array(repeating: 0.0, count: weights.count)
    var outputIndex = 0
    let groupedInputChannels = inputChannels / groups
    let outputChannelWidth = groupedInputChannels * kernelSize.width * kernelSize.height

    // MPS ordering: [c_o][h][w][c_i]
    for outputChannel in 0..<outputChannels {
        for heightInKernel in 0..<kernelSize.height {
            for widthInKernel in 0..<kernelSize.width {
                for inputChannel in 0..<groupedInputChannels {
                    // Caffe ordering: [c_o][c_i][h][w]
                    let calculatedIndex = outputChannel * outputChannelWidth
                        + inputChannel * kernelSize.width * kernelSize.height
                        + heightInKernel * kernelSize.width
                        + widthInKernel
                    weightArray[outputIndex] = weights[calculatedIndex]
                    outputIndex += 1
                }
            }
        }
    }
    return weightArray
}

Based on my layer visualization, this seems to generate the correct convolution results (matching those produced by Caffe). I believe it also properly takes grouping into account, but I need to verify that. Tensorflow has a different ordering than Caffe, but you should be able to change the math in the inner part of the loop to account for that.
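Since the answer mentions that TensorFlow uses a different ordering, here is a hypothetical variant of the same loop for TensorFlow-ordered weights ([h][w][c_i][c_o]), ignoring grouping (groups == 1). It is a sketch following the answer's pattern, not code from the original post:

// Reorders TF conv weights [h][w][c_i][c_o] into MPS order [c_o][h][w][c_i].
func convertTFWeightsToMPS(_ weights: [Float], kernelSize: (width: Int, height: Int), inputChannels: Int, outputChannels: Int) -> [Float] {
    var weightArray = [Float](repeating: 0.0, count: weights.count)
    var outputIndex = 0
    for outputChannel in 0..<outputChannels {
        for heightInKernel in 0..<kernelSize.height {
            for widthInKernel in 0..<kernelSize.width {
                for inputChannel in 0..<inputChannels {
                    // TF flat index of (h, w, c_i, c_o) in [kH][kW][iC][oC] layout.
                    let calculatedIndex = ((heightInKernel * kernelSize.width + widthInKernel) * inputChannels
                        + inputChannel) * outputChannels + outputChannel
                    weightArray[outputIndex] = weights[calculatedIndex]
                    outputIndex += 1
                }
            }
        }
    }
    return weightArray
}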
The documentation here assumes some expertise in C. In that context, a[x][y][z] is typically collapsed into a 1-D array when x, y and z are constants known at compile time. When this happens, the z component varies most quickly, followed by y, followed by x (outside in). If we have a[2][2][2], it is collapsed to 1D as:

{ a[0][0][0], a[0][0][1], a[0][1][0], a[0][1][1],
  a[1][0][0], a[1][0][1], a[1][1][0], a[1][1][1] }
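As a concrete sketch of that rule (my own illustration, not from the original answer), the flat offset of a[x][y][z] in such a collapsed array is:

// Row-major (C-style) flattening: the last index varies fastest.
func flatIndex(x: Int, y: Int, z: Int, dimY: Int, dimZ: Int) -> Int {
    return (x * dimY + y) * dimZ + z
}

// For a[2][2][2]: a[1][0][1] lands at position 5, matching the listing above.
print(flatIndex(x: 1, y: 0, z: 1, dimY: 2, dimZ: 2)) // 5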
I think TensorFlow already has a convenient method for such a task:

tf.transpose(aWeightTensor, perm=[3, 0, 1, 2])

Full documentation: https://www.tensorflow.org/api_docs/python/tf/transpose
Deeplearning4j LSTM output size
In my case, at input I have List<List<Float>> (a list of word representation vectors), and at output one Double per sequence. So I build the following structure (first index: example number, second: sentence item number, third: word vector element number): http://pastebin.com/KGdjwnki

And in output: http://pastebin.com/fY8zrxEL

But when I pass one of these (http://pastebin.com/wvFFC4Hw) to model.output, I get the vector [0.25, 0.24, 0.25, 0.25], not one value. What can be wrong? Attached code (in Kotlin). classCount is one.

import org.deeplearning4j.nn.multilayer.MultiLayerNetwork
import org.deeplearning4j.nn.conf.NeuralNetConfiguration.Builder
import org.deeplearning4j.nn.api.OptimizationAlgorithm
import org.deeplearning4j.nn.conf.Updater
import org.deeplearning4j.nn.weights.WeightInit
import org.deeplearning4j.nn.conf.layers.GravesLSTM
import org.deeplearning4j.nn.conf.layers.RnnOutputLayer
import org.deeplearning4j.nn.conf.BackpropType
import org.nd4j.linalg.api.ndarray.INDArray
import org.nd4j.linalg.cpu.nativecpu.NDArray
import org.nd4j.linalg.indexing.NDArrayIndex
import org.nd4j.linalg.factory.Nd4j
import org.nd4j.linalg.lossfunctions.LossFunctions
import java.util.*

class ClassifierNetwork(wordVectorSize: Int, classCount: Int) {

    data class Dimension(val x: Array<Int>, val y: Array<Int>)

    val model: MultiLayerNetwork
    val optimization = OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT
    val iterations = 1
    val learningRate = 0.1
    val rmsDecay = 0.95
    val seed = 12345
    val l2 = 0.001
    val weightInit = WeightInit.XAVIER
    val updater = Updater.RMSPROP
    val backtropType = BackpropType.TruncatedBPTT
    val tbpttLength = 50
    val epochs = 50
    var dimensions = Dimension(intArrayOf(0).toTypedArray(), intArrayOf(0).toTypedArray())

    init {
        val baseConfiguration = Builder().optimizationAlgo(optimization)
            .iterations(iterations).learningRate(learningRate).rmsDecay(rmsDecay).seed(seed).regularization(true).l2(l2)
            .weightInit(weightInit).updater(updater)
            .list()
        baseConfiguration.layer(0, GravesLSTM.Builder().nIn(wordVectorSize).nOut(64).activation("tanh").build())
        baseConfiguration.layer(1, GravesLSTM.Builder().nIn(64).nOut(32).activation("tanh").build())
        baseConfiguration.layer(2, GravesLSTM.Builder().nIn(32).nOut(16).activation("tanh").build())
        baseConfiguration.layer(3, RnnOutputLayer.Builder().lossFunction(LossFunctions.LossFunction.MCXENT)
            .activation("softmax").weightInit(WeightInit.XAVIER).nIn(16).nOut(classCount).build())
        val cfg = baseConfiguration.build()!!
        cfg.backpropType = backtropType
        cfg.tbpttBackLength = tbpttLength
        cfg.tbpttFwdLength = tbpttLength
        cfg.isPretrain = false
        cfg.isBackprop = true
        model = MultiLayerNetwork(cfg)
    }

    private fun dataDimensions(x: List<List<Array<Double>>>, y: List<Array<Double>>): Dimension {
        assert(x.size == y.size)
        val exampleCount = x.size
        assert(x.size > 0)
        val sentenceLength = x[0].size
        assert(sentenceLength > 0)
        val wordVectorLength = x[0][0].size
        assert(wordVectorLength > 0)
        val classCount = y[0].size
        assert(classCount > 0)
        return Dimension(
            intArrayOf(exampleCount, wordVectorLength, sentenceLength).toTypedArray(),
            intArrayOf(exampleCount, classCount).toTypedArray()
        )
    }

    data class Fits(val x: INDArray, val y: INDArray)

    private fun fitConversion(x: List<List<Array<Double>>>, y: List<Array<Double>>): Fits {
        val dim = dataDimensions(x, y)
        val xItems = ArrayList<INDArray>()
        for (i in 0..dim.x[0]-1) {
            val itemList = ArrayList<DoubleArray>()
            for (j in 0..dim.x[1]-1) {
                var rowList = ArrayList<Double>()
                for (k in 0..dim.x[2]-1) {
                    rowList.add(x[i][k][j])
                }
                itemList.add(rowList.toTypedArray().toDoubleArray())
            }
            xItems.add(Nd4j.create(itemList.toTypedArray()))
        }
        val xFits = Nd4j.create(xItems, dim.x.toIntArray(), 'c')
        val yItems = ArrayList<DoubleArray>()
        for (i in 0..y.size-1) {
            yItems.add(y[i].toDoubleArray())
        }
        val yFits = Nd4j.create(yItems.toTypedArray())
        return Fits(xFits, yFits)
    }

    private fun error(epoch: Int, x: List<List<Array<Double>>>, y: List<Array<Double>>) {
        var totalDiff = 0.0
        for (i in 0..x.size-1) {
            val source = x[i]
            val result = y[i]
            val realResult = predict(source)
            var diff = 0.0
            for (j in 0..result.size-1) {
                val elementDiff = result[j] - realResult[j]
                diff += Math.pow(elementDiff, 2.0)
            }
            diff = Math.sqrt(diff)
            totalDiff += Math.pow(diff, 2.0)
        }
        totalDiff = Math.sqrt(totalDiff)
        print("Epoch ")
        print(epoch)
        print(", diff ")
        println(totalDiff)
    }

    fun train(x: List<List<Array<Double>>>, y: List<Array<Double>>) {
        dimensions = dataDimensions(x, y)
        val (xFit, yFit) = fitConversion(x, y)
        for (i in 0..epochs-1) {
            model.input = xFit
            model.labels = yFit
            model.fit()
            error(i+1, x, y)
        }
    }

    fun predict(x: List<Array<Double>>): Array<Double> {
        val xList = ArrayList<DoubleArray>()
        for (i in 0..dimensions.x[1]-1) {
            var row = ArrayList<Double>()
            for (j in 0..dimensions.x[2]-1) {
                row.add(x[j][i])
            }
            xList.add(row.toDoubleArray())
        }
        val xItem = Nd4j.create(xList.toTypedArray())
        val y = model.output(xItem)
        val result = ArrayList<Double>()
        return result.toTypedArray()
    }
}

upd. Seems like the following example has a similar task, so I'll check it later and post a solution: https://github.com/deeplearning4j/dl4j-0.4-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/word2vecsentiment/Word2VecSentimentRNN.java
LSTM input/output can only be rank 3; see http://deeplearning4j.org/usingrnns
Next to the recommendation to post this in the very active Gitter channel, and Adam's hint to check out the great documentation (which explains how to set up the input and output to be of rank 3), I want to point out a few other things in your code, as I was struggling with similar problems:

Check out the basic example in examples/recurrent/basic/BasicRNNExample.java. There you see that for an RNN you don't use model.output(xItem), but model.rnnTimeStep(xItem).

With a class count of one you seem to be performing a regression; for that, also check out the regression examples at examples/feedforward/regression/RegressionSum.java and the documentation there. You will see that as an activation function you should use "identity". "softmax" actually normalizes the output to sum up to one (see the glossary), so if you have just one output it will always output 1 (at least it did for my problem).
Not sure if I understand your requirements correctly, but if you want a single output (that is, to predict a number, i.e. regression), you usually go with an identity activation and the MSE loss function (in DL4J terms, activation("identity") with LossFunctions.LossFunction.MSE on the output layer). You've used softmax, which is usually used in classification.
Program doesn't work without an initial value
The program works fine with var dig = 0, but it doesn't work with var dig: Int; I get the error:

Variable "dig" used before being initialized

Could you explain why?

func myFunc(a: Int, b: Int) {
    var c = a / b
    var o = a % b
    var v = 0
    var dig = 0
    if o != 0 { println("\(a)/\(b) = \(c) и \(o)/\(b)") }
    else { println("\(a)/\(b) = \(c)") }
    if a > b { v = b } else { v = a }
    for var i = 1; i <= v; ++i {
        if a % i == 0 && b % i == 0 { dig = i }
    }
    println("\(dig) - greatest common denominator of \(a) and \(b)")
}
myFunc(27, 81)
The only place you set the value of dig is inside of an if statement that is inside of a for loop. The Swift compiler does not know if the body of the for loop will be executed, and it doesn't know if the if statement will ever be true, so it has to assume that there is a path in which dig is not initialized. Consider this simpler example:

func myFunc(a: Int, b: Int) {
    var dig: Int

    if a >= b {
        dig = 3
    }
    if a < b {
        dig = 4
    }

    println("\(dig) - greatest common denominator of \(a) and \(b)")
}

This example also gives the same error, because Swift considers each if separately. It is obvious to us that a is either greater than or equal to b, or it is less than b, but Swift doesn't go that far in evaluating the situation. It just considers that each if may not be true, and dig is only set inside of ifs, so it is possible (as far as Swift is concerned) that dig may not be set.

func myFunc(a: Int, b: Int) {
    var dig: Int

    if a >= b {
        dig = 3
    } else {
        dig = 4
    }

    println("\(dig) - greatest common denominator of \(a) and \(b)")
}

If you change the second condition to an else, Swift is then happy because it can reason that the if must be either true or false, and dig is set on each path, so it will certainly have a value before the println statement.
The compiler does not know mathematics well enough to recognize that the statement

if a % i == 0 && b % i == 0 { dig = i }

is actually executed at least once (for i == 1). Therefore the compiler assumes that dig might be undefined at

println("\(dig) - greatest common denominator of \(a) and \(b)")

Assigning an initial value in var dig = 0 is the correct solution.

Btw., the Euclidean algorithm is a much more efficient method to compute the greatest common divisor; see for example http://rosettacode.org/wiki/Greatest_common_divisor#Swift.
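For completeness, a minimal Swift sketch of the Euclidean algorithm (a standard formulation, not taken from the linked page):

func gcd(_ a: Int, _ b: Int) -> Int {
    // gcd(a, 0) == a; otherwise gcd(a, b) == gcd(b, a mod b).
    return b == 0 ? a : gcd(b, a % b)
}
print(gcd(27, 81)) // 27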