I am working through the Project Euler puzzles (https://projecteuler.net/) using F#. It turns out that a lot of the puzzles require the generation of prime numbers, which is an expensive computation.
To make my life easier, I want to create a module that will generate prime numbers, but also write them to a text file so I just look them up the next time I need them.
Here is my prime generator module:
module Utilities

open System
open System.IO
open System.Reflection

/// Prime generation backed by an in-memory list of primes that can be
/// persisted to a comma-separated text file next to the executing assembly.
module Primes =

    /// Full path of the cache file ("PrimeCache.txt" in the assembly's directory).
    let private cachePath =
        let directory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location)
        // Path.Combine inserts the platform's separator instead of a hard-coded backslash.
        Path.Combine(directory, "PrimeCache.txt")

    /// Primes parsed from the cache file; empty when the file does not exist yet.
    /// This is a value, so the file is read once, when the module is initialised.
    let private readCache : int list =
        if File.Exists cachePath then
            File.ReadAllText(cachePath).Split([| ',' |], StringSplitOptions.RemoveEmptyEntries)
            |> Seq.filter (fun entry -> not (String.IsNullOrWhiteSpace entry))
            |> Seq.map (fun entry -> Int32.Parse(entry.Trim()))
            |> Seq.toList
        else
            []

    /// Writes the given primes to the cache file as one comma-separated line,
    /// replacing whatever the file contained before.
    let writeCache (cache : seq<int>) =
        let text = String.Join(",", cache |> Seq.map string)
        File.WriteAllText(cachePath, text)

    /// In-memory cache: the primes known so far, in ascending order.
    let mutable cache : int list = readCache

    /// True when no d with 2 <= d and d*d <= n divides n (expects n >= 2).
    /// Cached primes are tried first; if the cache ends before sqrt(n) is
    /// reached, plain trial division continues from the last cached prime.
    let private hasNoDivisor (n : int) =
        let rec byCandidate d =
            // "d > n / d" is "d*d > n" without the risk of integer overflow.
            if d > n / d then true
            elif n % d = 0 then false
            else byCandidate (d + 1)
        let rec byCached primes largestTried =
            match primes with
            | [] -> byCandidate (largestTried + 1)
            | p :: _ when p > n / p -> true
            | p :: _ when n % p = 0 -> false
            | p :: rest -> byCached rest p
        byCached cache 1

    /// Tests n and, when it is a prime larger than every cached prime,
    /// appends it to the cache. Intended for the sequential scan in getPrimes,
    /// which keeps the cache gap-free.
    let private isPrimeInner (n : int) =
        let prime = n >= 2 && hasNoDivisor n
        if prime then
            match List.rev cache with
            | largest :: _ when largest >= n -> ()   // already cached: no duplicate
            | reversed -> cache <- List.rev (n :: reversed)
        prime

    /// Returns true when n is prime. Does not modify the cache, so testing an
    /// arbitrary number can never leave a gap in the cached sequence.
    let isPrime (n : int) : bool =
        n >= 2 && hasNoDivisor n

    /// Lazy sequence of all primes: first the cached ones, then newly computed
    /// primes, each of which is added to the cache as it is produced.
    let getPrimes : int seq =
        seq {
            // Work from one snapshot so the resume point matches what was yielded.
            let known = cache
            yield! known
            let next =
                match List.rev known with
                | largest :: _ -> largest + 1
                | [] -> 2
            for n = next to Int32.MaxValue do
                if isPrimeInner n then yield n
        }

    /// NOTE(review): this is a unit *value*, not a function. The write runs
    /// once, when the module's bindings are initialised, i.e. before any new
    /// primes have been generated; mentioning saveCache later does not write
    /// again. To persist newly found primes call `writeCache cache`.
    let saveCache : unit =
        writeCache cache
Here is the code for the problem I am currently working on that consumes this. (The actual problem is to sum all primes under 2,000,000, but I am testing on 1000 for now):
module Problem10

/// Sum of all primes below `limit` (1000 while testing).
let getAnswer =
    let limit = 1000
    let total =
        Utilities.Primes.getPrimes
        |> Seq.takeWhile (fun p -> p < limit)
        |> Seq.sum
    // NOTE(review): saveCache is a unit value, so this line only mentions an
    // already-computed (); it does not trigger a write at this point.
    Utilities.Primes.saveCache
    total
This will give the right answer, but will save an empty cache file back to the disk. If I step through it in the debugger, the call to saveCache is one of the first things that is hit, before any breakpoints I set inside isPrimeInner. If I switch the calling code to this:
module Problem10

/// Sum of all primes below `limit`, then persist the in-memory prime cache.
let getAnswer =
    let limit = 1000
    let total =
        Seq.sum (Seq.takeWhile (fun p -> p < limit) Utilities.Primes.getPrimes)
    // The cache is read only after the sum has forced the sequence, so it
    // already holds the primes generated above.
    Utilities.Primes.writeCache Utilities.Primes.cache
    total
the cache will be properly saved with new primes.
I've seen this kind of behavior before in C# when misusing iterator blocks. I don't understand how these two versions of the calling code are any different at runtime. What am I doing wrong here?
Related
I am attempting to generate a series of guesses for the second Taxicab number. What I want to do is call the Attempt function on a series of integers in a finite sequence. I have my two questions about implementation in the comments.
A taxicab number, in case you're wondering, is the smallest number that can be written as the sum of two positive cubes in n distinct ways. Ta(2) is 1729.
[<EntryPoint>]
// Entry point. Defines the local helper Attempt, creates a Random, waits for a
// key press, prints argv and returns exit code 0.
// NOTE(review): indentation was lost in this listing, so where Attempt's body
// ends is a guess; the Console.Write calls use `list`/`newlist`, which are
// defined inside Attempt.
let main argv =
// Attempt: takes the first three integers of [start .. start+20], cubes them
// and sums adjacent pairs of cubes.
// NOTE(review): Attempt is never called in the lines shown.
let Attempt (start : int) =
let stop = start+20
let integerList = [start..stop]
let list = List.init 3 (fun x -> integerList.[x])
//Is there a simple way to make initialize the list with random indices of integerList?
let Cube x = x*x*x
let newlist = list |> List.map (fun x -> Cube x)
// Sum of the elements at positions y and y+1 of x.
let partitionList (x : List<int>) (y : int) = List.sum [x.[y];x.[y+1]]
// NOTE(review): newlist has 3 elements, so for i = 2 partitionList reads
// position 3, which is out of range - confirm the intended index range.
let intLIST = [0..2]
let partitionList' = [for i in intLIST do yield partitionList newlist i]
// x and y are built from the same list, so they are always equal.
let x = Set.ofList partitionList'
let y = Set.ofList partitionList'
//I was going to try to use some kind of equality operator to determine whether the two sets were equal, which could tell me whether we had actually found a Taxicab number by the weakened definition.
System.Console.Write(list)
System.Console.Write(newlist)
let rnd = System.Random()
//My primary question is how can I convert a random to an integer to use in start for the function Attempt?
System.Console.ReadKey() |> ignore
printfn("%A") argv
0
Dirty way to initialize list with random indexes of another list:
/// Returns `count` distinct random indexes into `myList`, in random order.
/// Only `List.length myList` distinct indexes exist, so a larger `count`
/// yields every index exactly once; an empty list or a non-positive `count`
/// yields [].
let randomIndexes count myList =
    let rand = System.Random()
    let indexes = Array.init (List.length myList) id
    let wanted = max 0 (min count indexes.Length)
    // Partial Fisher-Yates shuffle: after step i the first i+1 slots hold a
    // uniformly chosen, duplicate-free selection.
    for i in 0 .. wanted - 1 do
        let j = rand.Next(i, indexes.Length)
        let picked = indexes.[j]
        indexes.[j] <- indexes.[i]
        indexes.[i] <- picked
    Array.sub indexes 0 wanted
    //|> Array.sort // if you need them sorted
    |> List.ofArray
let result = randomIndexes 5 [3;2;4;5]
printfn "%A" result
I have an array of items, from which I'd like to sample.
I was under the impression that a Set would be a good structure to sample from, in a fold where I'd give back the original or a modified set with the retrieved element missing, depending on whether I want replacement or not.
However, there seems to be no method to retrieve an element directly from a Set.
Is there something I am missing ? or should I use Set of indices, along with a surrogate function that starts at some random position < Set.count and goes up until it finds a member ?
That is, something along this line
module Seq =
    /// Infinite sequence (start+1) % n, (start+2) % n, ... - i.e. the residues
    /// modulo n, beginning just after `start` and wrapping around.
    let modulo (n:int) start =
        Seq.unfold
            (fun previous ->
                let current = previous + 1
                Some (current % n, current))
            start
module Array =
    /// Lazily samples elements of `entries` using a generator seeded with `seed`.
    ///
    /// withReplacement = true  : an endless sequence of independent random picks.
    /// withReplacement = false : every element exactly once, in random order.
    /// An empty `entries` gives an empty sequence in both modes.
    ///
    /// Each enumeration creates its own generator and working copy, so the
    /// same seed reproduces the same sample and `entries` is never modified.
    let Sample (withReplacement:bool) (seed:int) (entries:'T array) : seq<'T> =
        seq {
            let prng = System.Random(seed)
            if entries.Length > 0 then
                if withReplacement then
                    while true do
                        yield entries.[prng.Next(entries.Length)]
                else
                    // Lazy Fisher-Yates: pick among the first `remaining` slots,
                    // then move the last live element into the vacated slot.
                    let pool = Array.copy entries
                    for remaining = pool.Length downto 1 do
                        let j = prng.Next(remaining)
                        yield pool.[j]
                        pool.[j] <- pool.[remaining - 1]
        }
Edit : Tracking positively what I gave, instead of tracking what I still can give would make it simpler and more efficient.
For sampling without replacement, you could just permute the source seq and take however many elements you want to sample
/// Takes `n` elements of `s` in random order without repeats, by lazily
/// shuffling an array copy of `s` and keeping the first `n` results.
/// Uses the generator `rnd` from the enclosing scope.
let sampleWithoutReplacement n s =
    let pool = Array.ofSeq s
    let shuffled =
        seq {
            for remaining in pool.Length .. -1 .. 1 do
                let pick = rnd.Next remaining
                yield pool.[pick]
                // Overwrite the used slot with the last element still in play.
                pool.[pick] <- pool.[remaining - 1]
        }
    Seq.take n shuffled
To sample with replacement, just pick a random element n times from the source seq
/// Produces `n` independent random picks from `s` (repeats allowed).
/// Uses the generator `rnd` from the enclosing scope.
let sampleWithReplacement n s =
    let pool = Array.ofSeq s
    let drawOne (_ : int) = pool.[rnd.Next(pool.Length)]
    Seq.init n drawOne
These may not be the most efficient methods with huge data sets however
Continuing our comments...if you want to randomly sample a sequence without slurping the entire thing into memory you could generate a set of random indices the size of your desired sample (not too different from what you already have):
/// Returns a set of `count` distinct random integers drawn from [0, max).
/// Only `max` distinct values exist, so the result is capped at `max`
/// elements; without the cap the distinct-filtered stream would never deliver
/// enough values and the call would not return. A non-positive `count` or
/// `max` gives the empty set.
let rand count max =
    // `max` is the parameter here (it shadows the built-in function).
    let wanted = min count max
    if wanted <= 0 then
        Set.empty
    else
        System.Random()
        |> Seq.unfold (fun r -> Some(r.Next(max), r))
        |> Seq.distinct
        |> Seq.take wanted
        |> set
/// Streams through `input` and keeps the elements whose positions were drawn
/// by `rand sampleSize inputSize`; the positions are chosen once per call.
let takeSample sampleSize inputSize input =
    let chosen = rand sampleSize inputSize
    input
    |> Seq.mapi (fun position item -> position, item)
    |> Seq.filter (fun (position, _) -> Set.contains position chosen)
    |> Seq.map snd
// Demo: sample 50 of the numbers 0 .. 99999 and print them.
let inputSize = 100000
let input = Seq.init inputSize id
let sample = takeSample 50 inputSize input
printfn "%A" (Seq.toList sample)
Motivation
I have a long-running boolean function that should be applied to the elements of an array, and I want to return immediately once any element satisfies the condition. I would like to do the search in parallel and terminate the other threads as soon as the first thread to finish returns a correct answer.
Question
What is a good way to implement parallel exists function in F#? Since my goal is performance, an efficient solution is preferred to an easy or idiomatic one.
Test case
Suppose that I want to find whether one value exists in an array or not. And the comparison function (equals) is simulated as a computation-expensive one:
open System.Diagnostics
open System.Threading
// Source at http://parallelpatterns.codeplex.com/releases/view/50473
/// Busy-loops for roughly `seconds` seconds to simulate CPU-bound work.
///
/// Returns true when the time limit elapsed, false when `token` was cancelled
/// (before starting or at one of the periodic checks). When `throwOnCancel` is
/// true, cancellation raises via token.ThrowIfCancellationRequested() instead
/// of returning false.
let doCpuIntensiveOperation seconds (token:CancellationToken) throwOnCancel =
    if (token.IsCancellationRequested) then
        if (throwOnCancel) then token.ThrowIfCancellationRequested()
        false
    else
        let ms = int64 (seconds * 1000.0)
        let sw = Stopwatch.StartNew()
        // Never let the interval reach 0: for very small positive `seconds`
        // the scaled value truncates to 0 and `i % 0` would throw.
        let checkInterval = Math.Max(1, Math.Min(20000000, int (20000000.0 * seconds)))
        // Loop to simulate a computationally intensive operation
        let rec loop i =
            // Periodically check to see if the user has requested
            // cancellation or if the time limit has passed
            let check = seconds = 0.0 || i % checkInterval = 0
            if check && token.IsCancellationRequested then
                if throwOnCancel then token.ThrowIfCancellationRequested()
                false
            elif check && sw.ElapsedMilliseconds > ms then
                true
            else
                loop (i + 1)
        // Start the loop with 0 as the first value
        loop 0
/// Equality test made artificially expensive: burns about 0.01 s of CPU via
/// doCpuIntensiveOperation, discards that result, then compares x and y.
let inline equals x y =
    let _ = doCpuIntensiveOperation 0.01 CancellationToken.None false
    x = y
The array consists of 1000 randomly generated elements and the searching value is guaranteed in the 2nd half of the array (so sequential search has to go through at least a half of the array):
// F# Interactive benchmark script: builds the test array and times the
// parallel search against the sequential Array.exists.
let rand = new System.Random()
// m = number of elements; N = exclusive upper bound for the random values.
let m = 1000
let N = 1000000
let xs = [|for _ in 1..m -> rand.Next(N)|]
// Index of the value to search for, drawn from [(m-1)/2, m-1), i.e. the
// second half of the array.
let i = rand.Next((m-1)/2, m-1);;
// Turn on F# Interactive's timing output for the statements that follow.
#time "on";;
let b1 = parallelExists (equals xs.[i]) xs;; // Parallel
let b2 = Array.exists (equals xs.[i]) xs;; // Sequential
I think you can take the following steps:
Spawn a number of workers (threads or async computations), and pass each an equal slice of the array and a cancellation token which will be shared by all workers
When a worker finds the searched item, it calls Cancel on the token (each worker should check the cancel state of the token on each iteration and bail if needed)
I don't have time at the moment to write the code, so there could be some detail I'm omitting.
This answer, and related question, may be helpful.
UPDATE
This is an example of what I'm thinking
open System
open System.Collections.Generic
open System.Threading
open System.Threading.Tasks
/// Splits the index range of `array` into consecutive (start, length) pairs of
/// at most `size` elements each; the final pair holds whatever is left over.
/// An empty array gives no pairs.
/// Raises ArgumentException when `size` is not positive (the sequence could
/// otherwise never end).
let getChunks size array =
    if size <= 0 then invalidArg "size" "Chunk size must be positive."
    let length = Array.length array
    seq {
        for start in 0 .. size .. length - 1 do
            yield (start, min size (length - start))
    }
[<Literal>]
let CHUNK_SIZE = 3

/// Returns true when `f` holds for at least one element of `array`.
/// The array is cut into slices of CHUNK_SIZE elements, each checked by its
/// own task; the first task to find a match cancels a shared token, which
/// stops the other tasks at their next element and releases the waiting caller.
let parallelExists f (array:_[]) =
    use cts = new CancellationTokenSource()
    // A match may be found after the caller has already returned and `cts`
    // has been disposed; the answer is known by then, so that is harmless.
    let signalFound () =
        try cts.Cancel() with :? ObjectDisposedException -> ()
    let rec checkSlice i n =
        if n > 0 && not cts.IsCancellationRequested then
            if f array.[i] then signalFound ()
            else checkSlice (i + 1) (n - 1)
    let workers =
        array
        |> getChunks CHUNK_SIZE
        |> Seq.map (fun (s, c) -> Task.Factory.StartNew(fun () -> checkSlice s c))
        |> Seq.toArray
    try
        Task.WaitAll(workers, cts.Token)
        // All tasks ran to completion. A match still cancels the token, so
        // read it here rather than relying only on WaitAll throwing.
        cts.IsCancellationRequested
    with :? OperationCanceledException -> true
Usage
// Demo: search 0 .. 9 for the value 9 with a predicate that takes 500 ms per element.
let array = Array.init 10 id
let exists =
    let slowIsNine i =
        Thread.Sleep(500)
        i = 9
    parallelExists slowIsNine array
printfn "%b" exists //true
The F# Powerpack has PSeq.exists which maps to PLINQ's ParallelEnumerable.Any which is part of the BCL. There's also ParallelEnumerable.First
I tried to decompile but wouldn't understand right away what was going on. So instead I went and executed the following side-effecting code to confirm that it's using some sort of cancellation once it found the element:
/// The integers 0 .. 1000000 as a lazy sequence that prints "test" each time
/// an element is produced, making it visible how far a consumer reads.
let elems =
    seq {
        for x in 0 .. 1000000 do
            printfn "test"
            yield x
    }
open System
open System.Linq;;
// Ask PLINQ for the first element equal to 1; the number of "test" lines
// printed by elems shows how much of the source was consumed.
ParallelEnumerable.First (ParallelEnumerable.AsParallel(elems), Func<_,_>(fun x -> x = 1))
The following F# code gives the correct answer to Project Euler problem #7:
/// Returns true when `num` is prime, using trial division by every integer
/// from floor(sqrt num) down to 2. Numbers below 2 are not prime.
let isPrime num =
    if num < 2 then
        false
    else
        let upperDivisor = int32(sqrt(float num)) // Is there a better way?
        let rec evaluateModulo a =
            // Reaching 1 (or starting there, for 2 and 3) means no divisor was found.
            if a <= 1 then true
            elif num % a = 0 then false
            else evaluateModulo (a - 1)
        evaluateModulo upperDivisor
// Walks upward from 2, bumping `accumulator` for every prime seen. The loop
// ends once 10001 primes have been counted, at which point `number` is one
// past the last prime found - hence the (number - 1) when printing.
let mutable accumulator = 1 // Would like to avoid mutable values.
let mutable number = 2 // ""
while not (accumulator > 10001) do
    let candidateIsPrime = isPrime number
    number <- number + 1
    if candidateIsPrime then accumulator <- accumulator + 1
printfn "The 10001st prime number is %i." (number - 1) // Feels kludgy.
printfn ""
printfn "Hit any key to continue."
ignore (System.Console.ReadKey())
I'd like to avoid the mutable values accumulator and number. I'd also like to refactor the while loop into a tail recursive function. Any tips?
Any ideas on how to remove the (number - 1) kludge which displays the result?
Any general comments about this code or suggestions on how to improve it?
Loops are nice, but its more idiomatic to abstract away loops as much as possible.
/// Returns true when `num` is prime: it must be at least 2 and have no
/// divisor in 2 .. floor(sqrt num). Seq.forall stops at the first divisor.
let isPrime num =
    match num with
    // Covers 0, 1 and negatives (whose square root would be NaN).
    | n when n < 2 -> false
    | n ->
        let upperDivisor = int32(sqrt(float n))
        seq { 2 .. upperDivisor } |> Seq.forall (fun x -> n % x <> 0)
/// All primes in ascending order, computed lazily.
let primes = Seq.initInfinite id |> Seq.filter isPrime
/// Prime at zero-based position `n` of `primes` (nthPrime 0 is the first prime).
let nthPrime n = Seq.nth n primes
// Positions are zero-based, so the 10001st prime sits at index 10000.
printfn "The 10001st prime number is %i." (nthPrime 10000)
printfn ""
printfn "Hit any key to continue."
System.Console.ReadKey() |> ignore
Sequences are your friend :)
You can refer my F# for Project Euler Wiki:
I got this first version:
/// First attempt at a primality test: 1 is rejected, otherwise every integer
/// in 2 .. floor(sqrt n) is tried as a divisor.
let isPrime n =
    if n = 1 then
        false
    else
        let m = int (sqrt (float n))
        // Like the for loop it replaces, the fold walks the whole range even
        // after a divisor has been found - there is no early exit here.
        seq { 2 .. m }
        |> Seq.fold (fun stillPrime i -> stillPrime && n % i <> 0) true
/// Smallest prime that is greater than or equal to n.
let rec nextPrime n =
    match isPrime n with
    | true -> n
    | false -> nextPrime (n + 1)
/// The 10001st prime: start from the first prime and step to the following
/// prime once for each value in 2 .. 10001.
let problem7 =
    seq { 2 .. 10001 }
    |> Seq.fold (fun prime _ -> nextPrime (prime + 1)) (nextPrime 2)
This version looks nicer, but it still does not break out of the loop early when the number turns out not to be prime. In the Seq module, the exists and forall functions support early termination:
/// Primality test with early termination: numbers up to 1 are rejected,
/// otherwise n is prime when nothing in 2 .. floor(sqrt n) divides it.
let isPrime n =
    if n <= 1 then
        false
    else
        let m = int (sqrt (float n))
        // Seq.forall stops at the first divisor it meets.
        // Equivalent: seq { 2 .. m } |> Seq.exists (fun i -> n % i = 0) |> not
        seq { 2 .. m } |> Seq.forall (fun i -> n % i <> 0)
Notice in this version of isPrime, the function is finally mathematically correct by checking numbers below 2.
Or you can use a tail recursive function to do the while loop:
/// Primality test written as a tail-recursive loop that stops at the first
/// divisor. Numbers below 2 are not prime.
let isPrime n =
    if n < 2 then
        false
    else
        let m = int(sqrt (float(n)))
        let rec loop i =
            if i > m then true          // ran out of candidates: prime
            elif n % i = 0 then false   // found a divisor: stop early
            else loop (i + 1)
        loop 2
A more functional version of problem7 is to use Seq.unfold to generate an infinite prime sequence and take nth element of this sequence:
/// The 10001st prime, read at zero-based index 10000 of a lazily unfolded
/// sequence of primes that starts at 2.
let problem7b =
    // Emit the current prime and carry the following one as the next state.
    let step p = Some (p, nextPrime (p + 1))
    Seq.unfold step 2 |> Seq.nth 10000
Here's my solution, which uses a tail-recursive loop pattern which always allows you to avoid mutables and gain break functionality: http://projecteulerfun.blogspot.com/2010/05/problem-7-what-is-10001st-prime-number.html
/// The 10001st prime, computed with tail-recursive loops instead of mutable
/// state; each recursive call passes the updated state as arguments.
let problem7a =
    let isPrime n =
        let nsqrt = n |> float |> sqrt |> int
        let rec hasNoDivisorFrom i =
            if i > nsqrt then true //break
            elif n % i = 0 then false //break
            //loop while neither of the above two conditions are true
            else hasNoDivisorFrom (i+1)
        n >= 2 && hasNoDivisorFrom 2
    let nthPrime n =
        // i: next odd candidate, p: latest prime found, count: primes found.
        // Seeded with the prime 2 already counted, so only odd candidates
        // from 3 on need testing and 1 is never mistaken for a prime.
        let rec loop i p count =
            if count >= n then p //break
            elif i |> isPrime then loop (i+2) i (count+1)
            else loop (i+2) p count
        loop 3 2 1
    nthPrime 10001
Now, to specifically address some of the questions you had in your solution.
The nthPrime function above lets you find the prime at an arbitrary position. Here is how it would look adapted to your approach of finding specifically the 1001st prime, using your variable names (the solution is tail-recursive and doesn't use mutables):
/// The 1001st prime, found with a tail-recursive loop and no mutable state.
/// Relies on an `isPrime` function being in scope.
let prime1001 =
    // i: next odd candidate, number: latest prime, accumulator: primes found.
    let rec nthPrime i number accumulator =
        if accumulator = 1001 then number
        //i is prime, so number becomes i in our next call and accumulator is incremented
        elif i |> isPrime then nthPrime (i+2) i (accumulator+1)
        //i is not prime, so number and accumulator do not change, just advance i to the next odd
        else nthPrime (i+2) number accumulator
    // Start with the prime 2 already counted, so the result does not depend on
    // how isPrime classifies 1.
    nthPrime 3 2 1
Yes, there is a better way to do square roots: write your own generic square root implementation (reference this and this post for G implementation):
///Finds the square root (integral or floating point) of n
///Does not work with BigRational
// g supplies the constants used below (g.zero, g.one, g.two); G<'a> is defined
// outside this excerpt - presumably a record of numeric constants for 'a.
let inline sqrt_of (g:G<'a>) n =
// The square root of zero is zero; returning early also avoids n / s with s = 0.
if g.zero = n then g.zero
else
// Initial guess s = n/2 + 1, and t = the average of s and n/s.
let mutable s:'a = (n / g.two) + g.one
let mutable t:'a = (s + (n / s)) / g.two
// Repeat the averaging step for as long as it lowers the estimate.
while t < s do
s <- t
let step1:'a = n/s
let step2:'a = s + step1
t <- step2 / g.two
s
// Generic entry point: obtains the constants for n's type from G_of
// (defined outside this excerpt) and delegates to sqrt_of.
let inline sqrtG n = sqrt_of (G_of n) n
// Specialisations obtained by fixing the constants argument. gn, gL, gI, gF
// and gM are defined outside this excerpt.
// NOTE(review): the suffixes presumably mirror F# literal suffixes
// (L = int64, I = bigint, F = float, M = decimal) - confirm.
let sqrtn = sqrt_of gn //this has suffix "n" because sqrt is not strictly integral type
let sqrtL = sqrt_of gL
let sqrtI = sqrt_of gI
let sqrtF = sqrt_of gF
let sqrtM = sqrt_of gM
First, in order to provide full disclosure, I want to point out that this is related to homework in a Machine Learning class. This question is not the homework question and instead is something I need to figure out in order to complete the bigger problem of creating an ID3 Decision Tree Algorithm.
I need to generate tree similar to the following when given a truth table
// Example of the kind of tree to be produced: a root node labelled "A0" with
// two subtrees whose leaves carry 0 or 1.
let learnedTree = Node(0,"A0", Node(2,"A2", Leaf(0), Leaf(1)), Node(1,"A1", Node(2,"A2", Leaf(0), Leaf(1)), Leaf(0)))
learnedTree is of type BinaryTree which I've defined as follows:
// Binary decision tree.
// Leaf carries an int (0 or 1 in the examples - presumably the predicted class).
// Node carries an int, a string label and two subtrees.
// NOTE(review): in the examples the int matches the digit in the label
// ("A2" with 2), so it looks like the attribute index - confirm.
type BinaryTree =
| Leaf of int
| Node of int * string * BinaryTree * BinaryTree
ID3 algorithms take into account various equations to determine where to split the tree, and I've got all that figured out, I'm just having trouble creating the learned tree from my truth table. For example if I have the following table
A1 | A2 | A3 | Class
1 0 0 1
0 1 0 1
0 0 0 0
1 0 1 0
0 0 0 0
1 1 0 1
0 1 1 0
And I decide to split on attribute A1 I would end up with the following:
(A1 = 1) A1 (A1 = 0)
A2 | A3 | Class A2 | A3 | Class
0 0 1 1 0 1
0 1 0 0 0 0
1 0 1 0 0 0
0 1 1
Then I would split the left side and split the right side, and continue the recursive pattern until the leaf nodes are pure and I end up with a tree similar to the following based on the splitting.
// The same example tree as shown earlier, repeated as the target shape after splitting.
let learnedTree = Node(0,"A0", Node(2,"A2", Leaf(0), Leaf(1)), Node(1,"A1", Node(2,"A2", Leaf(0), Leaf(1)), Leaf(0)))
Here is what I've kind of "hacked" together thus far, but I think I might be way off:
// Work-in-progress recursive split of a truth table (rows of floats) on the
// column at `index`. It only prints progress messages; no tree is built yet.
// isListPure is not defined in this snippet.
let rec createTree (listToSplit : list<list<float>>) index =
// Rows whose value in column `index` is 1.
let leftSideSplit =
listToSplit |> List.choose (fun x -> if x.Item(index) = 1. then Some(x) else None)
// Rows whose value in column `index` is 0.
// NOTE(review): rightSideSplit is never used below.
let rightSideSplit =
listToSplit |> List.choose (fun x -> if x.Item(index) = 0. then Some(x) else None)
if leftSideSplit.Length > 0 then
let pureCheck = isListPure leftSideSplit
// NOTE(review): all three branches recurse on leftSideSplit with index + 1,
// including the two "pure" ones, and nothing here checks that index + 1 is
// still a valid column.
if pureCheck = 0 then
printfn "%s" "Pure left node class 0"
createTree leftSideSplit (index + 1)
else if pureCheck = 1 then
printfn "%s" "Pure left node class 1"
createTree leftSideSplit (index + 1)
else
printfn "%s - %A" "Recursing Left" leftSideSplit
createTree leftSideSplit (index + 1)
else printfn "%s" "Pure left node class 0"
Should I be using pattern matching instead? Any tips/ideas/help? Thanks a bunch!
Edit: I've since posted an implementation of ID3 on my blog at:
http://blogs.msdn.com/chrsmith
Hey Jim, I've been wanting to write a blog post implementing ID3 in F# for a while - thanks for giving me an excuse. While this code doesn't implement the algorithm fully (or correctly), it should be sufficient for getting you started.
In general you have the right approach - representing each branch as a discriminated union case is good. And like Brian said, List.partition is definitely a handy function. The trick to making this work correctly is all in determining the optimal attribute/value pair to split on - and to do that you'll need to calculate information gain via entropy, etc.
// Name of the attribute a decision tests ("Weather" or "Temperature" below).
type Attribute = string
// Attribute value a decision compares against.
type Value = string
// One training example: two attributes plus the PlayTennis outcome.
type Record =
{
Weather : string
Temperature : string
PlayTennis : bool
}
// Single-line rendering of the record, used when printing leaves.
override this.ToString() =
sprintf
"{Weather = %s, Temp = %s, PlayTennis = %b}"
this.Weather
this.Temperature
this.PlayTennis
// A test of the form "attribute = value".
type Decision = Attribute * Value
// Decision tree: a Branch holds the decision and two subtrees (printID3Result
// labels them "predicate is true" and "predicate is false", in that order);
// a Leaf holds the records that ended up there.
type DecisionTreeNode =
| Branch of Decision * DecisionTreeNode * DecisionTreeNode
| Leaf of Record list
// ------------------------------------
// Splits a record list into an optimal split and the left / right branches.
// (This is where you use the entropy function to maximize information gain.)
// Record list -> Decision * Record list * Record list
/// Placeholder split: when every record shares one weather value, splits on
/// the smallest temperature value, otherwise on the smallest weather value.
/// Returns the decision, the records matching it, and the remaining records.
let bestSplit data =
    // Distinct values that `field` takes over the data.
    let distinctOf (field : Record -> string) =
        data |> List.map field |> Set.ofList
    let uniqueWeathers = distinctOf (fun item -> item.Weather)
    let uniqueTemperatures = distinctOf (fun item -> item.Temperature)
    // Partition the data on "field = value" and package the result.
    let splitOn attribute value (field : Record -> string) =
        let left, right = data |> List.partition (fun item -> field item = value)
        ((attribute, value), left, right)
    if uniqueWeathers.Count = 1 then
        splitOn "Temperature" uniqueTemperatures.MinimumElement (fun item -> item.Temperature)
    else
        splitOn "Weather" uniqueWeathers.MinimumElement (fun item -> item.Weather)
/// Builds the decision tree for `data`. Fewer than four records become a Leaf;
/// otherwise the data is split with bestSplit and both halves are processed
/// recursively.
let rec determineBranch data =
    if List.length data < 4 then
        Leaf(data)
    else
        // Use the entropy function to break the dataset on
        // the category / value that best splits the data
        let bestDecision, leftBranch, rightBranch = bestSplit data
        if List.isEmpty leftBranch || List.isEmpty rightBranch then
            // The split put every record on one side (all records share the
            // same attribute values); recursing on the same data would never
            // terminate, so stop here.
            Leaf(data)
        else
            Branch(
                bestDecision,
                determineBranch leftBranch,
                determineBranch rightBranch)
// ------------------------------------
/// Prints the tree with `indent` leading spaces; each nesting level adds four.
let rec printID3Result indent branch =
    let padding = new System.String(' ', indent)
    let childIndent = indent + 4
    match branch with
    | Branch(decision, lhs, rhs) ->
        printfn "%sBranch predicate [%A]" padding decision
        printfn "%sWhere predicate is true:" padding
        printID3Result childIndent lhs
        printfn "%sWhere predicate is false:" padding
        printID3Result childIndent rhs
    | Leaf(data) ->
        for item in data do
            printfn "%s%s" padding (item.ToString())
// ------------------------------------
// Sample training data: five records with weather, temperature and outcome.
let dataset =
[
{ Weather = "windy"; Temperature = "hot"; PlayTennis = false }
{ Weather = "windy"; Temperature = "cool"; PlayTennis = false }
{ Weather = "nice"; Temperature = "cool"; PlayTennis = true }
{ Weather = "nice"; Temperature = "cold"; PlayTennis = true }
{ Weather = "humid"; Temperature = "hot"; PlayTennis = false }
]
// Driver: print the input records, build the tree, then print it from indent 0.
printfn "Given input list:"
dataset |> List.iter (printfn "%A")
printfn "ID3 split resulted in:"
let id3Result = determineBranch dataset
printID3Result 0 id3Result
You can use List.partition instead of your two List.choose calls.
http://research.microsoft.com/en-us/um/cambridge/projects/fsharp/manual/FSharp.Core/Microsoft.FSharp.Collections.List.html
(or now http://msdn.microsoft.com/en-us/library/ee353738(VS.100).aspx )
It isn't clear to me that pattern matching will buy you much here; the input type (list of lists) and processing (partitioning and 'pureness' check) doesn't really lend itself to that.
And of course when you finally get the 'end' (a pure list) you need to create a tree, and then presumably this function will create a Leaf when the input only has one 'side' and it's 'pure', but create a Node out of the left-side and right-side results for every other input. Maybe. I didn't quite grok the algorithm completely.
Hopefully that will help steer you a little bit. May be useful to draw up a few smaller sample inputs and outputs to help work out the various cases of the function body.
Thanks Brian & Chris! I was actually able to figure this out and I ended up with the following. This calculates the information gain for determining the best place to split. I'm sure there are probably better ways for me to arrive at this solution especially around the chosen data structures, but this is a start. I plan to refine things later.
#light
open System
// Training data: one row per example. The code below reads the last value of
// each row as the class label (0. or 1.) and the values before it as attributes.
let trainList =
[
[1.;0.;0.;1.;];
[0.;1.;0.;1.;];
[0.;0.;0.;0.;];
[1.;0.;1.;0.;];
[0.;0.;0.;0.;];
[1.;1.;0.;1.;];
[0.;1.;1.;0.;];
[1.;0.;0.;1.;];
[0.;0.;0.;0.;];
[1.;0.;0.;1.;];
]
// Binary decision tree: Leaf carries an int; Node carries an int, a string
// label and two subtrees.
// NOTE(review): nothing in this listing constructs a BinaryTree yet -
// createTree below only prints.
type BinaryTree =
| Leaf of int
| Node of int * string * BinaryTree * BinaryTree
/// Entropy (in bits) of the distribution obtained by dividing each count in
/// `nums` by their total. A zero count contributes nothing.
let entropyList (nums : seq<float>) =
    let total = Seq.sum nums
    let term x =
        if x = 0.00 then
            x
        else
            let share = x / total
            -(share * Math.Log(share, 2.))
    nums |> Seq.sumBy term
/// Entropy of the class label (the last value of each row), computed from the
/// number of rows labelled 1. and the number labelled 0.
let entropyBinaryList (dataListOfLists:list<list<float>>) =
    let classLabels =
        dataListOfLists |> List.map (fun row -> row.Item(row.Length - 1))
    // Number of rows carrying the given label, as a float.
    let countOf label =
        classLabels |> List.filter (fun c -> c = label) |> List.length |> float
    entropyList [countOf 1.; countOf 0.]
/// Weighted class entropies after splitting on column `attributeNumber`:
/// returns [entropy of the rows with value 0.; entropy of the rows with
/// value 1.], each multiplied by that group's share of the rows in either group.
let conditionalEntropy (dataListOfLists:list<list<float>>) attributeNumber =
    let rowsWhere value =
        dataListOfLists |> List.filter (fun row -> row.Item(attributeNumber) = value)
    let noRows = rowsWhere 0.
    let yesRows = rowsWhere 1.
    let total = float noRows.Length + float yesRows.Length
    let weighted rows =
        entropyBinaryList rows * (float (List.length rows) / total)
    [weighted noRows; weighted yesRows]
/// Index of the attribute column with the highest information gain, where
/// gain = class entropy of all rows minus the summed conditional entropies
/// for that column. The last column (the class label) is not a candidate.
let findBestSplitIndex(listOfInstances : list<list<float>>) =
    // The entropy before splitting is the same for every attribute, so compute it once.
    let baseEntropy = entropyBinaryList listOfInstances
    [0..(listOfInstances.Item(0).Length - 2)]
    |> List.map (fun attribute ->
        (attribute, baseEntropy - (List.sum (conditionalEntropy listOfInstances attribute))))
    |> List.maxBy snd
    |> fst
/// Classifies the rows by their class label (the last value of each row):
/// 1 when every row is labelled 1. (this includes the empty list),
/// 0 when no row is labelled 1., and -1 when the labels are mixed.
let isListPure (listToCheck : list<list<float>>) =
    let positives =
        listToCheck
        |> List.filter (fun row -> row.Item(row.Length - 1) = 1.)
        |> List.length
    match positives with
    | count when count = listToCheck.Length -> 1
    | 0 -> 0
    | _ -> -1
/// Recursively splits the rows on the attribute with the highest information
/// gain, printing what a tree builder would do at each step: a pure set is
/// reported as a leaf, an impure set is split and both parts are processed.
let rec createTree (listToSplit : list<list<float>>) =
    let reportCannotSplit () =
        printfn "%s" "Not Pure, but can't split choose based on heuristics - Leaf(0 or 1)"
    match isListPure listToSplit with
    | 0 -> printfn "%s" "Pure - Leaf(0)"
    | 1 -> printfn "%s" "Pure - Leaf(1)"
    | _ ->
        printfn "%A - is not pure" listToSplit
        if listToSplit.Length > 1 then // There are attributes we can split on
            // Chose best place to split list
            let splitIndex = findBestSplitIndex(listToSplit)
            printfn "spliting at index %A" splitIndex
            let leftSideSplit =
                listToSplit |> List.filter (fun x -> x.Item(splitIndex) = 1.)
            let rightSideSplit =
                listToSplit |> List.filter (fun x -> x.Item(splitIndex) = 0.)
            if leftSideSplit.Length = listToSplit.Length
               || rightSideSplit.Length = listToSplit.Length then
                // Even the best attribute leaves every row on one side (rows
                // with identical attributes but different labels). Recursing
                // on the unchanged list would never terminate.
                reportCannotSplit ()
            else
                createTree leftSideSplit
                createTree rightSideSplit
        else
            reportCannotSplit ()