Is there an idiomatic way in F# to look ahead in a list/seq/array and use the information learned in the processing of the current item? In my scenario it would also be necessary to mutate (or otherwise store the fact it was changed) the ahead item so it is processed correctly in turn. I'm implementing some rather silly business rules and such a pattern or technique would be useful.
Right now I'm using an accumulator to store the information and then mutating items of an array as I process each. This feels a bit clumsy as you can see in the simplified example below. The actual business rules for the problem I'm solving are more complex, so I would rather not trudge down this path if there is a better way. Essentially, I want to get rid of graceMonths in the Acc type and instead solve those months by looking ahead in the list/seq/array.
The mock example: Workers get some type of bonus when they reach a desired level of production each month. If they fail to meet the desired level they can make up for it by exceeding the level in following months. Likewise, they can bank excess production for use in future months where they fall short. The following script shows an example.
type CalendarMonth =
{ year : int
month : int }
type InMonth =
{ month : CalendarMonth
prodCount : int }
type OutMonth =
{ month : CalendarMonth
prodCount : int
borrowedFrom : InMonth list
metProd : bool }
type OutMonthAcc =
{ outMonth : OutMonth
notUsed : InMonth list }
type IndexOutMonth =
{ index : int
outMonth : OutMonth }
type Acc =
{ index : int
graceMonths : IndexOutMonth list
bankedProd : InMonth list
arrRef : OutMonth array }
type GraceAcc =
{ processed : IndexOutMonth list
notUsed : InMonth list }
let createMonth y m c =
{ InMonth.month =
{ year = y
month = m }
prodCount = c }
let toOutPutMonth (x : InMonth) =
{ month = x.month
prodCount = x.prodCount
borrowedFrom = []
metProd = false }
let toSimple (x : OutMonth) = sprintf "year: %i, month: %i, metProd: %b" x.month.year x.month.month x.metProd
/// Tries to bring month `m` up to `desiredProd` by borrowing production from the
/// banked months in `bank`, visiting them in list order.
///
/// Returns an `OutMonthAcc` whose `outMonth` is `m` with the borrowed amounts recorded
/// in `borrowedFrom` (and `metProd = true` once the target is reached), and whose
/// `notUsed` holds the bank entries, or leftover parts of entries, that were not consumed.
/// `notUsed` is built by prepending, so it comes back in reverse bank order.
let solveWithBanked desiredProd bank m =
    // Fold step: offer one banked month to the month being solved.
    let useProd (acc : OutMonthAcc) inMonth =
        let current = acc.outMonth
        if current.metProd then
            // Target already reached: pass the banked month through untouched.
            { acc with notUsed = inMonth :: acc.notUsed }
        else
            let borrowed = current.borrowedFrom |> List.sumBy (fun x -> x.prodCount)
            let needed = desiredProd - (current.prodCount + borrowed)
            match inMonth.prodCount with
            | available when available < needed ->
                // Not enough to finish: consume the whole banked month and keep going.
                { acc with outMonth = { current with borrowedFrom = inMonth :: current.borrowedFrom } }
            | available when available > needed ->
                // More than enough: borrow exactly `needed` and return the rest to the bank.
                // (Previously the leftover amount was recorded as the borrowed amount.)
                let borrowedPart = { inMonth with prodCount = needed }
                let leftover = { inMonth with prodCount = available - needed }
                { OutMonthAcc.outMonth =
                    { current with borrowedFrom = borrowedPart :: current.borrowedFrom
                                   metProd = true }
                  notUsed = leftover :: acc.notUsed }
            | _ ->
                // Exactly enough: consume the whole banked month and mark the target met.
                { acc with outMonth =
                            { current with borrowedFrom = inMonth :: current.borrowedFrom
                                           metProd = true } }
    bank |> List.fold useProd { OutMonthAcc.outMonth = m
                                notUsed = [] }
// Tries to settle the still-unmet months in graceLst using the production in bank.
// Months are visited in ascending index order. The result's `processed` holds every
// grace month (updated when it could be met, unchanged otherwise) and `notUsed` holds
// the bank entries that remain after the successful borrowings.
let solveGrace desiredProd bank (graceLst : IndexOutMonth list) =
// Fold step: try one grace month against whatever production is still unused.
let useBank acc iOutMonth =
let result = iOutMonth.outMonth |> solveWithBanked desiredProd acc.notUsed
if result.outMonth.metProd then
let iMonth =
{ index = iOutMonth.index
outMonth = result.outMonth }
// Month is now met: keep the updated month and only the production it left over.
{ processed = iMonth :: acc.processed
notUsed = result.notUsed }
// Month is still short: discard the partial borrowing, so the bank is left untouched.
else { acc with processed = iOutMonth :: acc.processed }
graceLst
|> List.sortBy (fun x -> x.index)
|> List.fold useBank { processed = []
notUsed = bank }
// Fold step over the months array: processes month m at position acc.index and
// returns the accumulator for the next month. Solved months are written back into
// acc.arrRef in place; months that cannot be solved yet are parked in acc.graceMonths.
let solve desiredProd acc m =
match m.prodCount < desiredProd with
| true -> // below the target: try to cover the shortfall from banked production
let result = m |> solveWithBanked desiredProd acc.bankedProd
if result.outMonth.metProd then
acc.arrRef.[acc.index] <- result.outMonth
{ acc with index = acc.index + 1
bankedProd = result.notUsed }
else
// Bank was insufficient: leave the bank as it was and remember the month for later.
let iMonth =
{ IndexOutMonth.index = acc.index
outMonth = m }
{ acc with index = acc.index + 1
graceMonths = iMonth :: acc.graceMonths }
| false -> // at or above the target: the month is met and any excess is banked
let newM =
{ index = acc.index
outMonth = { m with metProd = true } }
// Excess production of this month (0 when the target is hit exactly).
let newIn =
{ InMonth.month = m.month
prodCount = m.prodCount - desiredProd }
// Use the enlarged bank to retry the months that were parked earlier.
let result = acc.graceMonths |> solveGrace desiredProd (newIn :: acc.bankedProd)
let solved, unsolved = result.processed |> List.partition (fun x -> x.outMonth.metProd)
// Write this month and every newly solved grace month back into the array.
newM :: solved |> List.iter (fun x -> acc.arrRef.[x.index] <- x.outMonth)
{ acc with index = acc.index + 1
graceMonths = unsolved
bankedProd = result.notUsed }
let jan = createMonth 2013 01 4
let feb = createMonth 2013 02 4
let mar = createMonth 2013 03 6
let apr = createMonth 2013 04 7
let may = createMonth 2013 05 4
let jun = createMonth 2013 06 4
let arr =
jan :: feb :: mar :: apr :: may :: jun :: []
|> Array.ofList
|> Array.map toOutPutMonth
arr |> Array.fold (solve 5) { index = 0
graceMonths = []
bankedProd = []
arrRef = arr }
let result =
arr
|> Array.map toSimple
|> List.ofArray
The value of result should show all months met production except June. Is this the right approach in F# or is there a better way?
This is the approach I would try here:
Calculate the differences between the expected and actual amounts for each month upfront,
Split the months into three groups - the ones that are below the quota, above the quota and ones that exactly make the quota,
Attempt to balance the ones below the quota with the ones above the quota.
The first two points seem pretty self-explanatory to me, as for the third one, here's the draft of the balance function and an example usage:
/// Three-way comparison of a pair: `Gt` when a > b, `Eq` when a = b, otherwise `Lt`.
let (|Lt|Eq|Gt|) (a, b) =
    if a > b then Gt
    elif a = b then Eq
    else Lt

/// Walks `below` (items with a required amount) and `above` (items with an available
/// amount) in parallel, moving every item that is fully settled onto `balanced`.
/// Stops as soon as either list is exhausted and returns what is left of both lists
/// together with the accumulated `balanced` list.
let rec balance below above balanced =
    match below, above with
    | [], _
    | _, [] -> below, above, balanced
    | (needy, required) :: restBelow, (donor, available) :: restAbove ->
        match required, available with
        | Eq ->
            // Donor covers the need exactly: both are settled.
            balance restBelow restAbove (needy :: donor :: balanced)
        | Lt ->
            // Donor has more than needed: the needy item is settled, the donor keeps the rest.
            balance restBelow ((donor, available - required) :: restAbove) (needy :: balanced)
        | Gt ->
            // Donor is used up: it is settled, the needy item still needs the difference.
            balance ((needy, required - available) :: restBelow) restAbove (donor :: balanced)
balance [("a", 4); ("b", 1)] [ ("c", 2); ("d", 2) ] [ "e" ]
balance [("a", 1); ("b", 1)] [ ("c", 2); ("d", 2) ] [ "e" ]
Essentially you walk the two lists in parallel, "taking" from the one and "adding" to the other, until you run out of either one. What remains is the best attempt at making things balanced.
Typically you want to use collection APIs like the List module when writing F# code, but it's useful to remember that you can always fall back to "raw" recursion when your use case doesn't seem to fit into existing schemes.
Related
I'm trying to return a list from a function, but I'm getting an error that says that a unit was expected instead. Also, I would like to know if this code appears to be structured correctly in general.
code:
let rec calculateVariants (attList: NewProductAttributeInfo list) (activeCount: int)
(currentList: (int * NewProductAttributeInfo) list) =
// group attribute list by category id
let attGrouped = attList |> List.groupBy (fun x -> x.AttributeCategoryId)
// define mutable list
let mutable stageList = currentList
// begin iteration
for catId,details in attGrouped do
for d in details do
if activeCount = 0
then stageList <- (activeCount,d) :: stageList
let groupLength = attGrouped.Length
if (activeCount + 1) <= groupLength
then
let selectCat,selectDetails = attGrouped.[activeCount + 1]
selectDetails
|> List.filter (fun x ->
stageList
|> List.exists (fun (x') ->
not(x' = (activeCount,x))))
|> (fun x ->
match x with
| [] -> ()
| head :: tail ->
stageList <- (activeCount, head) :: stageList
let currentCategory = activeCount + 1
calculateVariants attList currentCategory stageList
)
stageList // <-- error Unit expected
Both branches of `if .. then .. else` must return the same type. If you omit the `else` branch, the compiler assumes that the expression returns unit. Add an `else` branch that returns a list.
Edit:
Given your problem description, the easiest way would be something like this:
type NewProductAttributeInfo = {AttributeCategoryId: string; AttributeId: string}
let products = [ { AttributeCategoryId = "Size"; AttributeId = "S"};
{ AttributeCategoryId = "Mat"; AttributeId = "Linen" };
{ AttributeCategoryId = "Mat"; AttributeId = "Poliester" };
{ AttributeCategoryId = "Color"; AttributeId = "White" };
{ AttributeCategoryId = "Color"; AttributeId = "Green" };
{ AttributeCategoryId = "Mat"; AttributeId = "Linen" };
{ AttributeCategoryId = "Mat"; AttributeId = "Cotton" };
{ AttributeCategoryId = "Mat"; AttributeId = "Poliester" };
{ AttributeCategoryId = "Size"; AttributeId = "XL" } ]
let group list =
list
|> Set.ofList // Provides uniqueness of attribute combinations
|> Seq.groupBy (fun x -> x.AttributeCategoryId) // Grouping by CatId
|> List.ofSeq
let res = group products
Result:
val it : (string * seq<NewProductAttributeInfo>) list =
[("Color", seq [{AttributeCategoryId = "Color";
AttributeId = "Green";}; {AttributeCategoryId = "Color";
AttributeId = "White";}]);
("Mat",
seq
[{AttributeCategoryId = "Mat";
AttributeId = "Cotton";}; {AttributeCategoryId = "Mat";
AttributeId = "Linen";};
{AttributeCategoryId = "Mat";
AttributeId = "Poliester";}]);
("Size", seq [{AttributeCategoryId = "Size";
AttributeId = "S";}; {AttributeCategoryId = "Size";
AttributeId = "XL";}])]
This is the solution that I came up with. It works, but I'm sure it can be optimized quite a bit. I have a duplicate issue that is solved with the Set.ofList function externally after this code runs, which I'm still working on.
type NewProductAttributeInfo = {
AttributeId : string;
AttributeCategoryId : string
}
/// Returns `curList` unchanged when it already has `listLength` elements. Otherwise
/// appends the third component of the first tuple in `attList` whose index is at
/// least `curIdx` and whose attribute is not already in `curList`, then recurses.
///
/// Raises ArgumentException when no such tuple exists (previously this surfaced as
/// the generic "list was empty" error from List.head).
let rec private returnVariant (curIdx: int) (listLength: int)
                              (attList: (int * NewProductAttributeInfo * NewProductAttributeInfo) list)
                              (curList: NewProductAttributeInfo list) =
    if curList.Length = listLength then
        curList
    else
        // tryFind stops at the first match instead of filtering the whole list.
        let candidate =
            attList
            |> List.tryFind (fun (idx, _, att) ->
                idx >= curIdx && not (List.contains att curList))
        match candidate with
        | Some (idx, _, att) ->
            let newList = curList @ [ att ]
            // NOTE(review): the target length passed on is newList.Length, so the
            // recursive call returns immediately and at most one attribute is ever
            // appended per top-level call. Kept as-is because callers rely on it;
            // confirm whether `listLength` was intended here.
            returnVariant idx newList.Length attList newList
        | None ->
            invalidArg "attList" "no unused attribute found at or after the current index"
let rec calculateVariants (attList: NewProductAttributeInfo list)
(currentList: (int * NewProductAttributeInfo * NewProductAttributeInfo) list) =
// group attribute list by category id
let attGrouped = attList |> List.groupBy (fun x -> x.AttributeCategoryId)
let (firstGroupCatId,firstGroupDetails) = attGrouped.[0]
match currentList with
| [] ->
let rawVariants = [for nxt in 0 .. (attGrouped.Length - 1) do
if nxt > 0
then
// begin iteration
for d in firstGroupDetails do
let _,det = attGrouped.[nxt]
for det' in det do
yield (nxt, d, det')
]
calculateVariants attList rawVariants
| x ->
let groupLength = x |> List.groupBy (fun (idx,d0,nxtD) -> idx)
|> List.length |> ((+)1)
let sortedGroup = x |> List.sortBy (fun (x,y,z) -> x)
if groupLength > 2
then // below is the block that generates the duplicates
[for att in sortedGroup do
for attCompare in sortedGroup do
let idx1,att1,att2 = att
let idx2,attC1,attC2 = attCompare
if idx2 > idx1 && att2 <> attC2
then
let idString =
returnVariant idx2 groupLength x [att1; att2; attC2]
|> List.map (fun nl -> nl.AttributeId)
yield String.concat "," idString
]
else
[
for att in sortedGroup do
let idx1,att1,att2 = att
let idString =
returnVariant idx1 groupLength x [att1; att2]
|> List.map (fun nl -> nl.AttributeId)
yield String.concat "," idString
]
What would the time complexity be of these two algorithms?
/// Left fold: combines the accumulator `a` with each element from left to right
/// using `f accumulator element`.
let rec fol f a = function
    | [] -> a
    | x :: xs -> fol f (f a x) xs

/// Concatenates a list of lists by left-folding list append (`@`), starting from
/// the empty list. Each step evaluates `accumulated @ next`.
let mergelist xs = fol (@) [] xs
and
/// Right fold: combines each element with the result of folding the rest of the
/// list, using `f element folded`, with `a` as the value for the empty list.
let rec folB f xs a =
    match xs with
    | [] -> a
    | y :: ys -> f y (folB f ys a)

/// Concatenates a list of lists by right-folding list append (`@`), ending with
/// the empty list. Each step evaluates `next @ alreadyMergedTail`.
let mergelist2 xs = folB (@) xs []
And how would I be able to test it myself?
Should return something like
mergelist [[1;2];[];[3];[4;5;6]];;
val it : int list = [1; 2; 3; 4; 5; 6]
Here is a quick&dirty snippet of how you can compare the two operations with n lists of length 3 each:
/// Left fold over a list: `f accumulator element`, left to right.
let rec fol f a xs =
    match xs with
    | x :: rest -> fol f (f a x) rest
    | [] -> a

/// Right fold over a list: `f element foldedRest`, with `a` for the empty list.
let rec folB f xs a =
    match xs with
    | y :: rest -> f y (folB f rest a)
    | [] -> a

/// Merges `n` copies of [1;2;3] once with `fol` and once with `folB`, using an
/// append that counts every cons it performs. Returns the two cons counts as
/// (count for fol, count for folB).
let compareThemFor n =
    let testList = List.replicate n [1; 2; 3]
    let consCalls = ref 0
    // Cons that records each invocation.
    let countingCons x xs =
        consCalls.Value <- consCalls.Value + 1
        x :: xs
    // Append built from the counting cons: costs one cons per element of `front`.
    let countingAppend front back = List.foldBack countingCons front back
    // Runs one merge from a clean counter and reports how many cons calls it made.
    let measure run =
        consCalls.Value <- 0
        run () |> ignore
        consCalls.Value
    let leftCount = measure (fun () -> fol countingAppend [] testList)
    let rightCount = measure (fun () -> folB countingAppend testList [])
    (leftCount, rightCount)
and this is what you will get:
> compareThemFor 2;;
val it : int * int = (3, 6)
> compareThemFor 3;;
val it : int * int = (9, 9)
> compareThemFor 4;;
val it : int * int = (18, 12)
> compareThemFor 5;;
val it : int * int = (30, 15)
> compareThemFor 6;;
val it : int * int = (45, 18)
As you can see, the second is far better, and I hope the comments above help you understand why.
Just in case here is the n=3 version for mergelist:
mergelist [[1;2;3];[3;4;5];[6;7;8]]
{ second case in `fol` with `x=[1;2;3]` and `xs=[[3;4;5];[6;7;8]]` }
= fol (@) ([] @ [1;2;3]) [[3;4;5];[6;7;8]] // one @ of 0 elements = 0 operations
{ second case in `fol` with `x=[3;4;5]` and `xs=[[6;7;8]]` }
= fol (@) ([1;2;3] @ [3;4;5]) [[6;7;8]] // one @ of 3 elements = 3 operations
{ second case in `fol` with `x=[6;7;8]` and `xs=[]` }
= fol (@) ([1;2;3;3;4;5] @ [6;7;8]) [] // one @ of 6 elements = 6 operations
{ first case }
= [1;2;3;3;4;5;6;7;8] // 0+3+(3+3)=9 Operations Total
Please note that you prepend [1;2;3] multiple times ...
I am using Array.Parallel.map on a function but find that it is not executing at anywhere near full processor capacity. I am assuming this is because the function creates a lot of objects when running List.map and List.map2. Would this be causing a synchronization issue and is there a more appropriate way of doing this? At the moment the only way I can think of getting around this is by running each process as a separate executable using something like xargs under Linux.
I put together the script below to demonstrate the problem. It is a very basic data categorizer which relies on a field having a certain value as a rule to determine if this will predict a category:
open System
type CategoryAssessment =
{ fieldIndex: int
value: int
ruleAssessments: list<int> }
let InitAssessment categorizeFields rules =
let ruleAssessments = List.init (List.length rules) (fun x -> 0)
List.map (fun categorizeField ->
let fieldIndex, categoryValue = categorizeField
{ CategoryAssessment.fieldIndex = fieldIndex;
value = categoryValue;
ruleAssessments = ruleAssessments })
categorizeFields
let AssessCategory ruleMatches (row : int[]) categoryAssessment =
let fieldIndex = categoryAssessment.fieldIndex
let categoryValue = categoryAssessment.value
let categoryMatch = categoryValue = row.[fieldIndex]
let newRuleAssessments =
List.map2 (fun ruleAssessment ruleMatch ->
if ruleMatch = categoryMatch then
ruleAssessment + 1
else
ruleAssessment)
categoryAssessment.ruleAssessments
ruleMatches
{ categoryAssessment with ruleAssessments = newRuleAssessments }
let MatchRule (row : int[]) rule =
let fieldIndex, eqVal = rule
row.[fieldIndex] = eqVal
let Assess categorizeFields rules input =
printfn "START - Assess"
let d =
Array.fold (fun categoryAssessment row ->
let ruleMatches = List.map (MatchRule row) rules
List.map (AssessCategory ruleMatches row) categoryAssessment)
(InitAssessment categorizeFields rules)
input
printfn "END - Assess"
d
let JoinAssessments assessments =
let numAssessments = Array.length assessments
Array.fold (fun accAssessment assessment ->
List.map2 (fun accCategory category ->
let newRuleAssessments =
List.map2 (+)
accCategory.ruleAssessments
category.ruleAssessments
{ accCategory with
ruleAssessments = newRuleAssessments })
accAssessment
assessment)
assessments.[0]
assessments.[1..(numAssessments-1)]
let numRecords = 10000
let numFields = 20
let numSplits = 10
let numRules = 10000
let inputs = Array.create numSplits
[| for i in 1 .. (numRecords / numSplits) ->
[| for j in 1 .. numFields ->
(i % 10) + j |] |]
let categorizeFields = [ (1, 6); (2, 3); (2, 4); (3, 2) ]
let rules = [ for i in 1 .. numRules -> (i % numFields, i) ]
let assessments =
Array.Parallel.map (Assess categorizeFields rules) inputs
|> JoinAssessments
printfn "Assessments: %A" assessments
0
After a fair bit of investigation, the ultimate answer to my question seems to be to find a way of not creating lots of objects. The easiest change to do this is moving to using arrays instead of lists. I have written up my findings more fully in an article: Beware of Immutable Lists for F# Parallel Processing.
The above program when altered as follows, runs better between threads and runs much quicker even on a single thread. Further improvements can be made by making the ruleAssessments field mutable as demonstrated in the referenced article.
open System
type CategoryAssessment =
{ fieldIndex: int
value: int
ruleAssessments: int[] }
/// Builds the starting assessment: one `CategoryAssessment` per
/// (fieldIndex, categoryValue) pair in `categorizeFields`, each with a rule counter
/// array of `Array.length rules` zeros.
let InitAssessment categorizeFields rules =
    let ruleCount = Array.length rules
    categorizeFields
    |> Array.map (fun (fieldIndex, categoryValue) ->
        { CategoryAssessment.fieldIndex = fieldIndex
          value = categoryValue
          // Arrays are mutable, so every category gets its own zeroed counter array.
          // Sharing one instance would let an in-place update to one category's
          // counters show up in all of them.
          ruleAssessments = Array.zeroCreate ruleCount })
let AssessCategory ruleMatches (row : int[]) categoryAssessment =
let fieldIndex = categoryAssessment.fieldIndex
let categoryValue = categoryAssessment.value
let categoryMatch = categoryValue = row.[fieldIndex]
let newRuleAssessments =
Array.map2 (fun ruleAssessment ruleMatch ->
if ruleMatch = categoryMatch then
ruleAssessment + 1
else
ruleAssessment)
categoryAssessment.ruleAssessments
ruleMatches
{ categoryAssessment with ruleAssessments = newRuleAssessments }
let MatchRule (row : int[]) rule =
let fieldIndex, eqVal = rule
row.[fieldIndex] = eqVal
let Assess categorizeFields rules input =
printfn "START - Assess"
let d =
Array.fold (fun categoryAssessment row ->
let ruleMatches = Array.map (MatchRule row) rules
Array.map (AssessCategory ruleMatches row) categoryAssessment)
(InitAssessment categorizeFields rules)
input
printfn "END - Assess"
d
/// Merges the per-split assessments into one by adding the rule counters of
/// corresponding categories element-wise. `fieldIndex` and `value` are taken from
/// the first assessment.
///
/// Raises ArgumentException when `assessments` is empty (Array.reduce), or when the
/// arrays being combined differ in length (Array.map2).
let JoinAssessments assessments =
    // Array.reduce seeds the fold with the first element, so no explicit
    // head-plus-slice split (and no copy of the tail) is needed.
    assessments
    |> Array.reduce (fun accAssessment assessment ->
        Array.map2
            (fun accCategory category ->
                { accCategory with
                    ruleAssessments =
                        Array.map2 (+) accCategory.ruleAssessments category.ruleAssessments })
            accAssessment
            assessment)
let numRecords = 10000
let numFields = 20
let numSplits = 10
let numRules = 10000
let inputs = Array.create numSplits
[| for i in 1 .. (numRecords / numSplits) ->
[| for j in 1 .. numFields ->
(i % 10) + j |] |]
let categorizeFields = [| (1, 6); (2, 3); (2, 4); (3, 2) |]
let rules = [| for i in 1 .. numRules -> (i % numFields, i) |]
let assessments =
Array.Parallel.map (Assess categorizeFields rules) inputs
|> JoinAssessments
printfn "Assessments: %A" assessments
0
This is a version of your program that doesn't require mutability and uses nearly all of the 4 cpus on my iMac.
To pull it off, it's driven by assessing each rule in parallel, not by processing records. That also required the input array to be transposed making it be fields by records.
open System
type CategoryAssessment =
{ fieldIndex: int
value: int
ruleAssessments: list<int> }
let MatchRule rVal fVal =
rVal = fVal
/// Counts the records for which "the field at `rIndex` equals `rVal`" agrees with
/// the category match flag in `cMatches`. `inputs` is indexed as inputs.[field].[record].
let AssessRule cMatches (inputs: int[][]) (rIndex, rVal) =
    // (Start/end tracing with printfn was removed by the author: it used more CPU
    // than the computation itself.)
    let fieldValues = inputs.[rIndex]
    // map2 is kept so that a length mismatch between the two arrays still raises.
    Array.map2 (fun categoryMatched fieldValue -> (rVal = fieldValue) = categoryMatched) cMatches fieldValues
    |> Array.sumBy (fun agrees -> if agrees then 1 else 0)
/// Produces one `CategoryAssessment` per (fieldIndex, value) pair in
/// `categorizeFields`. For each category the rules are assessed in parallel with
/// `AssessRule`, and the per-rule counts are stored as a list.
let Assess categorizeFields rules (inputs: int[][]) =
    let assessCategory (catIndex, catValue) =
        // Per-record flag: does this record's category field hold the category value?
        let catMatches = Array.map (fun v -> v = catValue) inputs.[catIndex]
        let perRule = Array.Parallel.map (AssessRule catMatches inputs) rules
        { CategoryAssessment.fieldIndex = catIndex
          value = catValue
          ruleAssessments = Array.toList perRule }
    List.map assessCategory categorizeFields
let numRecords = 10000
let numFields = 20
let numRules = 10000
let inputs = [| for j in 1 .. numFields ->
[| for i in 1 .. numRecords -> (i % 10) + j |] |]
let categorizeFields = [ (1, 6); (2, 3); (2, 4); (3, 2) ]
let rules = [| for i in 1 .. numRules -> (i % numFields, i) |]
let assessments = Assess categorizeFields rules inputs
printfn "Assessments: %A" assessments
Assessing by rule allowed the summing of a single integer across all records for a given rule, avoiding mutable state and extra memory allocations.
I used a lot of array iteration to get the speed up but didn't remove all the lists.
I fear I changed the functionality while refactoring or made assumptions that can't be applied to your actual problem, however I do hope it's a useful example.
Ok, this looks like it should be easy, but I'm just not getting it. If I have a sequence of numbers, how do I generate a new sequence made up of the running totals? eg for a sequence [1;2;3;4], I want to map it to [1;3;6;10]. In a suitably functional way.
Use List.scan:
/// Running totals of an int list: element i of the result is the sum of the first
/// i + 1 inputs. The seed 0 emitted by List.scan is dropped with List.tail.
let runningTotal numbers =
    numbers
    |> List.scan (fun total n -> total + n) 0
    |> List.tail
[1; 2; 3; 4]
|> runningTotal
|> printfn "%A"
Seq.scan-based implementation:
/// Running totals of an int sequence, seeded with the first element so that no
/// leading zero has to be dropped. Returns an empty sequence for empty input
/// instead of failing on the missing head.
let runningTotal seq' =
    match Seq.tryHead seq' with
    | None -> Seq.empty
    | Some first -> Seq.scan (+) first (Seq.skip 1 seq')
{ 1..4 }
|> runningTotal
|> printfn "%A"
Another variation using Seq.scan (Seq.skip 1 gets rid of the leading zero):
> {1..4} |> Seq.scan (+) 0 |> Seq.skip 1;;
val it : seq<int> = seq [1; 3; 6; 10]
> Seq.scan (fun acc n -> acc + n) 0 [1;2;3;4];;
val it : seq<int> = seq [0; 1; 3; 6; ...]
With lists:
> [1;2;3;4] |> List.scan (fun acc n -> acc + n) 0 |> List.tail;;
val it : int list = [1; 3; 6; 10]
Edit: Another way with sequences:
/// Lazily yields the running total after each element of `s`.
/// The counter is created inside the sequence expression, so every enumeration
/// starts again from zero.
let sum s =
    seq {
        let total = ref 0
        for item in s do
            total.Value <- total.Value + item
            yield total.Value
    }
Yes, there's a mutable variable, but I find it more readable (if you want to get rid of the leading 0).
Figured it was worthwhile to share how to do this with Record Types in case that's also what you came here looking for.
Below is a fictitious example demonstrating the concept using runner laps around a track.
type Split = double
type Lap = { Num : int; Split : Split }
type RunnerLap = { Lap : Lap; TotalTime : double }
let lap1 = { Num = 1; Split = 1.23 }
let lap2 = { Num = 2; Split = 1.13 }
let lap3 = { Num = 3; Split = 1.03 }
let laps = [lap1;lap2;lap3]
/// Turns a sequence of laps into a sequence of `RunnerLap` values carrying the
/// cumulative time. Built on Seq.scan, so the first element of the output is the
/// initial state (lap number 0, total time 0.0).
let runnerLapsAccumulator =
    // initial state
    let start = { Lap = { Num = 0; Split = 0.0 }; TotalTime = 0.0 }
    // accumulator: record the lap and add its split to the running total
    let step previous lap =
        { Lap = lap; TotalTime = previous.TotalTime + lap.Split }
    Seq.scan step start
let runnerLaps = laps |> runnerLapsAccumulator
printfn "%A" runnerLaps
Not sure this is the best way but it should do the trick
let input = [1; 2; 3; 4]

/// Running totals of `input`, produced with Seq.unfold. The unfold state is
/// (remaining elements, total so far); each step emits the new total.
let runningTotal =
    (input, 0)
    |> Seq.unfold (fun (remaining, total) ->
        match remaining with
        | [] ->
            None
        | h :: t ->
            let total = total + h
            // The whole (element, nextState) pair must be wrapped in Some. Written as
            // `total, (t, total) |> Some`, the pipe binds tighter than the comma and
            // only the state gets wrapped, which does not type-check against None.
            Some (total, (t, total)))
    |> List.ofSeq
First, in order to provide full disclosure, I want to point out that this is related to homework in a Machine Learning class. This question is not the homework question and instead is something I need to figure out in order to complete the bigger problem of creating an ID3 Decision Tree Algorithm.
I need to generate tree similar to the following when given a truth table
let learnedTree = Node(0,"A0", Node(2,"A2", Leaf(0), Leaf(1)), Node(1,"A1", Node(2,"A2", Leaf(0), Leaf(1)), Leaf(0)))
learnedTree is of type BinaryTree which I've defined as follows:
type BinaryTree =
| Leaf of int
| Node of int * string * BinaryTree * BinaryTree
ID3 algorithms take into account various equations to determine where to split the tree, and I've got all that figured out, I'm just having trouble creating the learned tree from my truth table. For example if I have the following table
A1 | A2 | A3 | Class
1 0 0 1
0 1 0 1
0 0 0 0
1 0 1 0
0 0 0 0
1 1 0 1
0 1 1 0
And I decide to split on attribute A1 I would end up with the following:
(A1 = 1) A1 (A1 = 0)
A2 | A3 | Class A2 | A3 | Class
0 0 1 1 0 1
0 1 0 0 0 0
1 0 1 0 0 0
0 1 1
Then I would split the left side and split the right side, and continue the recursive pattern until the leaf nodes are pure and I end up with a tree similar to the following based on the splitting.
let learnedTree = Node(0,"A0", Node(2,"A2", Leaf(0), Leaf(1)), Node(1,"A1", Node(2,"A2", Leaf(0), Leaf(1)), Leaf(0)))
Here is what I've kind of "hacked" together thus far, but I think I might be way off:
let rec createTree (listToSplit : list<list<float>>) index =
let leftSideSplit =
listToSplit |> List.choose (fun x -> if x.Item(index) = 1. then Some(x) else None)
let rightSideSplit =
listToSplit |> List.choose (fun x -> if x.Item(index) = 0. then Some(x) else None)
if leftSideSplit.Length > 0 then
let pureCheck = isListPure leftSideSplit
if pureCheck = 0 then
printfn "%s" "Pure left node class 0"
createTree leftSideSplit (index + 1)
else if pureCheck = 1 then
printfn "%s" "Pure left node class 1"
createTree leftSideSplit (index + 1)
else
printfn "%s - %A" "Recursing Left" leftSideSplit
createTree leftSideSplit (index + 1)
else printfn "%s" "Pure left node class 0"
Should I be using pattern matching instead? Any tips/ideas/help? Thanks a bunch!
Edit: I've since posted an implementation of ID3 on my blog at:
http://blogs.msdn.com/chrsmith
Hey Jim, I've been wanting to write a blog post implementing ID3 in F# for a while - thanks for giving me an excuse. While this code doesn't implement the algorithm in full (or correctly), it should be sufficient for getting you started.
In general you have the right approach - representing each branch as a discriminated union case is good. And like Brian said, List.partition is definitely a handy function. The trick to making this work correctly is all in determining the optimal attribute/value pair to split on - and to do that you'll need to calculate information gain via entropy, etc.
type Attribute = string
type Value = string
type Record =
{
Weather : string
Temperature : string
PlayTennis : bool
}
override this.ToString() =
sprintf
"{Weather = %s, Temp = %s, PlayTennis = %b}"
this.Weather
this.Temperature
this.PlayTennis
type Decision = Attribute * Value
type DecisionTreeNode =
| Branch of Decision * DecisionTreeNode * DecisionTreeNode
| Leaf of Record list
// ------------------------------------
// Splits a record list into an optimal split and the left / right branches.
// (This is where you use the entropy function to maximize information gain.)
// Record list -> Decision * Record list * Record list
/// Placeholder split: if every record has the same weather, partition on the
/// smallest temperature value; otherwise partition on the smallest weather value.
/// Returns ((attribute, value), records matching the value, remaining records).
let bestSplit data =
    // Distinct values of one attribute across all records.
    let distinctBy selector = data |> List.map selector |> Set.ofList
    let weathers = distinctBy (fun r -> r.Weather)
    let temperatures = distinctBy (fun r -> r.Temperature)
    // Partition the records on `selector record = value`.
    let splitOn attribute value selector =
        let matching, rest = data |> List.partition (fun r -> selector r = value)
        ((attribute, value), matching, rest)
    if weathers.Count = 1 then
        splitOn "Temperature" temperatures.MinimumElement (fun r -> r.Temperature)
    else
        splitOn "Weather" weathers.MinimumElement (fun r -> r.Weather)
/// Recursively builds the decision tree: fewer than 4 records become a Leaf,
/// otherwise the records are split with `bestSplit` and each side is built in turn.
let rec determineBranch data =
    if List.length data < 4 then
        Leaf(data)
    else
        // Use the entropy function to break the dataset on
        // the category / value that best splits the data
        let bestDecision, leftBranch, rightBranch = bestSplit data
        if List.isEmpty leftBranch || List.isEmpty rightBranch then
            // The split put every record on one side (e.g. all records share the same
            // weather and temperature). Recursing on the unchanged data would never
            // terminate, so stop here with a leaf.
            Leaf(data)
        else
            Branch(
                bestDecision,
                determineBranch leftBranch,
                determineBranch rightBranch)
// ------------------------------------
/// Prints the tree to stdout, indenting each nesting level by four more spaces.
/// Leaves print one record per line; branches print their predicate followed by
/// the "true" subtree and then the "false" subtree.
let rec printID3Result indent branch =
    let padding = System.String(' ', indent)
    let childIndent = indent + 4
    match branch with
    | Branch(decision, lhs, rhs) ->
        printfn "%sBranch predicate [%A]" padding decision
        printfn "%sWhere predicate is true:" padding
        printID3Result childIndent lhs
        printfn "%sWhere predicate is false:" padding
        printID3Result childIndent rhs
    | Leaf(data) ->
        for item in data do
            printfn "%s%s" padding (item.ToString())
// ------------------------------------
let dataset =
[
{ Weather = "windy"; Temperature = "hot"; PlayTennis = false }
{ Weather = "windy"; Temperature = "cool"; PlayTennis = false }
{ Weather = "nice"; Temperature = "cool"; PlayTennis = true }
{ Weather = "nice"; Temperature = "cold"; PlayTennis = true }
{ Weather = "humid"; Temperature = "hot"; PlayTennis = false }
]
printfn "Given input list:"
dataset |> List.iter (printfn "%A")
printfn "ID3 split resulted in:"
let id3Result = determineBranch dataset
printID3Result 0 id3Result
You can use List.partition instead of your two List.choose calls.
http://research.microsoft.com/en-us/um/cambridge/projects/fsharp/manual/FSharp.Core/Microsoft.FSharp.Collections.List.html
(or now http://msdn.microsoft.com/en-us/library/ee353738(VS.100).aspx )
It isn't clear to me that pattern matching will buy you much here; the input type (list of lists) and processing (partitioning and 'pureness' check) doesn't really lend itself to that.
And of course when you finally get to the 'end' (a pure list) you need to create a tree, and then presumably this function will create a Leaf when the input only has one 'side' and it's 'pure', but create a Node out of the left-side and right-side results for every other input. Maybe. I didn't quite grok the algorithm completely.
Hopefully that will help steer you a little bit. May be useful to draw up a few smaller sample inputs and outputs to help work out the various cases of the function body.
Thanks Brian & Chris! I was actually able to figure this out and I ended up with the following. This calculates the information gain for determining the best place to split. I'm sure there are probably better ways for me to arrive at this solution especially around the chosen data structures, but this is a start. I plan to refine things later.
#light
open System
let trainList =
[
[1.;0.;0.;1.;];
[0.;1.;0.;1.;];
[0.;0.;0.;0.;];
[1.;0.;1.;0.;];
[0.;0.;0.;0.;];
[1.;1.;0.;1.;];
[0.;1.;1.;0.;];
[1.;0.;0.;1.;];
[0.;0.;0.;0.;];
[1.;0.;0.;1.;];
]
type BinaryTree =
| Leaf of int
| Node of int * string * BinaryTree * BinaryTree
/// Entropy (base 2) of a collection of counts: the sum of -p * log2 p over the
/// counts, where p is each count divided by the total. Zero counts contribute 0.
let entropyList nums =
    let total = Seq.sum nums
    nums
    |> Seq.sumBy (fun count ->
        if count = 0.0 then
            0.0
        else
            let p = count / total
            -(p * Math.Log(p, 2.)))
/// Entropy of the class column (the last element of every row), counting only the
/// labels 1. ("yes") and 0. ("no").
let entropyBinaryList (dataListOfLists:list<list<float>>) =
    let classLabels =
        dataListOfLists |> List.map (fun row -> row.Item(row.Length - 1))
    // Number of rows carrying the given class label, as a float.
    let countOf label =
        classLabels |> List.filter (fun c -> c = label) |> List.length |> float
    entropyList [ countOf 1.; countOf 0. ]
/// Weighted class entropies after splitting on attribute `attributeNumber`.
/// Returns [weighted entropy of rows where the attribute is 0.;
///          weighted entropy of rows where the attribute is 1.],
/// each weighted by that subset's share of the rows in the two subsets.
let conditionalEntropy (dataListOfLists:list<list<float>>) attributeNumber =
    let rowsWhere value =
        dataListOfLists |> List.filter (fun row -> row.Item(attributeNumber) = value)
    let noRows = rowsWhere 0.
    let yesRows = rowsWhere 1.
    let total = float noRows.Length + float yesRows.Length
    let weighted (rows : list<list<float>>) =
        entropyBinaryList rows * (float rows.Length / total)
    [ weighted noRows; weighted yesRows ]
/// Returns the index of the attribute with the highest information gain, i.e. the
/// class entropy of the whole set minus the summed conditional entropies for that
/// attribute. The last column of each row is the class and is not a candidate.
/// The first attribute wins ties.
///
/// Raises on an empty `listOfInstances` or when rows contain only the class column.
let findBestSplitIndex(listOfInstances : list<list<float>>) =
    let attributeCount = listOfInstances.Item(0).Length - 1
    // The entropy of the whole set does not depend on the attribute, so compute it
    // once instead of once per candidate.
    let baseEntropy = entropyBinaryList listOfInstances
    [ 0 .. attributeCount - 1 ]
    |> List.maxBy (fun attribute ->
        baseEntropy - List.sum (conditionalEntropy listOfInstances attribute))
/// Classifies a set of rows by their class column (the last element of each row):
/// 1 when every row has class 1. (this includes the empty list), 0 when no row has
/// class 1., and -1 when the rows are mixed.
let isListPure (listToCheck : list<list<float>>) =
    let positives =
        listToCheck
        |> List.filter (fun row -> row.Item(row.Length - 1) = 1.)
        |> List.length
    match positives with
    | n when n = listToCheck.Length -> 1
    | 0 -> 0
    | _ -> -1
/// Walks the ID3 splitting process and prints what it does: pure sets are reported
/// as leaves, impure sets are split on the attribute chosen by `findBestSplitIndex`
/// and both halves are processed recursively.
///
/// A set that cannot be split any further is reported as a heuristic leaf: either
/// the rows have no attribute columns, or the chosen split would leave one side
/// identical to the input. Without this check such inputs recurse without end.
let rec createTree (listToSplit : list<list<float>>) =
    let cannotSplit () =
        printfn "%s" "Not Pure, but can't split choose based on heuristics - Leaf(0 or 1)"
    match isListPure listToSplit with
    | 0 -> printfn "%s" "Pure - Leaf(0)"
    | 1 -> printfn "%s" "Pure - Leaf(1)"
    | _ ->
        printfn "%A - is not pure" listToSplit
        // isListPure reports "mixed" only when some but not all rows have class 1.,
        // so the list is non-empty here and List.head is safe.
        let attributeCount = (List.head listToSplit).Length - 1
        if attributeCount < 1 then
            // Only the class column is left: there is nothing to split on.
            cannotSplit ()
        else
            // Choose the best place to split the list
            let splitIndex = findBestSplitIndex(listToSplit)
            let rowsWhere value =
                listToSplit |> List.filter (fun row -> row.Item(splitIndex) = value)
            let leftSideSplit = rowsWhere 1.
            let rightSideSplit = rowsWhere 0.
            if leftSideSplit.Length = listToSplit.Length
               || rightSideSplit.Length = listToSplit.Length then
                // The split makes no progress; recursing would loop forever.
                cannotSplit ()
            else
                printfn "spliting at index %A" splitIndex
                createTree leftSideSplit
                createTree rightSideSplit