I have the following F# function which makes use of a ref variable to seed and keep track of a running total. Something tells me this isn't in the spirit of FP, or even particularly clear on its own. I'd like some direction on the clearest way to express this in F# (possibly FP, but if an imperative approach is clearer I'd be open to that). Note that selectItem implements a random weighted selection algorithm.
/// An item identified by `id` that carries an integer `weight`.
type WeightedItem(id: int, weight: int) =
    member this.id = id
    member this.weight = weight

/// Weighted random selection: draws a threshold in 1..totalWeight and returns
/// the first item at which the running sum of weights reaches that threshold.
/// `List.find` raises if no item reaches it (e.g. for an empty list).
let selectItem (items: WeightedItem list) (rand: System.Random) =
    let totalWeight = items |> List.sumBy (fun item -> item.weight)
    // rand.Next(n) yields a value in 0..n-1, hence the + 1.
    let selection = rand.Next(totalWeight) + 1
    // Running total of the weights visited so far; the predicate below
    // mutates this ref cell as List.find walks the list.
    let runningWeight = ref 0
    let reachesSelection (item: WeightedItem) =
        runningWeight.Value <- runningWeight.Value + item.weight
        runningWeight.Value >= selection
    items |> List.find reachesSelection

let items = [ WeightedItem(1, 100); WeightedItem(2, 50); WeightedItem(3, 25) ]
let selection = selectItem items (System.Random())
Here is a version of the search algorithm using a recursive function. My F# is very rusty and I don't know what to return when we can't find anything:
/// Walks `list` while accumulating a running total (seeded with `total`) and
/// returns the first element that is greater than the total accumulated
/// before it. `item` is passed along unchanged and is not inspected.
/// Returns 0 as a "not found" sentinel when the list is exhausted.
let rec find list item total =
    match list with
    | h :: t ->
        if h > total then
            h
        else
            // Parenthesised on purpose: `find t item total+h` would parse as
            // `(find t item total) + h` and never advance the running total.
            find t item (total + h)
    | [] -> 0 // default that signals "can't find the item"
EDIT
Full code:
/// An item identified by `id` that carries an integer `weight`.
type WeightedItem(id: int, weight: int) =
    member self.id = id
    member self.weight = weight

/// Weighted random selection: draws a threshold in 1..totalWeight and returns
/// the first item at which the running sum of weights reaches that threshold.
/// Raises ArgumentException when the list is exhausted without reaching the
/// threshold (e.g. `items` is empty or the weights sum to zero).
let selectItem (items: WeightedItem list) (rand: System.Random) =
    let totalWeight = List.sumBy (fun (item: WeightedItem) -> item.weight) items
    // rand.Next(n) yields a value in 0..n-1, hence the + 1.
    let selection = rand.Next(totalWeight) + 1
    let rec find runningWeight remaining =
        match remaining with
        | [] ->
            // Explicit case instead of an incomplete pattern match.
            invalidArg "items" "items must contain at least one item with a positive weight"
        | (h: WeightedItem) :: t ->
            let newRunningWeight = runningWeight + h.weight
            if newRunningWeight >= selection then
                h
            else
                find newRunningWeight t
    find 0 items

let items =
    [ new WeightedItem(1, 100)
      new WeightedItem(2, 50)
      new WeightedItem(3, 25) ]

let selection = selectItem items (new System.Random())
Hm, here's one with Seq.scan, but it also feels very ugly...
/// An item identified by `id` that carries an integer `weight`.
type WeightedItem(id: int, weight: int) =
    member this.id = id
    member this.weight = weight

/// Weighted random selection built on Seq.scan. The scan state is
/// (running weight, finished flag, last item visited); once the flag is set
/// the state is passed through unchanged, and the first state with the flag
/// set carries the selected item.
let selectItem (items: WeightedItem list) (rand: System.Random) =
    let totalWeight = items |> List.sumBy (fun item -> item.weight)
    let selection = rand.Next(totalWeight) + 1
    let step (weightSoFar, finished, lastSeen) (item: WeightedItem) =
        if finished then
            (weightSoFar, finished, lastSeen)
        else
            let updated = weightSoFar + item.weight
            (updated, updated >= selection, Some(item))
    items
    |> Seq.scan step (0, false, None)
    |> Seq.find (fun (_, finished, _) -> finished)
    |> (fun (_, _, chosen) -> chosen.Value)

let items =
    [ WeightedItem(1, 100)
      WeightedItem(2, 50)
      WeightedItem(3, 25) ]

let selection = selectItem items (System.Random())
Igor's answer is probably the best one for items stored in a list in terms of efficiency, but since Brian's scan approach is representative of a recurrent sequence manipulation pattern, I suggest a slightly more compact variation :
/// Weighted random selection: pairs every item with the running total of
/// weights up to and including it, then returns the first item whose running
/// total reaches the randomly drawn threshold.
let selectItem (items: WeightedItem list) (rand: System.Random) =
    let totalWeight = items |> List.sumBy (fun item -> item.weight)
    let selection = rand.Next(totalWeight) + 1
    // Seq.scan emits the initial state 0 first; skipping it aligns each
    // running total with the item that produced it.
    let runningWeights =
        items
        |> Seq.scan (fun acc (item: WeightedItem) -> acc + item.weight) 0
        |> Seq.skip 1
    Seq.zip items runningWeights
    |> Seq.find (fun (_, rw) -> rw >= selection)
    |> fst
Use Seq.unfold to build an on-demand sequence that accumulates runningWeight and then search it for the first element that had a sufficiently large runningWeight using Seq.pick:
let gen = function
| _, [] -> None
| runningWeight, item::items ->
let runningWeight = runningWeight + item.weight
Some((if runningWeight >= selection then Some item else None), (runningWeight, items))
Seq.unfold gen (0, xs) |> Seq.pick id
Hm, here's one way to do it with a fold, but it feels inelegant and always traverses the whole list...
/// An item identified by `id` that carries an integer `weight`.
type WeightedItem(id: int, weight: int) =
    member self.id = id
    member self.weight = weight

/// Weighted random selection expressed as a fold. The fold state is
/// (running weight, selected item if any); once an item has been selected the
/// state is passed through unchanged, so the whole list is still traversed.
/// Raises ArgumentException when no item reaches the drawn threshold
/// (e.g. `items` is empty or the weights sum to zero).
let selectItem (items: WeightedItem list) (rand: System.Random) =
    let totalWeight = List.sumBy (fun (item: WeightedItem) -> item.weight) items
    // rand.Next(n) yields a value in 0..n-1, hence the + 1.
    let selection = rand.Next(totalWeight) + 1
    let _, found =
        List.fold
            (fun (runningWeight, found) (item: WeightedItem) ->
                match found with
                | Some _ -> (runningWeight, found)
                | None ->
                    let newRunningWeight = runningWeight + item.weight
                    // Carry the item itself in the state; a bare flag would
                    // leave only the running weight to return.
                    let hit = if newRunningWeight >= selection then Some item else None
                    (newRunningWeight, hit))
            (0, None)
            items
    match found with
    | Some item -> item
    | None -> invalidArg "items" "items must contain at least one item with a positive weight"

let items =
    [ new WeightedItem(1, 100)
      new WeightedItem(2, 50)
      new WeightedItem(3, 25) ]

let selection = selectItem items (new System.Random())
Hm, here's some mutables and a loop; still traverses the whole list though...
/// An item identified by `id` that carries an integer `weight`.
type WeightedItem(id: int, weight: int) =
    member this.id = id
    member this.weight = weight

/// Weighted random selection with a plain loop and two mutable locals.
/// The loop visits every item; once a selection has been made the remaining
/// iterations do nothing.
let selectItem (items: WeightedItem list) (rand: System.Random) =
    let totalWeight = items |> List.sumBy (fun item -> item.weight)
    let selection = rand.Next(totalWeight) + 1
    let mutable runningWeight = 0
    let mutable found: WeightedItem option = None
    for item in items do
        if Option.isNone found then
            runningWeight <- runningWeight + item.weight
            if runningWeight >= selection then
                found <- Some(item)
    found.Value

let items =
    [ WeightedItem(1, 100)
      WeightedItem(2, 50)
      WeightedItem(3, 25) ]

let selection = selectItem items (System.Random())
This is my favorite of the three. I look forward to the day that F# adds break. Of course you can call GetEnumerator and take full control, but that is ugly too.
Related
I have been doing a CodeWars exercise which can also be seen at dev.to.
The essence of it is:
There is a line for the self-checkout machines at the supermarket. Your challenge is to write a function that calculates the total amount of time required for the rest of the customers to check out!
INPUT
customers : an array of positive integers representing the line. Each integer represents a customer, and its value is the amount of time they require to check out.
n : a positive integer, the number of checkout tills.
RULES
There is only one line serving many machines, and
The order of the line never changes, and
The front person in the line (i.e. the first element in the array/list) proceeds to a machine as soon as it becomes free.
OUTPUT
The function should return an integer, the total time required.
The answer I came up with works - but it is highly imperative.
open System.Collections.Generic
open System.Linq
/// Total checkout time for `customerArray` (time needed per customer, in
/// queue order) served by `n` tills. Each till is a dictionary entry
/// "Line1".."LineN" holding the time at which that till becomes free.
let getQueueTime (customerArray: int list) n =
    let tills = Dictionary<string, int>()
    for i in 1 .. n do
        tills.Add(sprintf "Line%d" i, 0)

    // Name of the till with the lowest value; on ties the entry enumerated
    // last wins (because of the <= comparison). -1 marks "nothing seen yet".
    let nextFreeTill () =
        let pick (bestName, bestTime) (entry: KeyValuePair<string, int>) =
            if bestTime = -1 || entry.Value <= bestTime then
                (entry.Key, entry.Value)
            else
                (bestName, bestTime)
        tills |> Seq.fold pick ("", -1) |> fst

    for duration in customerArray do
        let name = nextFreeTill ()
        tills.[name] <- tills.[name] + duration

    // The till that finishes last determines the total time.
    tills.Values.Max()
So my question is ... is this OK F# code or should it be written in a functional way? And if the latter, how? (I started off trying to do it functionally but didn't get anywhere).
is this OK F# code or should it be written in a functional way?
That's a subjective question, so it can't be answered. I'm assuming, however, that since you're doing an exercise, it's in order to learn. Learning functional programming takes years for most people (it did for me), but F# is a great language because it enables you to learn gradually.
You can, however, simplify the algorithm. Think of a till as a number. The number represents the instant it's ready. At the beginning, you initialise them all to 0:
let tills = List.replicate n 0
where n is the number of tills. At the beginning, they're all ready at time 0. If, for example, n is 3, the tills are:
> List.replicate 3 0;;
val it : int list = [0; 0; 0]
Now you consider the next customer in the line. For each customer, you have to pick a till. You pick the one that is available first, i.e. with the lowest number. Then you need to 'update' the list of counters.
In order to do that, you'll need a function to 'update' a list at a particular index, which isn't part of the base library. You can define it yourself, however:
module List =
    /// Returns a copy of `source` in which the element at index `idx` is
    /// replaced by `v`; a list without that index is returned unchanged.
    let set idx v source =
        source |> List.mapi (fun i x -> if i = idx then v else x)
For example, if you want to 'update' the second element to 3, you can do it like this:
> List.replicate 3 0 |> List.set 1 3;;
val it : int list = [0; 3; 0]
Now you can write a function that updates the set of tills given their current state and a customer (represented by a duration, which is also a number).
/// Sends `customer` (a duration) to the till that becomes free first and
/// returns the updated list of till ready-times.
let next tills customer =
    let earliestTime = tills |> List.min
    // First till whose ready-time equals the minimum.
    let idx = tills |> List.findIndex ((=) earliestTime)
    tills |> List.set idx (earliestTime + customer)
First, the next function finds the earliestTime in tills by using List.min. Then it finds the index of that value. Finally, it 'updates' that till by adding its current state to the customer duration.
Imagine that you have two tills and the customers [2;3;10]:
> List.replicate 2 0;;
val it : int list = [0; 0]
> List.replicate 2 0 |> fun tills -> next tills 2;;
val it : int list = [2; 0]
> List.replicate 2 0 |> fun tills -> next tills 2 |> fun tills -> next tills 3;;
val it : int list = [2; 3]
> List.replicate 2 0 |> fun tills -> next tills 2 |> fun tills -> next tills 3
|> fun tills -> next tills 10;;
val it : int list = [12; 3]
You'll notice that you can keep calling the next function for all the customers in the line. That's called a fold. This gives you the final state of the tills. The final step is to return the value of the till with the highest value, because that represents the time it finished. The overall function, then, is:
/// Total time needed for every customer in `line` (durations, in queue
/// order) to check out when `n` tills are available.
let queueTime line n =
    // One step of the fold: the next customer goes to the earliest-free till.
    let next tills customer =
        let earliestTime = tills |> List.min
        let idx = tills |> List.findIndex ((=) earliestTime)
        tills |> List.set idx (earliestTime + customer)
    // All tills start out free at time 0; the largest final ready-time is
    // when the last customer finishes.
    line
    |> List.fold next (List.replicate n 0)
    |> List.max
Here's some examples, taken from the original exercise:
> queueTime [5;3;4] 1;;
val it : int = 12
> queueTime [10;2;3;3] 2;;
val it : int = 10
> queueTime [2;3;10] 2;;
val it : int = 12
This solution is based entirely on immutable data, and all functions are pure, so that's a functional solution.
Here is a version that resembles your version, with all the mutability removed:
/// Total checkout time for `customerArray` served by `n` tills, using an
/// immutable Map from till name ("Line1".."LineN") to the time at which that
/// till becomes free.
let getQueueTime (customerArray: int list) n =
    let emptyTills =
        Map.ofList [ for i in 1 .. n -> (sprintf "Line%d" i, 0) ]

    // One fold step: the entry with the smallest ready-time takes the customer.
    let serve (tills: Map<string, int>) duration =
        let (KeyValue (name, busyUntil)) = tills |> Seq.minBy (fun entry -> entry.Value)
        tills |> Map.add name (busyUntil + duration)

    let finalTills = List.fold serve emptyTills customerArray
    finalTills |> Map.toSeq |> Seq.map snd |> Seq.max
getQueueTime [5;3;4] 1 |> printfn "%i"
Those loops with mutable "outer state" can be swapped for either recursive functions or folds/reduce, here I suspect recursive functions would be nicer.
I've swapped out Dictionary for the immutable Map, but it feels like more trouble than it's worth here.
Update - here is a compromise solution I think reads well:
/// Total checkout time for `customerArray` (time needed per customer, in
/// queue order) served by `n` tills. Each till is a dictionary entry
/// "Line1".."LineN" holding the time at which that till becomes free.
let getQueueTime (customerArray: int list) n =
    // A real Dictionary is required here: the value returned by `dict` is
    // read-only and throws NotSupportedException on assignment.
    let d = System.Collections.Generic.Dictionary<string, int>()
    for i in 1 .. n do
        d.[sprintf "Line%d" i] <- 0

    let getNextAvailableSupermarketLineName (d: System.Collections.Generic.Dictionary<string, int>) =
        let lowestLine = d |> Seq.minBy (fun l -> l.Value)
        lowestLine.Key

    customerArray
    |> List.iter (fun x ->
        let lineName = getNextAvailableSupermarketLineName d
        // Add the customer's own checkout time, not a constant.
        d.[lineName] <- d.[lineName] + x)

    d.Values |> Seq.max
getQueueTime [5;3;4] 1 |> printfn "%i"
I believe there is a more natural functional solution if you approach it freshly, but I wanted to evolve your current solution.
This is less an attempt at answering than an extended comment on Mark Seemann's otherwise excellent answer. If we do not restrict ourselves to standard library functions, the slightly cumbersome determination of the index with List.findIndex can be avoided. Instead, we may devise a function that replaces the first occurrence of a value in a list with a new value.
The implementation of our bespoke List.replace involves recursion, with an accumulator to hold the values before we encounter the first occurrence. When found, the accumulator needs to be reversed and also to have the new value and the tail of the original list appended. Both of these can be done in one operation: List.fold being fed the new value and tail of the original list as initial state while the elements of the accumulator are prepended in the loop, thereby restoring their order.
module List =
    /// Replaces the first occurrence of `oldValue` in `source` with
    /// `newValue`. A list that does not contain `oldValue` comes back with
    /// the same elements in the same order.
    let replace oldValue newValue source =
        // `seen` holds the elements already passed, most recent first.
        let rec loop seen remaining =
            match remaining with
            | [] -> List.rev seen
            | x :: rest when x = oldValue ->
                // Start from newValue :: rest and push the seen elements back
                // on the front one by one, which restores their order.
                List.fold (fun tail y -> y :: tail) (newValue :: rest) seen
            | x :: rest -> loop (x :: seen) rest
        loop [] source
/// Total time needed for all `customers` (durations, in queue order) to
/// check out when `n` tills are available.
let queueTime customers n =
    // The next customer goes to a till holding the minimum ready-time; that
    // value is swapped for the time at which the till is free again.
    let serve tills customer =
        let earliest = List.min tills
        tills |> List.replace earliest (earliest + customer)
    customers
    |> List.fold serve (List.replicate n 0)
    |> List.max
queueTime [5;3;4] 1 // val it : int = 12
queueTime [10;2;3;3] 2 // val it : int = 10
queueTime [2;3;10] 2 // val it : int = 12
how do you convert an obj list to int type. I am trying to add two lists using a map function below but it doesn't work on obj lists.
/// Lazily runs `SELECT * FROM F` against the Access database and yields
/// `f row` for every row of the first result table. Every enumeration of the
/// returned sequence opens its own connection and fills its own DataSet;
/// the disposable objects are released when the enumeration ends.
let query f =
    seq {
        use cmd = new OleDbCommand("SELECT * FROM F")
        // Verbatim string (@"...") so the backslashes in the path are taken
        // literally.
        use conn =
            new OleDbConnection(
                @"Provider=Microsoft.ACE.OLEDB.12.0;
Data Source=D:\Users\df\Documents\Vfolio.accdb;
Persist Security Info=False;"
            )
        conn.Open()
        use DAdapt = new OleDbDataAdapter("SELECT * FROM F", conn)
        use DTab = new DataSet()
        // Fill returns a row count, which is not needed here.
        DAdapt.Fill(DTab) |> ignore
        let rowCol = DTab.Tables.[0].Rows
        let rowCount = rowCol.Count
        for i in 0 .. (rowCount - 1) do
            yield f (rowCol.[i])
    }
let u= query(fun row -> row.[0])
let a= List.ofSeq u
let v=query(fun row -> row.[1])
let b= List.ofSeq v
let c = List.map2 (fun x y-> x + y) a b
error msg: The type 'obj' does not support the operator '+'
Because row.[i] returns type obj, your u and v become seq<obj>, and thus your a and b become type List<obj>, and therefore x and y are inferred to have type obj, and of course, you can't add two objs, which is exactly what the compiler tells you.
If you are sure that row.[0] and row.[1] are numbers of some kind, you should apply the appropriate cast, for example:
let u= query(fun row -> row.[0] :?> int)
let a= List.ofSeq u
let v=query(fun row -> row.[1] :?> int)
let b= List.ofSeq v
let c = List.map2 (fun x y-> x + y) a b
You can apply this cast in other places, too, depending on your taste and requirements, for example:
let c = List.map2 (fun x y-> (x :?> int) + (y :?> int)) a b
Or:
let a= u |> Seq.cast<int> |> List.ofSeq
let b= v |> Seq.cast<int> |> List.ofSeq
But I like the first example best, because it applies the cast at the earliest known point and results in the least amount of extra code.
But beware: if row.[0] turns out to be not an int at runtime, you will get an InvalidCastException.
P.S. In your List.map2 call, you could specify (+) directly instead of wrapping it in an extra lambda:
List.map2 (+) a b
P.P.S Also, it seems that your List.ofSeq calls are wasteful, for Seq also has a map2:
let u = query(fun row -> row.[0] :?> int)
let v = query(fun row -> row.[1] :?> int)
let c = Seq.map2 (+) u v |> List.ofSeq
P.P.P.S Also, have you noticed that each of the two calls to query generates its own DB connection, command, adapter, and dataset? Did you intend this or did you mean to only have one connection and then fetch different columns from the result? If so, you should only call query once:
let c = query( fun row -> (row.[0] :?> int) + (row.[1] :?> int) ) |> List.ofSeq
I am seeking help, mainly because I am very new to the F# environment. I need to use an F# stream to generate an infinite stream of Armstrong numbers. Can anyone help with this one? I have done some mumbo jumbo but I have no clue where I'm going.
/// A lazy stream: a head value plus a thunk that produces the rest.
type 'a stream = Cons of 'a * (unit -> 'a stream)

/// Returns the first `n` elements of the stream as a list. The tail thunk is
/// only forced when another element is actually needed, so asking for `n`
/// elements never triggers the search for element n + 1.
let rec take n (Cons(x, xsf)) =
    if n <= 0 then []
    elif n = 1 then [ x ]
    else x :: take (n - 1) (xsf ())

// to test if two integers are equal
let test x y =
    match (x, y) with
    | (x, y) when x < y -> false
    | (x, y) when x > y -> false
    | _ -> true

// to check for armstrong number: true when the sum of the cubes of the
// decimal digits of n equals n
let check n =
    let mutable m = n
    let mutable s = 0
    while m <> 0 do
        let r = m % 10
        s <- s + r * r * r
        m <- m / 10
    test n s

/// Stream of the numbers >= n that satisfy `check`, in increasing order.
/// Candidates that fail the check are skipped instead of being emitted.
let rec armstrong n =
    if check n then
        Cons(n, fun () -> armstrong (n + 1))
    else
        armstrong (n + 1)

let pos = armstrong 0
// NOTE: with the cube-based `check` only 0, 1, 153, 370, 371 and 407 qualify
// among the non-negative ints, so asking for more than six elements keeps
// searching for a very long time.
take 5 pos
To be honest your code seems a bit like a mess.
The most basic version I could think of is this:
/// True when the digits a, b, c (hundreds, tens, units) cubed and summed give
/// back the number they spell.
let isArmstrong (a, b, c) =
    let cube d = d * d * d
    cube a + cube b + cube c = 100 * a + 10 * b + c

/// All numbers 0..999, in increasing order, whose digit triple satisfies
/// `isArmstrong`.
let armstrongs =
    seq {
        for a in 0 .. 9 do
            for b in 0 .. 9 do
                for c in 0 .. 9 do
                    if isArmstrong (a, b, c) then
                        yield 100 * a + 10 * b + c
    }
Of course, this assumes that an Armstrong number is a 3-digit number where the sum of the cubes of the digits is the number itself.
this will yield you:
> Seq.toList armstrongs;;
val it : int list = [0; 1; 153; 370; 371; 407]
but it should be easy to add a wider range or remove the one-digit numbers (think about it).
general case
the problem seems so interesting that I choose to implement the general case (see here) too:
/// Infinite sequence of (digit count, digit list) pairs: first the empty
/// digit list tagged 0, then every 1-digit list, every 2-digit list, and so
/// on. Leading zeros are included.
let numbers =
    let rec create n =
        if n = 0 then
            [ (0, []) ]
        else
            // Build the shorter digit lists once per level rather than once
            // per leading digit.
            let shorter = create (n - 1)
            [ for x in 0 .. 9 do
                  for (_, xs) in shorter do
                      yield (n, x :: xs) ]
    Seq.initInfinite create |> Seq.concat
let toNumber (ds : int list) =
ds |> List.fold (fun s d -> s*10I + bigint d) 0I
let armstrong (m : int, ds : int list) =
ds |> List.map (fun d -> bigint d ** m) |> List.sum
let leadingZero =
function
| 0::_ -> true
| _ -> false
let isArmstrong (m : int, ds : int list) =
if leadingZero ds then false else
let left = armstrong (m, ds)
let right = toNumber ds
left = right
let armstrongs =
numbers
|> Seq.filter isArmstrong
|> Seq.map (snd >> toNumber)
However, the numbers get sparse really quickly, and enumerating them this way will soon run you out of memory. The first 20 are:
> Seq.take 20 armstrongs |> Seq.map string |> Seq.toList;;
val it : string list =
["0"; "1"; "2"; "3"; "4"; "5"; "6"; "7"; "8"; "9"; "153"; "370"; "371";
"407"; "1634"; "8208"; "9474"; "54748"; "92727"; "93084"]
remark/disclaimer
this is the most basic version - you can get big speed/performance if you just enumerate all numbers and use basic math to get and exponentiate the digits ;) ... sure you can figure it out
I am using Array.Parallel.map on a function but find that it is not executing at anywhere near full processor capacity. I am assuming this is because the function creates a lot of objects when running List.map and List.map2. Would this be causing a synchronization issue and is there a more appropriate way of doing this? At the moment the only way I can think of getting around this is by running each process as a separate executable using something like xargs under Linux.
I put together the script below to demonstrate the problem. It is a very basic data categorizer which relies on a field having a certain value as a rule to determine if this will predict a category:
open System
type CategoryAssessment =
{ fieldIndex: int
value: int
ruleAssessments: list<int> }
/// Builds the starting assessment: one CategoryAssessment per
/// (field index, category value) pair, each with a zero counter per rule.
let InitAssessment categorizeFields rules =
    let ruleAssessments = List.replicate (List.length rules) 0
    categorizeFields
    |> List.map (fun (fieldIndex, categoryValue) ->
        { CategoryAssessment.fieldIndex = fieldIndex
          value = categoryValue
          ruleAssessments = ruleAssessments })
let AssessCategory ruleMatches (row : int[]) categoryAssessment =
let fieldIndex = categoryAssessment.fieldIndex
let categoryValue = categoryAssessment.value
let categoryMatch = categoryValue = row.[fieldIndex]
let newRuleAssessments =
List.map2 (fun ruleAssessment ruleMatch ->
if ruleMatch = categoryMatch then
ruleAssessment + 1
else
ruleAssessment)
categoryAssessment.ruleAssessments
ruleMatches
{ categoryAssessment with ruleAssessments = newRuleAssessments }
let MatchRule (row : int[]) rule =
let fieldIndex, eqVal = rule
row.[fieldIndex] = eqVal
let Assess categorizeFields rules input =
printfn "START - Assess"
let d =
Array.fold (fun categoryAssessment row ->
let ruleMatches = List.map (MatchRule row) rules
List.map (AssessCategory ruleMatches row) categoryAssessment)
(InitAssessment categorizeFields rules)
input
printfn "END - Assess"
d
/// Combines the per-split assessments into one by adding up the rule
/// counters of corresponding categories. fieldIndex and value are taken from
/// the first assessment. Requires at least one assessment.
let JoinAssessments assessments =
    // Element-wise sum of the rule counters of two matching categories.
    let mergeCategory accCategory category =
        { accCategory with
            ruleAssessments = List.map2 (+) accCategory.ruleAssessments category.ruleAssessments }
    let first = Array.get assessments 0
    assessments
    |> Array.skip 1
    |> Array.fold (List.map2 mergeCategory) first
let numRecords = 10000
let numFields = 20
let numSplits = 10
let numRules = 10000
let inputs = Array.create numSplits
[| for i in 1 .. (numRecords / numSplits) ->
[| for j in 1 .. numFields ->
(i % 10) + j |] |]
let categorizeFields = [ (1, 6); (2, 3); (2, 4); (3, 2) ]
let rules = [ for i in 1 .. numRules -> (i % numFields, i) ]
let assessments =
Array.Parallel.map (Assess categorizeFields rules) inputs
|> JoinAssessments
printfn "Assessments: %A" assessments
0
After a fair bit of investigation, the ultimate answer to my question seems to be to find a way of not creating lots of objects. The easiest change to do this is moving to using arrays instead of lists. I have written up my findings more fully in an article: Beware of Immutable Lists for F# Parallel Processing.
The above program when altered as follows, runs better between threads and runs much quicker even on a single thread. Further improvements can be made by making the ruleAssessments field mutable as demonstrated in the referenced article.
open System
type CategoryAssessment =
{ fieldIndex: int
value: int
ruleAssessments: int[] }
let InitAssessment categorizeFields rules =
let ruleAssessments = Array.create (Array.length rules) 0
Array.map (fun categorizeField ->
let fieldIndex, categoryValue = categorizeField
{ CategoryAssessment.fieldIndex = fieldIndex;
value = categoryValue;
ruleAssessments = ruleAssessments })
categorizeFields
let AssessCategory ruleMatches (row : int[]) categoryAssessment =
let fieldIndex = categoryAssessment.fieldIndex
let categoryValue = categoryAssessment.value
let categoryMatch = categoryValue = row.[fieldIndex]
let newRuleAssessments =
Array.map2 (fun ruleAssessment ruleMatch ->
if ruleMatch = categoryMatch then
ruleAssessment + 1
else
ruleAssessment)
categoryAssessment.ruleAssessments
ruleMatches
{ categoryAssessment with ruleAssessments = newRuleAssessments }
let MatchRule (row : int[]) rule =
let fieldIndex, eqVal = rule
row.[fieldIndex] = eqVal
let Assess categorizeFields rules input =
printfn "START - Assess"
let d =
Array.fold (fun categoryAssessment row ->
let ruleMatches = Array.map (MatchRule row) rules
Array.map (AssessCategory ruleMatches row) categoryAssessment)
(InitAssessment categorizeFields rules)
input
printfn "END - Assess"
d
let JoinAssessments assessments =
let numAssessments = Array.length assessments
Array.fold (fun accAssessment assessment ->
Array.map2 (fun accCategory category ->
let newRuleAssessments =
Array.map2 (+)
accCategory.ruleAssessments
category.ruleAssessments
{ accCategory with
ruleAssessments = newRuleAssessments })
accAssessment
assessment)
assessments.[0]
assessments.[1..(numAssessments-1)]
let numRecords = 10000
let numFields = 20
let numSplits = 10
let numRules = 10000
let inputs = Array.create numSplits
[| for i in 1 .. (numRecords / numSplits) ->
[| for j in 1 .. numFields ->
(i % 10) + j |] |]
let categorizeFields = [| (1, 6); (2, 3); (2, 4); (3, 2) |]
let rules = [| for i in 1 .. numRules -> (i % numFields, i) |]
let assessments =
Array.Parallel.map (Assess categorizeFields rules) inputs
|> JoinAssessments
printfn "Assessments: %A" assessments
0
This is a version of your program that doesn't require mutability and uses nearly all of the 4 cpus on my iMac.
To pull it off, it's driven by assessing each rule in parallel, not by processing records. That also required the input array to be transposed making it be fields by records.
open System
type CategoryAssessment =
{ fieldIndex: int
value: int
ruleAssessments: list<int> }
let MatchRule rVal fVal =
rVal = fVal
/// Counts the records for which "field rIndex equals rVal" agrees with the
/// corresponding entry of `cMatches`. `inputs` is indexed field-first, so
/// inputs.[rIndex] holds that field's value for every record.
let AssessRule cMatches (inputs: int[][]) (rIndex, rVal) =
    // The START/END trace output stays disabled: per the original note it
    // used more cpu than the code itself.
    Array.fold2
        (fun agreeing cVal fVal -> if (rVal = fVal) = cVal then agreeing + 1 else agreeing)
        0
        cMatches
        inputs.[rIndex]
let Assess categorizeFields rules (inputs:int[][]) =
categorizeFields |> List.map (fun (catIndex, catValue) ->
let catMatches = inputs.[catIndex] |> Array.map( fun v -> v = catValue )
let assessments = rules |> Array.Parallel.map
(AssessRule catMatches inputs)
|> Array.toList
{ CategoryAssessment.fieldIndex = catIndex;
value = catValue;
ruleAssessments = assessments }
)
let numRecords = 10000
let numFields = 20
let numRules = 10000
let inputs = [| for j in 1 .. numFields ->
[| for i in 1 .. numRecords -> (i % 10) + j |] |]
let categorizeFields = [ (1, 6); (2, 3); (2, 4); (3, 2) ]
let rules = [| for i in 1 .. numRules -> (i % numFields, i) |]
let assessments = Assess categorizeFields rules inputs
printfn "Assessments: %A" assessments
Assessing by rule allowed the summing of a single integer across all records for a given rule, avoiding mutable state and extra memory allocations.
I used a lot of array iteration to get the speed up but didn't remove all the lists.
I fear I changed the functionality while refactoring or made assumptions that can't be applied to your actual problem, however I do hope it's a useful example.
First, in order to provide full disclosure, I want to point out that this is related to homework in a Machine Learning class. This question is not the homework question and instead is something I need to figure out in order to complete the bigger problem of creating an ID3 Decision Tree Algorithm.
I need to generate tree similar to the following when given a truth table
let learnedTree = Node(0,"A0", Node(2,"A2", Leaf(0), Leaf(1)), Node(1,"A1", Node(2,"A2", Leaf(0), Leaf(1)), Leaf(0)))
learnedTree is of type BinaryTree which I've defined as follows:
type BinaryTree =
| Leaf of int
| Node of int * string * BinaryTree * BinaryTree
ID3 algorithms take into account various equations to determine where to split the tree, and I've got all that figured out, I'm just having trouble creating the learned tree from my truth table. For example if I have the following table
A1 | A2 | A3 | Class
1 0 0 1
0 1 0 1
0 0 0 0
1 0 1 0
0 0 0 0
1 1 0 1
0 1 1 0
And I decide to split on attribute A1 I would end up with the following:
(A1 = 1) A1 (A1 = 0)
A2 | A3 | Class A2 | A3 | Class
0 0 1 1 0 1
0 1 0 0 0 0
1 0 1 0 0 0
0 1 1
Then I would split the left side and split the right side, and continue the recursive pattern until the leaf nodes are pure and I end up with a tree similar to the following based on the splitting.
let learnedTree = Node(0,"A0", Node(2,"A2", Leaf(0), Leaf(1)), Node(1,"A1", Node(2,"A2", Leaf(0), Leaf(1)), Leaf(0)))
Here is what I've kind of "hacked" together thus far, but I think I might be way off:
let rec createTree (listToSplit : list<list<float>>) index =
let leftSideSplit =
listToSplit |> List.choose (fun x -> if x.Item(index) = 1. then Some(x) else None)
let rightSideSplit =
listToSplit |> List.choose (fun x -> if x.Item(index) = 0. then Some(x) else None)
if leftSideSplit.Length > 0 then
let pureCheck = isListPure leftSideSplit
if pureCheck = 0 then
printfn "%s" "Pure left node class 0"
createTree leftSideSplit (index + 1)
else if pureCheck = 1 then
printfn "%s" "Pure left node class 1"
createTree leftSideSplit (index + 1)
else
printfn "%s - %A" "Recursing Left" leftSideSplit
createTree leftSideSplit (index + 1)
else printfn "%s" "Pure left node class 0"
Should I be using pattern matching instead? Any tips/ideas/help? Thanks a bunch!
Edit: I've since posted an implementation of ID3 on my blog at:
http://blogs.msdn.com/chrsmith
Hey Jim, I've been wanting to write a blog post implementing ID3 in F# for a while - thanks for giving me an excuse. While this code doesn't implement the algorithm fully (or correctly), it should be sufficient for getting you started.
In general you have the right approach - representing each branch as a discriminated union case is good. And like Brian said, List.partition is definitely a handy function. The trick to making this work correctly is all in determining the optimal attribute/value pair to split on - and to do that you'll need to calculate information gain via entropy, etc.
type Attribute = string
type Value = string
type Record =
{
Weather : string
Temperature : string
PlayTennis : bool
}
override this.ToString() =
sprintf
"{Weather = %s, Temp = %s, PlayTennis = %b}"
this.Weather
this.Temperature
this.PlayTennis
type Decision = Attribute * Value
type DecisionTreeNode =
| Branch of Decision * DecisionTreeNode * DecisionTreeNode
| Leaf of Record list
// ------------------------------------
// Chooses a split for a record list and returns it together with the
// records on the left (predicate true) and right (predicate false) side.
// (This is the place to plug in an entropy function that maximizes
// information gain; the placeholder below does not compute any.)
// Record list -> Decision * Record list * Record list
let bestSplit data =
    // Placeholder strategy: split on weather, unless every record has the
    // same weather, in which case split on temperature.
    let uniqueWeathers = data |> List.map (fun item -> item.Weather) |> Set.ofList
    let uniqueTemperatures = data |> List.map (fun item -> item.Temperature) |> Set.ofList
    if uniqueWeathers.Count = 1 then
        let minTemperature = uniqueTemperatures.MinimumElement
        let left, right =
            data |> List.partition (fun item -> item.Temperature = minTemperature)
        (("Temperature", minTemperature), left, right)
    else
        let minWeather = uniqueWeathers.MinimumElement
        let left, right =
            data |> List.partition (fun item -> item.Weather = minWeather)
        (("Weather", minWeather), left, right)
/// Builds the decision tree for `data`. Lists with fewer than four records
/// become a Leaf; larger lists are split with `bestSplit` and both sides are
/// processed recursively. A split that leaves one side empty cannot make
/// progress, so the records are kept together in a Leaf instead.
let rec determineBranch data =
    if List.length data < 4 then
        Leaf(data)
    else
        // Use the entropy function to break the dataset on
        // the category / value that best splits the data
        let bestDecision, leftBranch, rightBranch = bestSplit data
        if List.isEmpty leftBranch || List.isEmpty rightBranch then
            // The other side is the whole input; recursing on it would
            // never terminate.
            Leaf(data)
        else
            Branch(
                bestDecision,
                determineBranch leftBranch,
                determineBranch rightBranch)
// ------------------------------------
let rec printID3Result indent branch =
let padding = new System.String(' ', indent)
match branch with
| Leaf(data) ->
data |> List.iter (fun item -> printfn "%s%s" padding <| item.ToString())
| Branch(decision, lhs, rhs) ->
printfn "%sBranch predicate [%A]" padding decision
printfn "%sWhere predicate is true:" padding
printID3Result (indent + 4) lhs
printfn "%sWhere predicate is false:" padding
printID3Result (indent + 4) rhs
// ------------------------------------
let dataset =
[
{ Weather = "windy"; Temperature = "hot"; PlayTennis = false }
{ Weather = "windy"; Temperature = "cool"; PlayTennis = false }
{ Weather = "nice"; Temperature = "cool"; PlayTennis = true }
{ Weather = "nice"; Temperature = "cold"; PlayTennis = true }
{ Weather = "humid"; Temperature = "hot"; PlayTennis = false }
]
printfn "Given input list:"
dataset |> List.iter (printfn "%A")
printfn "ID3 split resulted in:"
let id3Result = determineBranch dataset
printID3Result 0 id3Result
You can use List.partition instead of your two List.choose calls.
http://research.microsoft.com/en-us/um/cambridge/projects/fsharp/manual/FSharp.Core/Microsoft.FSharp.Collections.List.html
(or now http://msdn.microsoft.com/en-us/library/ee353738(VS.100).aspx )
It isn't clear to me that pattern matching will buy you much here; the input type (list of lists) and processing (partitioning and 'pureness' check) doesn't really lend itself to that.
And of course when you finally get the 'end' (a pure list) you need to create a tree, and then presumably this function will create a Leaf when the input only has one 'side' and it's 'pure', but create a Node out of the left-side and right-side results for every other input. Maybe. I didn't quite grok the algorithm completely.
Hopefully that will help steer you a little bit. May be useful to draw up a few smaller sample inputs and outputs to help work out the various cases of the function body.
Thanks Brian & Chris! I was actually able to figure this out and I ended up with the following. This calculates the information gain for determining the best place to split. I'm sure there are probably better ways for me to arrive at this solution especially around the chosen data structures, but this is a start. I plan to refine things later.
#light
open System
let trainList =
[
[1.;0.;0.;1.;];
[0.;1.;0.;1.;];
[0.;0.;0.;0.;];
[1.;0.;1.;0.;];
[0.;0.;0.;0.;];
[1.;1.;0.;1.;];
[0.;1.;1.;0.;];
[1.;0.;0.;1.;];
[0.;0.;0.;0.;];
[1.;0.;0.;1.;];
]
type BinaryTree =
| Leaf of int
| Node of int * string * BinaryTree * BinaryTree
/// Entropy, in bits, of the distribution given by the counts in `nums`:
/// the sum over x of -(x/total) * log2(x/total), where a count of zero
/// contributes zero.
let entropyList nums =
    let total = Seq.sum nums
    nums
    |> Seq.sumBy (fun x ->
        if x = 0.00 then
            x
        else
            let p = x / total
            -(p * Math.Log(p, 2.)))
let entropyBinaryList (dataListOfLists:list<list<float>>) =
let classList =
dataListOfLists
|> List.map (fun x -> x.Item(x.Length - 1))
let ListOfNo =
classList
|> List.choose (fun x -> if x = 0. then Some(x) else None)
let ListOfYes =
classList
|> List.choose (fun x -> if x = 1. then Some(x) else None)
let numberOfYes : float = float ListOfYes.Length
let numberOfNo : float = float ListOfNo.Length
let ListOfNumYesAndSumNo = [numberOfYes; numberOfNo]
entropyList ListOfNumYesAndSumNo
let conditionalEntropy (dataListOfLists:list<list<float>>) attributeNumber =
let NoAttributeList =
dataListOfLists
|> List.choose (fun x -> if x.Item(attributeNumber) = 0. then Some(x) else None)
let YesAttributeList =
dataListOfLists
|> List.choose (fun x -> if x.Item(attributeNumber) = 1. then Some(x) else None)
let numberOfYes : float = float YesAttributeList.Length
let numberOfNo : float = float NoAttributeList.Length
let noConditionalEntropy = (entropyBinaryList NoAttributeList) * (numberOfNo/(numberOfNo + numberOfYes))
let yesConditionalEntropy = (entropyBinaryList YesAttributeList) * (numberOfYes/(numberOfNo + numberOfYes))
[noConditionalEntropy; yesConditionalEntropy]
/// Index of the attribute with the highest information gain, i.e. the
/// entropy of the whole list minus the summed conditional entropy for that
/// attribute. The last column of each row is the class, so it is not a
/// candidate. When several attributes tie, the lowest index is returned.
let findBestSplitIndex (listOfInstances: list<list<float>>) =
    // Identical for every attribute, so computed once up front.
    let baseEntropy = entropyBinaryList listOfInstances
    let attributeCount = listOfInstances.Item(0).Length - 1
    [ 0 .. attributeCount - 1 ]
    |> List.maxBy (fun attribute ->
        baseEntropy - List.sum (conditionalEntropy listOfInstances attribute))
/// Classifies the rows by their last column (the class):
/// 1 when every row has class 1. (this includes the empty list),
/// 0 when no row has class 1., and -1 when the classes are mixed.
let isListPure (listToCheck: list<list<float>>) =
    let positives =
        listToCheck
        |> List.filter (fun row -> List.last row = 1.)
        |> List.length
    match positives with
    | n when n = List.length listToCheck -> 1
    | 0 -> 0
    | _ -> -1
/// Recursively splits `listToSplit` on the attribute chosen by
/// `findBestSplitIndex` and prints what it finds along the way (pure leaves,
/// split indices, unsplittable lists). It only traces the decisions; no tree
/// value is built.
let rec createTree (listToSplit: list<list<float>>) =
    let pureCheck = isListPure listToSplit
    if pureCheck = 0 then
        printfn "%s" "Pure - Leaf(0)"
    else if pureCheck = 1 then
        printfn "%s" "Pure - Leaf(1)"
    else
        printfn "%A - is not pure" listToSplit
        if listToSplit.Length > 1 then
            // Choose the best place to split the list
            let splitIndex = findBestSplitIndex (listToSplit)
            printfn "spliting at index %A" splitIndex
            let leftSideSplit =
                listToSplit |> List.filter (fun x -> x.Item(splitIndex) = 1.)
            let rightSideSplit =
                listToSplit |> List.filter (fun x -> x.Item(splitIndex) = 0.)
            let total = listToSplit.Length
            if leftSideSplit.Length = total || rightSideSplit.Length = total then
                // The split put every row on one side (e.g. rows that agree on
                // the attribute but disagree on the class); recursing on the
                // same list again would never terminate.
                printfn "%s" "Not Pure, but can't split choose based on heuristics - Leaf(0 or 1)"
            else
                createTree leftSideSplit
                createTree rightSideSplit
        else
            printfn "%s" "Not Pure, but can't split choose based on heuristics - Leaf(0 or 1)"