Related
I am using a .fasta file in F#. When I read it from disk, it is a sequence of strings. Each observation is usually 4-5 strings in length: 1st string is the title, then 2-4 strings of amino acids, and then 1 string of space. For example:
let filePath = #"/Users/XXX/sample_database.fasta"
let fileContents = File.ReadLines(filePath)
fileContents |> Seq.iter(fun x -> printfn "%s" x)
yields:
I am looking for a way to split each observation into its own collection using the OOB high order functions in F#. I do not want to use any mutable variables or for..each syntax. I thought Seq.chunkBySize would work -> but the size varies. Is there a Seq.chunkByCharacter?
Mutable variables are totally fine for this, provided their mutability doesn't leak into a wider context. Why exactly do you not want to use them?
But if you really want to go hardcore "functional", then the usual functional way of doing something like that is via fold.
Your folding state would be a pair of "blocks accumulated so far" and "current block".
At each step, if you get a non-empty string, you attach it to the "current block".
And if you get an empty string, that means the current block is over, so you attach the current block to the list of "blocks so far" and make the current block empty.
This way, at the end of folding you'll end up with a pair of "all blocks accumulated except the last one" and "last block", which you can glue together.
Plus, an optimization detail: since I'm going to do a lot of "attach a thing to a list", I'd like to use a linked list for that, because it has constant-time attaching. But then the problem is that it's only constant time for prepending, not appending, which means I'll end up with all the lists reversed. But no matter: I'll just reverse them again at the very end. List reversal is a linear operation, which means my whole thing would still be linear.
let splitEm lines =
let step (blocks, currentBlock) s =
match s with
| "" -> (List.rev currentBlock :: blocks), []
| _ -> blocks, s :: currentBlock
let (blocks, lastBlock) = Array.fold step ([], []) lines
List.rev (lastBlock :: blocks)
Usage:
> splitEm [| "foo"; "bar"; "baz"; ""; "1"; "2"; ""; "4"; "5"; "6"; "7"; ""; "8" |]
[["foo"; "bar"; "baz"]; ["1"; "2"]; ["4"; "5"; "6"; "7"]; ["8"]]
Note 1: You may have to address some edge cases depending on your data and what you want the behavior to be. For example, if there is an empty line at the very end, you'll end up with an empty block at the end.
Note 2: You may notice that this is very similar to imperative algorithm with mutating variables: I'm even talking about things like "attach to list of blocks" and "make current block empty". This is not a coincidence. In this purely functional version the "mutating" is accomplished by calling the same function again with different parameters, while in an equivalent imperative version you would just have those parameters turned into mutable memory cells. Same thing, different view. In general, any imperative iteration can be turned into a fold this way.
For comparison, here's a mechanical translation of the above to imperative mutation-based style:
let splitEm lines =
let mutable blocks = []
let mutable currentBlock = []
for s in lines do
match s with
| "" -> blocks <- List.rev currentBlock :: blocks; currentBlock <- []
| _ -> currentBlock <- s :: currentBlock
List.rev (currentBlock :: blocks)
To illustrate Fyodor's point about contained mutability, here's an example that is mutable as can be while still somewhat reasonable. The outer functional layer is a sequence expression, a common pattern demonstrated by Seq.scan in the F# source.
let chooseFoldSplit
folding (state : 'State)
(source : seq<'T>) : seq<'U[]> = seq {
let sref, zs = ref state, ResizeArray()
use ie = source.GetEnumerator()
while ie.MoveNext() do
let newState, uopt = folding !sref ie.Current
if newState <> !sref then
yield zs.ToArray()
zs.Clear()
sref := newState
match uopt with
| None -> ()
| Some u -> zs.Add u
if zs.Count > 0 then
yield zs.ToArray() }
// val chooseFoldSplit :
// folding:('State -> 'T -> 'State * 'U option) ->
// state:'State -> source:seq<'T> -> seq<'U []> when 'State : equality
There is mutability of a ref cell (equivalent to a mutable variable) and there is a mutable data structure; an alias for System.Collection.Generic.List<'T>, which allows appending at O(1) cost.
The folding function's signature 'State -> 'T -> 'State * 'U option is reminiscent of the folder of fold, except that it causes the result sequence to be split when its state changes. And it also spawns an option that denotes the next member for the current group (or not).
It would work fine without the conversion to a persistent array, as long as you iterate the resulting sequence lazily and only exactly once. Therefore we need to isolate the contents of the ResizeArrayfrom the outside world.
The simplest folding for your use case is negation of a boolean, but you could leverage it for more complex tasks like numbering your records:
[| "foo"; "1"; "2"; ""; "bar"; "4"; "5"; "6"; "7"; ""; "baz"; "8"; "" |]
|> chooseFoldSplit (fun b t ->
if t = "" then not b, None else b, Some t ) false
|> Seq.map (fun a ->
if a.Length > 1 then
{ Description = a.[0]; Sequence = String.concat "" a.[1..] }
else failwith "Format error" )
// val it : seq<FastaEntry> =
// seq [{Description = "foo";
// Sequence = "12";}; {Description = "bar";
// Sequence = "4567";}; {Description = "baz";
// Sequence = "8";}]
I went with recursion:
type FastaEntry = {Description:String; Sequence:String}
let generateFastaEntry (chunk:String seq) =
match chunk |> Seq.length with
| 0 -> None
| _ ->
let description = chunk |> Seq.head
let sequence = chunk |> Seq.tail |> Seq.reduce (fun acc x -> acc + x)
Some {Description=description; Sequence=sequence}
let rec chunk acc contents =
let index = contents |> Seq.tryFindIndex(fun x -> String.IsNullOrEmpty(x))
match index with
| None ->
let fastaEntry = generateFastaEntry contents
match fastaEntry with
| Some x -> Seq.append acc [x]
| None -> acc
| Some x ->
let currentChunk = contents |> Seq.take x
let fastaEntry = generateFastaEntry currentChunk
match fastaEntry with
| None -> acc
| Some y ->
let updatedAcc =
match Seq.isEmpty acc with
| true -> seq {y}
| false -> Seq.append acc (seq {y})
let remaining = contents |> Seq.skip (x+1)
chunk updatedAcc remaining
You also can use Regular Expression for these kind of stuff. Here is a solution that uses a regular expression to extract a whole Fasta Block at once.
type FastaEntry = {
Description: string
Sequence: string
}
let fastaRegexStr =
#"
^> # Line Starting with >
(.*) # Capture into $1
\r?\n # End-of-Line
( # Capturing in $2
(?:
^ # A Line ...
[A-Z]+ # .. containing A-Z
\*? \r?\n # Optional(*) followed by End-of-Line
)+ # ^ Multiple of those lines
)
(?:
(?: ^ [ \t\v\f]* \r?\n ) # Match an empty (whitespace) line ..
| # or
\z # End-of-String
)
"
(* Regex for matching one Fasta Block *)
let fasta = Regex(fastaRegexStr, RegexOptions.IgnorePatternWhitespace ||| RegexOptions.Multiline)
(* Whole file as a string *)
let content = System.IO.File.ReadAllText "fasta.fasta"
let entries = [
for m in fasta.Matches(content) do
let desc = m.Groups.[1].Value
(* Remove *, \r and \n from string *)
let sequ = Regex.Replace(m.Groups.[2].Value, #"\*|\r|\n", "")
{Description=desc; Sequence=sequ}
]
-module(tut).
-export([main/0]).
main() ->
folders("C:/Users/David/test/").
folders(PATH) ->
{_,DD} = file:list_dir(PATH),
A = [{H,filelib:is_dir(PATH ++ H)}|| H <-DD],
% R is a list of all folders inside PATH
R = [PATH++X|| {X,Y} <- A, Y =:= true],
io:fwrite("~p~n", [R]),
case R of
[] -> ok;
% How call again folders function with the first element of the list?
% And save the result in some kind of structure
end.
Sorry for the beginner question, but I'm still new to Erlang. I would like to know how I can call the function again until saves the results in a kind of list, tuple or structure...
Like:
[
{"C:/Users/David/test/log",
{"C:/Users/David/test/log/a", "C:/Users/David/test/log/b"}},
{"C:/Users/David/test/logb",
{"C:/Users/David/test/logb/1", "C:/Users/David/test/logb/2","C:/Users/David/test/logb/3"}},
]
Few things:
These 2 calls can be simplified.
A = [{H,filelib:is_dir(PATH ++ H)}|| H <-DD],
R = [PATH++X|| {X,Y} <- A, Y =:= true],
into
A = [H || H <- DD, filelib:is_dir(PATH ++ H) =:= true],
In terms of representation, sub-folders should be in list format, not tuple. It will be difficult to work with if they were tuples.
Sample structure: {Folder, [Subfolder1, Subfolder2, ...]}, where SubfolderX will have the same definition and structure, recursively.
Folders are like tree, so need to have recursive call here. Hope you are already familiar with the concept. Below is one way to do it using list comprehension - there are other ways anyway, e.g. by using lists:foldl function.
folders(PATH) ->
{_, DD} = file:list_dir(PATH),
A = [H || H <- DD, filelib:is_dir(PATH ++ "/" ++ H) =:= true],
%%io:format("Path: ~p, A: ~p~n", [Path, A]),
case A of
[] -> %%Base case, i.e. folder has no sub-folders -> stop here
{PATH, []};
_ -> %%Recursive case, i.e. folder has sub-folders -> call #folders
{PATH, [folders(PATH ++ "/" ++ H2) || H2 <- A]}
end.
For consistency reason, you need to call the main function without a forward slash at the end, as this will be added in the function itself.
Folders = folders("C:/Users/David/test"). %% <- without forward slash
A helper function pretty_print below can be used to visualize the output on the Erlang shell
Full code:
-export([folders/1]).
-export([main/0]).
main() ->
Folders = folders("C:/Users/David/test"),
pretty_print(Folders, 0),
ok.
folders(PATH) ->
{_, DD} = file:list_dir(PATH),
A = [H || H <- DD, filelib:is_dir(PATH ++ "/" ++ H) =:= true], %%please note the "/" is added here
%%io:format("Path: ~p, A: ~p~n", [Path, A]),
case A of
[] -> %%Base case, i.e. folder has no sub-folders -> stop here
{PATH, []};
_ -> %%Recursive case, i.e. folder has sub-folders -> call #folders
{PATH, [folders(PATH ++ "/" ++ H2) || H2 <- A]}
end.
pretty_print(Folders, Depth) ->
{CurrrentFolder, ListSubfolders} = Folders,
SignTemp = lists:duplicate(Depth, "-"),
case Depth of
0 -> Sign = SignTemp;
_ -> Sign = "|" ++ SignTemp
end,
io:format("~s~s~n", [Sign, CurrrentFolder]),
[pretty_print(Subfolder, Depth+1) || Subfolder <- ListSubfolders].
I'm pretty new to F#, and I'm trying to use recursion to solve a problem.
The function receives a string, and returns a bool. The string gets parsed, and evaluated. This is bool logic, so
(T|F) returns true
(T&(T&T)) returns true
((T|T)&(T&F)) returns false
(F) = returns false
My idea was that every time I found a ), replace the part of the string from the previous ( to that ) with the result of the Comparison match. Doing this over and over until only T or F remains, to return true or false.
EDIT:
I expect it to take the string, and keep swapping out what is in between the ( and ) with the result of the comparison until it comes down to a T or F. What is happening, is an error about an incomplete structured construct. The error is in the for loop.
As I am so new to this language, I'm not sure what I'm doing wrong. Do you see it?
let ComparisonSolver (comp:string) =
let mutable trim = comp
trim <- trim.Replace("(", "")
trim <- trim.Replace(")", "")
match trim with
| "T" -> "T"
| "F" -> "F"
| "!T" -> "F"
| "!F" -> "T"
| "T&T" -> "T"
| "F&F" -> "T"
| "T&F" -> "F"
| "F&T" -> "F"
| "T|T" -> "T"
| "F|F" -> "F"
| "T|F" -> "T"
| "F|T" -> "T"
| _ -> ""
let rec BoolParser arg =
let mutable args = arg
if String.length arg = 1 then
match arg with
| "T" -> true
| "F" -> false
else
let mutable ParseStart = 0
let endRange = String.length args
for letter in [0 .. endRange]
if args.[letter] = "(" then
ParseStart <- letter
else if args.[letter] = ")" then
args <- args.Replace(args.[ParseStart .. letter], ComparisonSolver args.[ParseStart .. letter])
BoolParser args
let result = BoolParser "(T)&(F)"
There are a few things you need to correct.
for letter in [0 .. endRange] is missing a do at the end of it - it should be for letter in [0 .. endRange] do
The if comparisons in the for loop are comparing chars with strings. You need to replace "(" and ")" with '(' and ')'
for letter in [0 .. endRange] will go out of range: In F# the array construct [x..y] will go from x to y inclusive. It's a bit like in C# if you had for (int i = 0; i <= array.Length; i++). In F# you can also declare loops like this: for i = 0 to endRange - 1 do.
for letter in [0 .. endRange] will go out of range again: It's going from 0 to endrange, which is the length of args. But args is getting shortened in the for loop, so it will eventually try to get a character from args that's out of range.
Now, the problem with the if..then..else statements, which is what I think you were looking at from the beginning.
if args.[letter] = '(' then
ParseStart <- letter
else if args.[letter] = ')' then
args <- args.Replace(args.[ParseStart .. letter], ComparisonSolver args.[ParseStart .. letter])
BoolParser args
Let's take the code within the two branches as two separate functions.
The first does ParseStart <- letter, which assigns letter to ParseStart. This function returns unit, which is F# equivalent of void.
The second does:
args <- args.Replace(args.[ParseStart .. letter], ComparisonSolver args.[ParseStart .. letter])
BoolParser args
This function returns a bool.
Now when you put them together in an if..then..else statement you have in one branch that results a unit and in the other in a bool. In this case it doesn't know which one to return, so it shows an "expression was expected to have type" error.
I strongly suspect that you wanted to call BoolParser args from outside
the for/if loop. But it's been indented so that F# treats it as part of the else if statement.
There are many ways to parse a boolean expression. It might be a good idea to look at the excellent library FParsec.
http://www.quanttec.com/fparsec/
Another way to implement parsers in F# is to use Active Patterns which can make for readable code
https://learn.microsoft.com/en-us/dotnet/fsharp/language-reference/active-patterns
It's hard to provide good error reporting through Active Patterns but perhaps you can find some inpiration from the following example:
let next s i = struct (s, i) |> Some
// Skips whitespace characters
let (|SkipWhitespace|_|) struct (s, i) =
let rec loop j =
if j < String.length s && s.[j] = ' ' then
loop (j + 1)
else
next s j
loop i
// Matches a specific character: ch
let (|Char|_|) ch struct (s, i) =
if i < String.length s && s.[i] = ch then
next s (i + 1)
else
None
// Matches a specific character: ch
// and skips trailing whitespaces
let (|Token|_|) ch =
function
| Char ch (SkipWhitespace ps) -> Some ps
| _ -> None
// Parses the boolean expressions
let parse s =
let rec term =
function
| Token 'T' ps -> Some (true, ps)
| Token 'F' ps -> Some (false, ps)
| Token '(' (Parse (v, Token ')' ps)) -> Some (v, ps)
| _ -> None
and opReducer p ch reducer =
let (|P|_|) ps = p ps
let rec loop l =
function
| Token ch (P (r, ps)) -> loop (reducer l r) ps
| Token ch _ -> None
| ps -> Some (l, ps)
function
| P (l, ps) -> loop l ps
| _ -> None
and andExpression ps = opReducer term '&' (&&) ps
and orExpression ps = opReducer andExpression '|' (||) ps
and parse ps = orExpression ps
and (|Parse|_|) ps = parse ps
match (struct (s, 0)) with
| SkipWhitespace (Parse (v, _)) -> Some v
| _ -> None
module Tests =
// FsCheck allows us to get better confidence in that the parser actually works
open FsCheck
type Whitespace =
| Space
type Ws = Ws of (Whitespace [])*(Whitespace [])
type Expression =
| Term of Ws*bool
| And of Expression*Ws*Expression
| Or of Expression*Ws*Expression
override x.ToString () =
let orPrio = 1
let andPrio = 2
let sb = System.Text.StringBuilder 16
let ch c = sb.Append (c : char) |> ignore
let token (Ws (l, r)) c =
sb.Append (' ', l.Length) |> ignore
sb.Append (c : char) |> ignore
sb.Append (' ', r.Length) |> ignore
let enclose p1 p2 f =
if p1 > p2 then ch '('; f (); ch ')'
else f ()
let rec loop prio =
function
| Term (ws, v) -> token ws (if v then 'T' else 'F')
| And (l, ws, r) -> enclose prio andPrio <| fun () -> loop andPrio l; token ws '&' ;loop andPrio r
| Or (l, ws, r) -> enclose prio orPrio <| fun () -> loop orPrio l ; token ws '|' ;loop orPrio r
loop andPrio x
sb.ToString ()
member x.ToBool () =
let rec loop =
function
| Term (_, v) -> v
| And (l, _, r) -> loop l && loop r
| Or (l, _, r) -> loop l || loop r
loop x
type Properties() =
static member ``Parsing expression shall succeed`` (expr : Expression) =
let expected = expr.ToBool () |> Some
let str = expr.ToString ()
let actual = str |> parse
expected = actual
let fscheck () =
let config = { Config.Quick with MaxTest = 1000; MaxRejected = 1000 }
Check.All<Properties> config
I have added the code as it stands. It can used on any piece of text I am doing some work in Erlang and I am getting an error message which I have included below.
exception error: no function clause matching string:to_lower({error,[80,75,3,4,20,0,6,0,8,0,0,0,33,0,2020], <<210,108,90,1,0,0,32,5,0,0,19,0,8,2,91,67,111,110,116,
101,110,116,95,84,121,...>>}) (string.erl, line 2084)
in function word_sort:readlines/1 (word_sort.erl, line 17).
I have also included an extract of my code below and I would appreciate if I could get pointers on where I am going wrong.
enter code here -module(word_sort).
enter code here-export([main/1]).
-export([unique/2]).
-export([sort/1]).
-export([readlines/1]).
-export([wordCount/3]).
% ========================================================== %
% Load the file and create a list %
% ========================================================== %
readlines(FileName) ->
io:format("~nLoading File : ~p~n", [FileName]),
{ok, File} = file:read_file(FileName),
Content = unicode:characters_to_list(File),
TokenList = string:tokens(string:to_lower(Content), " .,;:!?~/>'<{}£$%^&()#-=+_[]*#\\\n\r\"0123456789"),
main(TokenList).
% ========================================================== %
% Scan through the text file and find a list of unique words %
% ========================================================== %
main(TokenList) ->
UniqueList = unique(TokenList,[]),
io:format("~nSorted List : ~n"),
SortedList = sort(UniqueList), % Sorts UniqueList into SortedList%
io:format("~nSorted List : "),
io:format("~nWriting to file~n"),
{ok, F} = file:open("unique_words.txt", [write]),
register(my_output_file, F),
U = wordCounter(SortedList,TokenList,0),
io:format("~nUnique : ~p~n", [U]),
io:fwrite("~nComplete~n").
wordCounter([H|T],TokenList,N) ->
%io:fwrite("~p \t: ~p~n", [H,T]),
wordCount(H, TokenList, 0),
wordCounter(T,TokenList,N+1);
wordCounter([], _, N) -> N.
% =============================================================%
%Word count takes the unique word, and searches the original list for occurrences of that word%
%==============================================================%
wordCount(Word,[H|T],N) ->
case Word == H of % checks to see if H is in Seen List
true -> wordCount(Word, T, N+1); % if true, N_Seen = Seen List
false -> wordCount(Word, T, N) % if false, head appends Seen List.
end;
wordCount(Word,[],N) ->
io:fwrite("~p \t: ~p ~n", [N,Word]),
io:format(whereis(my_output_file), "~p \t: ~p ~n", [N,Word]).
%=================================================================================
unique([H|T],Seen) -> % Accepts List of numbers and Seen List
case lists:member(H, Seen) of % checks to see if H is in Seen List
true -> N_Seen = Seen; % if true, N_Seen = Seen List
false -> N_Seen = Seen ++ [H] % if false, head appends Seen List.
end,
unique(T,N_Seen); % calls uniques with Tail and Seen List.
%=================================================================================
unique([],Seen) -> Seen.
sort([Pivot|T]) ->
sort([ X || X <- T, X < Pivot]) ++
[Pivot] ++
sort([ X || X <- T, X >= Pivot]);
sort([]) -> [].
unicode:characters_to_list returned some error.
Variable 'Content' contains error message instead of data.
And string:to_lower() got error message as parameter instead of string.
You need just check what characters_to_list returns to you.
readlines(FileName) ->
io:format("~nLoading File : ~p~n", [FileName]),
{ok, File} = file:read_file(FileName),
case unicode:characters_to_list(File) of
Content when is_list(Content) ->
LCcontent = string:to_lower(Content),
TokenList = string:tokens(LCcontent,
" .,;:!?~/>'<{}£$%^&()#-=+_[]*#\\\n\r\"0123456789"),
main(TokenList);
Err ->
io:format("Cannot read file, got some unicode error ~p~n", [Err])
end.
erlang version 18.3
Got an strange error with Erlang ets:select/1
the following code will do select element from table and take them .
if I do
save(10), %% insert 10 data
remove(3) %% remove 3 data per time
it works
if I do
save(6007), %% insert more datas
remove(400) %% remove 400 data per time
it was bad arg in ets:select(Cont) also, it was not the in the first or second loop, but was always there.
any suggestion?
-record(item, {name, age}).
%% start the table
start() ->
ets:new(example_table, [public, {keypos, 2},
named_table,
{read_concurrency, true},
{write_concurrency, true}]).
%% insert n demo data
save(Limit) ->
All = lists:seq(1 ,Limit),
All_rec = [#item{name = {<<"demo">>, integer_to_binary(V)} , age = V} || V <- All],
ets:insert(example_table, All_rec).
%% remove all data, n data per select
remove(Limit) ->
M_head = #item{name = '$1', _ = '_'},
M_guards = [],
M_result = ['$1'],
M_spec = [{M_head, M_guards, M_result}],
case ets:select(example_table, M_spec, Limit) of
'$end_of_table' ->
0;
{Keys, Cont} ->
remove(example_table, Keys, Cont, 0, [])
end.
remove(Table, [], Cont, Count, _Acc) ->
case ets:select(Cont) of
'$end_of_table' ->
Count;
{Keys, Cont_1} ->
remove(Table, Keys, Cont_1, Count, [])
end;
remove(Table,[Key | T], Cont, Count, Acc) ->
case ets:take(example_table, Key) of
[] ->
remove(Table, T, Cont, Count, Acc);
[Rec] ->
io:format("Rec [~p] ~n", [Rec]),
remove(Table, T, Cont, Count + 1, [Rec | Acc])
end.
stack trace
4> example_remove:save(6007).
true
5> example_remove:remove(500).
** exception error: bad argument
in function ets:select/1
called as ets:select({example_table,304,500,<<>>,
[{<<"demo">>,<<"2826">>},
{<<"demo">>,<<"3837">>},
{<<"demo">>,<<"5120">>},
{<<"demo">>,<<"878">>},
{<<"demo">>,<<"1195">>},
{<<"demo">>,<<"1256">>},
{<<"demo">>,<<"1449">>},
{<<"demo">>,<<"5621">>},
{<<"demo">>,<<"5768">>}],
9})
in call from example_remove:remove/5 (d:/workspace/simple-cache/src/example_remove.erl, line 47)
I believe this happens because you simultaneously iterate over the table and modify it.
I suggest wrapping main remove cycle with guards of safe_fixtable
remove(Limit) ->
ets:safe_fixtable(example_table, true),
M_head = #item{name = '$1', _ = '_'},
M_guards = [],
M_result = ['$1'],
M_spec = [{M_head, M_guards, M_result}],
R = case ets:select(example_table, M_spec, Limit) of
'$end_of_table' ->
0;
{Keys, Cont} ->
remove(example_table, Keys, Cont, 0, [])
end,
ets:safe_fixtable(example_table, false),
R.