Mixfix operators with Fparsec - parsing

How could we parse mixfix operators with FParsec?
I tried to create a class type that looks like OperatorPrecedenceParser:
let identifier = many1Satisfy (fun c -> isLetter c || isDigit c || c = ''')
let symbol =
[ '!'; '#'; '&'; ','; '%'; '^'; '.';
'§'; '*'; '°'; '$'; '~'; ':'; '-';
'+'; '='; '?'; '/'; '>'; '<'; '|'; ]
module Operator =
type public Fixity =
| Left | Right | Neutral
let defaultPrecedence = 3
type public Mixfix =
{ nameParts: Identifier list; // the components of the mixfix function
fullname: Identifier; // the full operator name
mutable fixity: Fixity; // the fixity of the function
mutable prec: int; // operator precedence
arity: int } // number of arguments (starts to 1)
with member this.update fixity prec =
this.fixity <- fixity
this.prec <- prec
let private set'nameParts ids =
List.filter (fun id -> id <> "_") ids
let private set'fullname (id: Identifier list) =
System.String.Join("", id)
/// by default value
let private set'fixity (id: Identifier list) =
if id.First() = "_"
then Left
elif id.Last() <> "_"
then Neutral
else Right
let private set'arity (id: Identifier list) =
int (List.countWith (fun s -> s = "_") id)
let identifier = attempt identifier <|> operator
let makeApp expr1 expr2 = App(expr1, expr2)
let makeApps expr1 (exprs: SL list) =
List.fold
(fun acc e -> makeApp acc e)
(makeApp expr1 (exprs.First()))
(exprs
|> List.rev
|> List.dropLast
|> List.rev)
type MixFoxOp (ptv: Parser<_, _> option) =
member private this.operators : Mixfix list ref = ref []
member private this.termParser : Parser<_, _> list ref =
if ptv.IsNone
then ref []
else ref [ptv.Value]
member private this.termValue = choice !this.termParser
member private this.addMixFix (mixfix: Mixfix) =
this.operators.Value <- this.operators.Value # [mixfix]
member public this.contains id =
List.exists (fun s -> s.fullname = id) this.operators.Value
member private this.addMixFixOperator id fixity prec =
if this.contains (set'fullname id) = false
then this.addMixFix
{ nameParts = set'nameParts id;
fullname = set'fullname id;
fixity = fixity;
prec = prec;
arity = set'arity id }
else failwithf "Already defined function"
member public this.addOperator (id: Identifier list) (fixity: Fixity option) (prec: int option) =
this.addMixFixOperator id
(if fixity.IsNone then set'fixity id else fixity.Value)
(if prec.IsNone then defaultPrecedence else prec.Value)
member public this.setTermValue ptvalue =
this.termParser := !this.termParser # [ptvalue]
()
member public this.expressionParser =
parse {
let mutable nameParts : Identifier list = []
let mutable valueParts : Value list = []
let addNamePartRet name =
nameParts <- nameParts # [name]
preturn name
let addValuePartRet value =
valueParts <- valueParts # [value]
nameParts <- nameParts # ["_"]
preturn value
do! (attempt ((attempt identifier <|> symbol) >>= fun id -> addNamePartRet id) <|> (ws >>% "")) >>?
sepEndBy
(bws (this.termValue (* <|> this.expressionParser*)) >>= fun x -> addValuePartRet x)
(bws identifier
>>= fun id -> addNamePartRet id) >>% ()
let fullname = set'fullname nameParts
if this.contains fullname
then return makeApps (Var fullname) valueParts
elif valueParts.Length = 1 && nameParts.First() = "_"
then return valueParts.First()
else fail (sprintf "Unknown mixfix function: `%s`" fullname)
}
With a use such as this:
let opp = new Operator.MixFixOp(Some value)
opp.addOperator ["|"; "_"; "|"] None None // abscisse
opp.addOperator ["_"; "!"] None None // factorial
opp.addOperator ["_"; ","; "_"] None None // tuple
opp.addOperator ["if"; "_"; "then"; "_"; "else"; "_"]
// ...
let test = run (opp.expressionParser .>>? eof) "|32|"
And for example:
type Expr =
| Var of string
| App of Expr * Expr
| Int of int
let pint = pint32 |>> Int
let pvar = identifier |>> Var
let value' = attempt pint <|> pvar
let app = chainl1 value' (spaces1 >>% fun x y -> App(x, y))
let value = app
let opp = new Operator.MixFixOp(Some value)
...
But, already, it is not very satisfying as a method, because it requires knowing in advance which parser to use as an operation term, so we can't use operators in these parsers (a setTermValue method has been added in the type, but does not work, indeed, with each use, termParser remains empty, does not update), then we do not know how to use expressionParser without getting an infinite loop, it doesn't manage fixity or precedence, and finally, there are conflicts with the parsers to use as a term, if we expect for example an identifier, as in the examples.
In the long run, thanks to this type-class-parser, I would like to be able to parse expressions such as:
3 + 1 + 4
|32!|
if x > y then 0 else 1
How could I improve this module, and make it operational?
I would be very grateful if you could help me :)

Related

Indentation has the last word on error messages (UserState) - FParsec

I have a small indentation management module with FParsec (found here); it works wonderfully well, but the only concern is that, when an error is encountered in the stream to be parsed, most of the time, FParsec returns the error message from the indentation manager, i.e. the UserState (correct me if I'm wrong on this point); which is problematic because it makes the errors very blurry, and all the same... How can I display indentation errors only when they are necessary?
Here is the module used for indentation:
module IndentParser
open FParsec
type Indentation =
| Fail
| Any
| Greater of Position
| Exact of Position
| AtLeast of Position
| StartIndent of Position
with
member this.Position = match this with
| Any | Fail -> None
| Greater p -> Some p
| Exact p -> Some p
| AtLeast p -> Some p
| StartIndent p -> Some p
type IndentState<'T> = { Indent : Indentation; UserState : 'T }
type CharStream<'T> = FParsec.CharStream<IndentState<'T>>
type IndentParser<'T, 'UserState> = Parser<'T, IndentState<'UserState>>
let indentState u = {Indent = Any; UserState = u}
let runParser p u s = runParserOnString p (indentState u) "" s
let runParserOnFile p u path = runParserOnFile p (indentState u) path System.Text.Encoding.UTF8
let getIndentation : IndentParser<_,_> =
fun stream -> match stream.UserState with
| {Indent = i} -> Reply i
let getUserState : IndentParser<_,_> =
fun stream -> match stream.UserState with
| {UserState = u} -> Reply u
let putIndentation newi : IndentParser<unit, _> =
fun stream ->
stream.UserState <- {stream.UserState with Indent = newi}
Reply(Unchecked.defaultof<unit>)
let failf fmt = fail << sprintf fmt
let acceptable i (pos : Position) =
match i with
| Any _ -> true
| Fail -> false
| Greater bp -> bp.Column < pos.Column
| Exact ep -> ep.Column = pos.Column
| AtLeast ap -> ap.Column <= pos.Column
| StartIndent _ -> true
let nestableIn i o =
match i, o with
| Greater i, Greater o -> o.Column < i.Column
| Greater i, Exact o -> o.Column < i.Column
| Exact i, Exact o -> o.Column = i.Column
| Exact i, Greater o -> o.Column <= i.Column
| _, _ -> true
let tokeniser p = parse {
let! pos = getPosition
let! i = getIndentation
if acceptable i pos then return! p
else return! fail "incorrect indentation"
}
let nestP i o p = parse {
do! putIndentation i
let! x = p
do! notFollowedBy (tokeniser anyChar) <?> (sprintf "unterminated %A" i)
do! putIndentation o
return x
}
let nest indentor p = parse {
let! outerI = getIndentation
let! curPos = getPosition
let innerI = indentor curPos
if nestableIn innerI outerI
then return! nestP innerI outerI p
else return! nestP Fail outerI p
}
let nestWithPos indentor pos p = parse {
let! outerI = getIndentation
let innerI = indentor pos
if nestableIn innerI outerI
then return! nestP innerI outerI p
else return! nestP Fail outerI p
}
let neglectIndent p = parse {
let! o = getIndentation
do! putIndentation Any
let! x = p
do! putIndentation o
return x
}
let checkIndent<'u> : IndentParser<unit, 'u> = tokeniser (preturn ())
let indented<'a,'u> i (p : Parser<'a,_>) : IndentParser<_, 'u> = parse {
do! putIndentation i
do! spaces
return! tokeniser p
}
/// Allows to check if the position of the parser currently being analyzed (`p`)
/// is on the same line as the defined position (`pos`).
let exact<'a,'u> pos p: IndentParser<'a, 'u> = indented (Exact pos) p
/// Allows to check if the position of the parser currently being analyzed (`p`)
/// is further away than the defined position (`pos`).
let greater<'a,'u> pos p: IndentParser<'a, 'u> = indented (Greater pos) p
/// Allows to check if the position of the parser currently being analyzed (`p`)
/// is on the same OR line further than the defined position (`pos`).
let atLeast<'a,'u> pos p: IndentParser<'a, 'u> = indented (AtLeast pos) p
/// Simply check if the parser (`p`) exists, regardless of its position in the text to be analyzed.
let any<'a,'u> pos p: IndentParser<'a, 'u> = indented Any p
let newline<'u> : IndentParser<unit, 'u> = many (skipAnyOf " \t" <?> "whitespace") >>. newline |>> ignore
let rec blockOf p = parse {
do! spaces
let! pos = getPosition
let! x = exact pos p
let! xs = attempt (exact pos <| blockOf p) <|> preturn []
return x::xs
}
and here is an example of the problem encountered:
open FParsec
open IndentParser
// ---------- AST ----------
type Statement
= Let of string * Expr
and Expr
= Tuple of Expr list
| Literal of Literal
and Literal
= Int of int
| Float of float
| Char of char
// ---------- Parser ----------
let inline pstr's s = stringReturn s s <?> sprintf "`%s`" s
let inline pstr'u s = stringReturn s () <?> sprintf "`%s`" s
let identifier = manySatisfy (fun c -> isLetter c || c = ''')
let comment = pstr'u "//" >>. skipRestOfLine true <?> ""
let numberFormat =
NumberLiteralOptions.AllowBinary
||| NumberLiteralOptions.AllowMinusSign
||| NumberLiteralOptions.AllowHexadecimal
||| NumberLiteralOptions.AllowOctal
||| NumberLiteralOptions.AllowPlusSign
||| NumberLiteralOptions.AllowFraction
let number<'u> : IndentParser<Literal, 'u> =
(numberLiteral numberFormat "number" |>> fun nl ->
if nl.IsInteger then Int(int nl.String)
else Float(float nl.String))
let char<'u> : IndentParser<Literal, 'u> =
((between (pstr'u "'") (pstr'u "'")
(satisfy (fun c -> c <> '\'')) <?> "char literal") |>> Char)
let rec let'parser =
parse { let! pos = getPosition
do! exact pos (pstr'u "let" <?> "let statement")
let! name = greater pos identifier <?> "identifier"
do! greater pos (pstr'u "=" <?> "value assignment")
let! value = greater pos expression
return Let(name, value) }
and tuple'parser =
parse { let! pos = getPosition
do! exact pos (pstr'u "(" <?> "tuple")
let! uplets = greater pos (sepBy1 expression (pstr'u ","))
do! greater pos (pstr'u ")" <?> "right parenthese")
return Tuple uplets }
and literal'parser = attempt number <|> char |>> Literal
and expression =
spaces >>? (attempt tuple'parser <|> literal'parser)
and statement = spaces >>? let'parser .>>? spaces .>>? (attempt comment <|> (spaces >>% ()))
// ---------- Test ----------
System.Console.Clear()
let res = runParser (spaces >>? blockOf statement .>>? (spaces .>>? eof)) () #"
let foo = (0, 1) // it works well
let bar = 887 // it works well
let oof = 'x' // it works well
let rab = // it fail with 'incorrect indentation' (without this comment)
let ofo = (0, 2, // it fail with 'incorrect indentation' (without this comment)
"
printfn "%A" res
It's really annoying...
Would someone explain to me how to solve this problem?

Parsing the signature of a function - Error with the arrow type - FParsec + indentation

I had already asked a question about how to parse the arrow type, this is not a duplicate, but rather an adaptation with the indentation based syntax.
Indeed, I would like to be able to analyze a syntax close to that of the ML family languages. I also introduced the syntax of the type signature of a function in Haskell, so this:
myFunction :: atype
My parser works very well for all kinds of signature types, except the arrow type when it is "alone":
foo :: a // ok
foo :: [a] // ok
foo :: (a, a) // ok
foo :: [a -> a] // ok
foo :: (a -> a, a) // ok
foo :: a -> a // error
Same for the creation of functions (to make it simple, I just expected a number as a value):
foo: a = 0 // ok
foo: [a] = 0 // ok
foo: (a, a) = 0 // ok
foo: [a -> a] = 0 // ok
foo: (a -> a, a) = 0 // ok
foo: a -> a = 0 // error
Without the indentation, all these cases work a priori.
I tried a module to parse the indentation other than the FParsec wiki, just to try and evaluate a little. It comes from there, and here is the necessary and sufficient module code for the question:
module IndentParser =
type Indentation =
| Fail
| Any
| Greater of Position
| Exact of Position
| AtLeast of Position
| StartIndent of Position
with
member this.Position = match this with
| Any | Fail -> None
| Greater p -> Some p
| Exact p -> Some p
| AtLeast p -> Some p
| StartIndent p -> Some p
type IndentState<'T> = { Indent : Indentation; UserState : 'T }
type CharStream<'T> = FParsec.CharStream<IndentState<'T>>
type IndentParser<'T, 'UserState> = Parser<'T, IndentState<'UserState>>
let indentState u = {Indent = Any; UserState = u}
let runParser p u s = runParserOnString p (indentState u) "" s
let runParserOnFile p u path = runParserOnFile p (indentState u) path System.Text.Encoding.UTF8
let getIndentation : IndentParser<_,_> =
fun stream -> match stream.UserState with
| {Indent = i} -> Reply i
let getUserState : IndentParser<_,_> =
fun stream -> match stream.UserState with
| {UserState = u} -> Reply u
let putIndentation newi : IndentParser<unit, _> =
fun stream ->
stream.UserState <- {stream.UserState with Indent = newi}
Reply(Unchecked.defaultof<unit>)
let failf fmt = fail << sprintf fmt
let acceptable i (pos : Position) =
match i with
| Any _ -> true
| Fail -> false
| Greater bp -> bp.Column < pos.Column
| Exact ep -> ep.Column = pos.Column
| AtLeast ap -> ap.Column <= pos.Column
| StartIndent _ -> true
let tokeniser p = parse {
let! pos = getPosition
let! i = getIndentation
if acceptable i pos then return! p
else return! failf "incorrect indentation at %A" pos
}
let indented<'a,'u> i (p : Parser<'a,_>) : IndentParser<_, 'u> = parse {
do! putIndentation i
do! spaces
return! tokeniser p
}
/// Allows to check if the position of the parser currently being analyzed (`p`)
/// is on the same line as the defined position (`pos`).
let exact<'a,'u> pos p: IndentParser<'a, 'u> = indented (Exact pos) p
/// Allows to check if the position of the parser currently being analyzed (`p`)
/// is further away than the defined position (`pos`).
let greater<'a,'u> pos p: IndentParser<'a, 'u> = indented (Greater pos) p
/// Allows to check if the position of the parser currently being analyzed (`p`)
/// is on the same OR line further than the defined position (`pos`).
let atLeast<'a,'u> pos p: IndentParser<'a, 'u> = indented (AtLeast pos) p
/// Simply check if the parser (`p`) exists, regardless of its position in the text to be analyzed.
let any<'a,'u> pos p: IndentParser<'a, 'u> = indented Any p
let newline<'u> : IndentParser<unit, 'u> = many (skipAnyOf " \t" <?> "whitespace") >>. newline |>> ignore
let rec blockOf p = parse {
do! spaces
let! pos = getPosition
let! x = exact pos p
let! xs = attempt (exact pos <| blockOf p) <|> preturn []
return x::xs
}
Now, here is the code I'm trying to fix for the problem I encountered:
module Parser =
open IndentParser
type Identifier = string
type Type =
| Typename of Identifier
| Tuple of Type list
| List of Type
| Arrow of Type * Type
| Infered
type Expression =
| Let of Identifier * Type * int
| Signature of Identifier * Type
type Program = Program of Expression list
// Utils -----------------------------------------------------------------
let private ws = spaces
/// All symbols granted for the "opws" parser
let private allowedSymbols =
['!'; '#'; '#'; '$'; '%'; '+'; '&'; '*'; '('; ')'; '-'; '+'; '='; '?'; '/'; '>'; '<'; '|']
/// Parse an operator and white spaces around it: `ws >>. p .>> ws`
let inline private opws str =
ws >>.
(tokeniser (pstring str >>?
(nextCharSatisfiesNot
(isAnyOf (allowedSymbols # ['"'; '''])) <?> str))) .>> ws
let private identifier =
(many1Satisfy2L isLetter
(fun c -> isLetter c || isDigit c) "identifier")
// Types -----------------------------------------------------------------
let rec typename = parse {
let! name = ws >>. identifier
return Type.Typename name
}
and tuple_type = parse {
let! types = between (opws "(") (opws ")") (sepBy (ws >>. type') (opws ","))
return Type.Tuple types
}
and list_type = parse {
let! ty = between (opws "[") (opws "]") type'
return Type.List ty
}
and arrow_type =
chainr1 (typename <|> tuple_type <|> list_type) (opws "->" >>% fun t1 t2 -> Arrow(t1, t2))
and type' =
attempt arrow_type <|>
attempt typename <|>
attempt tuple_type <|>
attempt list_type
// Expressions -----------------------------------------------------------------
let rec private let' = parse {
let! pos = getPosition
let! id = exact pos identifier
do! greater pos (opws ":")
let! ty = greater pos type'
do! greater pos (opws "=")
let! value = greater pos pint32
return Expression.Let(id, ty, value)
}
and private signature = parse {
let! pos = getPosition
let! id = exact pos identifier
do! greater pos (opws "::")
let! ty = greater pos type'
return Expression.Signature(id, ty)
}
and private expression =
attempt let'
and private expressions = blockOf expression <?> "expressions"
let private document = ws >>. expressions .>> ws .>> eof |>> Program
let private testType = ws >>. type' .>> ws .>> eof
let rec parse code =
runParser document () code
|> printfn "%A"
open Parser
parse #"
foo :: a -> a
"
Here is the error message obtained:
There is no reference to indentation in the error message, that's what troubles also, because if I implement an identical parser, except for indentation parsing, it works.
Could you put me on the right way?
EDIT
Here is the "fixed" code (the use of the function signature parser was missing + removal of unnecessary attempt):
open FParsec
// module IndentParser
module Parser =
open IndentParser
type Identifier = string
type Type =
| Typename of Identifier
| Tuple of Type list
| List of Type
| Arrow of Type * Type
| Infered
type Expression =
| Let of Identifier * Type * int
| Signature of Identifier * Type
type Program = Program of Expression list
// Utils -----------------------------------------------------------------
let private ws = spaces
/// All symbols granted for the "opws" parser
let private allowedSymbols =
['!'; '#'; '#'; '$'; '%'; '+'; '&'; '*'; '('; ')'; '-'; '+'; '='; '?'; '/'; '>'; '<'; '|']
/// Parse an operator and white spaces around it: `ws >>. p .>> ws`
let inline private opws str =
ws >>.
(tokeniser (pstring str >>?
(nextCharSatisfiesNot
(isAnyOf (allowedSymbols # ['"'; '''])) <?> str))) .>> ws
let private identifier =
(many1Satisfy2L isLetter
(fun c -> isLetter c || isDigit c) "identifier")
// Types -----------------------------------------------------------------
let rec typename = parse {
let! name = ws >>. identifier
return Type.Typename name
}
and tuple_type = parse {
let! types = between (opws "(") (opws ")") (sepBy (ws >>. type') (opws ","))
return Type.Tuple types
}
and list_type = parse {
let! ty = between (opws "[") (opws "]") type'
return Type.List ty
}
and arrow_type =
chainr1 (typename <|> tuple_type <|> list_type) (opws "->" >>% fun t1 t2 -> Arrow(t1, t2))
and type' =
attempt arrow_type <|>
typename <|>
tuple_type <|>
list_type
// Expressions -----------------------------------------------------------------
let rec private let' = parse {
let! pos = getPosition
let! id = exact pos identifier
do! greater pos (opws ":")
let! ty = greater pos type'
do! greater pos (opws "=")
let! value = greater pos pint32
return Expression.Let(id, ty, value)
}
and private signature = parse {
let! pos = getPosition
let! id = exact pos identifier
do! greater pos (opws "::")
let! ty = greater pos type'
return Expression.Signature(id, ty)
}
and private expression =
attempt let' <|>
signature
and private expressions = blockOf expression <?> "expressions"
let private document = ws >>. expressions .>> ws .>> eof |>> Program
let private testType = ws >>. type' .>> ws .>> eof
let rec parse code =
runParser document () code
|> printfn "%A"
open Parser
System.Console.Clear()
parse #"
foo :: a -> a
"
So, here are the new error messages:
and
At the moment, your code is failing on the :: signature because you haven't actually used your signature parser anywhere. You have defined expression as attempt let', but I think you meant to write attempt signature <|> attempt let'. That is why your test is failing on the second colon of ::, because it's matching the single colon of a let' and then not expecting the second colon.
Also, I think your chaining multiple attempt combinators together like attempt a <|> attempt b <|> attempt c is going to cause you problems somewhere, and that you should remove the final attempt, e.g., attempt a <|> attempt b <|> c. If you use attempt in all the possible choices, you'll end up with a parser that can succeed by parsing nothing, which is often not what you intended.
Update: I think I've found the cause and the solution.
Summary: In your opws parser, replace the line ws >>. with ws >>?.
Explanation: In all the sepBy variants (and chainr1 is a sepBy variant), FParsec expects that the separator parser will either succeed, or will fail without consuming input. (If the separator fails after consuming input, FParsec considers the entire sepBy-family parser to have failed in its entirety.) But your opws parser will consume whitespace, then fail if it doesn't find a correct operator. So when your arrow_type parser parses the string a -> a followed by a newline, the arrow after the first a is correctly matched, then it sees the second a, and then it tries to find another arrow. Since what follows next is at least one whitespace character (newlines count as whitespace), the opws "->" parser ends up consuming some input before it fails. (It fails because after that whitespace is the end of the file, not another -> token). This makes the chainr1 combinator fail, so arrow_type fails and your a -> a parser ends up being parsed as a single type a. (At which point the arrow is now unexpected).
By using >>? in your definition of opws, you ensure that if the second part of the parser fails, it will backtrack to before it matched any whitespace. That ensures that the separator parser will fail without matching input and without advancing the parse position in the character stream. Therefore, the chainr1 parser succeeds after parsing a -> a and you get the expected results.

Convert a String to Custom Type Haskell [duplicate]

This question already has answers here:
Reading file into types - Haskell
(3 answers)
Closed 6 years ago.
Hello there I am trying to convert this String
Blade Runner,Ridley Scott,1982,Amy,5,Bill,8,Ian,7,Kevin,9,Emma,4,Sam,7,Megan,4​
To a Film Type
type UserRatings = (String,Int)
type Film = (Title, Director, Year , [UserRatings])
from a text file that contains 25 films
this is what i tried to do
maybeReadTup :: String ->(String, Int)
maybeReadTup s = do
[(n, [c])] <- return $ reads s
return [(n, [c])]
parseLines :: [String] -> Film
parseLines list
| isInt(list !! 3) = (list !! 0,(list !! 1), read (list !! 2), maybeReadTup [ (list!!1,read (list !! 2) )])
isInt :: String ->Bool
isInt[] = True
isInt (x:xs)
| isNumber x = True && isInt xs
| otherwise = False
parseChars :: String -> String -> [String]
parseChars [] _ = []
parseChars (x:xs) stringCount
| x == ',' = [stringCount] ++ parseChars xs ""
| otherwise = (parseChars xs (stringCount ++ [x]))
parseAll :: [String] -> [Film]
parseAll [] = []
parseAll (x:xs) = parseLines (parseChars x "") : (parseAll xs)
But i get wrong the types can someone please help me parse this UserRatings tuple type [(String,Int)] ? And can you help me understand how parseLines work? I'm new in Haskell
Here's a solution using Text.Parsec:
import Text.Parsec
import Text.Parsec.String
type UserRatings = (String, Int)
type Title = String
type Director = String
type Year = Int
type Film = (Title, Director, Year, [UserRatings])
str :: Parser String
str = many1 (noneOf ",")
int :: Parser Int
int = read <$> many1 digit
tup :: Parser UserRatings
tup = do user <- str
_ <- oneOf ","
rating <- int
return (user, rating)
parser :: Parser Film
parser = do title <- str
_ <- oneOf ","
director <- str
_ <- oneOf ","
year <- int
_ <- oneOf ","
ratings <- sepBy tup (oneOf ",")
eof
return (title, director, year, ratings)
testString :: String
testString = "Blade Runner,Ridley Scott,1982,Amy,5,Bill,8,Ian,7,Kevin,9,Emma,4,Sam,7,Megan,4"
main :: IO ()
main = print $ runParser parser () "testString" testString

Use FParsec to parse a self-describing input

I'm using FParsec to parse an input that describes its own format. For example, consider this input:
int,str,int:4,'hello',3
The first part of the input (before the colon) describes the format of the second part of the input. In this case, the format is int, str, int, which means that the actual data consists of three comma-separated values of the given types, so the result should be 4, "hello", 3.
What is the best way to parse something like this with FParsec?
I've pasted my best effort below, but I'm not happy with it. Is there a better way to do this that is cleaner, less stateful, and less reliant on the parse monad? I think this depends on smarter management of UserState, but I don't know how to do it. Thanks.
open FParsec
type State = { Formats : string[]; Index : int32 }
with static member Default = { Formats = [||]; Index = 0 }
type Value =
| Integer of int
| String of string
let parseFormat : Parser<_, State> =
parse {
let! formats =
sepBy
(pstring "int" <|> pstring "str")
(skipString ",")
|>> Array.ofList
do! updateUserState (fun state -> { state with Formats = formats })
}
let parseValue format =
match format with
| "int" -> pint32 |>> Integer
| "str" ->
between
(skipString "'")
(skipString "'")
(manySatisfy (fun c -> c <> '\''))
|>> String
| _ -> failwith "Unexpected"
let parseValueByState =
parse {
let! state = getUserState
let format = state.Formats.[state.Index]
do! setUserState { state with Index = state.Index + 1}
return! parseValue format
}
let parseData =
sepBy
parseValueByState
(skipString ",")
let parse =
parseFormat
>>. skipString ":"
>>. parseData
[<EntryPoint>]
let main argv =
let result = runParserOnString parse State.Default "" "int,str,int:4,'hello',3"
printfn "%A" result
0
There seem to be several problems with the original code, so I took my liberty to rewrite it from scratch.
First, several library functions that may appear useful in other FParsec-related projects:
/// Simple Map
/// usage: let z = Map ["hello" => 1; "bye" => 2]
let (=>) x y = x,y
let makeMap x = new Map<_,_>(x)
/// A handy construct allowing NOT to write lengthy type definitions
/// and also avoid Value Restriction error
type Parser<'t> = Parser<'t, UserState>
/// A list combinator, inspired by FParsec's (>>=) combinator
let (<<+) (p1: Parser<'T list>) (p2: Parser<'T>) =
p1 >>= fun x -> p2 >>= fun y -> preturn (y::x)
/// Runs all parsers listed in the source list;
/// All but the trailing one are also combined with a separator
let allOfSepBy separator parsers : Parser<'T list> =
let rec fold state =
function
| [] -> pzero
| hd::[] -> state <<+ hd
| hd::tl -> fold (state <<+ (hd .>> separator)) tl
fold (preturn []) parsers
|>> List.rev // reverse the list since we appended to the top
Now, the main code. The basic idea is to run parsing in three steps:
Parse out the keys (which are plain ASCII strings)
Map these keys to actual Value parsers
Run these parsers in order
The rest seems to be commented within the code. :)
/// The resulting type
type Output =
| Integer of int
| String of string
/// tag to parser mappings
let mappings =
[
"int" => (pint32 |>> Integer)
"str" => (
manySatisfy (fun c -> c <> '\'')
|> between (skipChar ''') (skipChar ''')
|>> String
)
]
|> makeMap
let myProcess : Parser<Output list> =
let pKeys = // First, we parse out the keys
many1Satisfy isAsciiLower // Parse one key; keys are always ASCII strings
|> sepBy <| (skipChar ',') // many keys separated by comma
.>> (skipChar ':') // all this with trailing semicolon
let pValues = fun keys ->
keys // take the keys list
|> List.map // find the required Value parser
// (NO ERROR CHECK for bad keys)
(fun p -> Map.find p mappings)
|> allOfSepBy (skipChar ',') // they must run in order, comma-separated
pKeys >>= pValues
Run on string: int,int,str,int,str:4,42,'hello',3,'foobar'
Returned: [Integer 4; Integer 42; String "hello"; Integer 3; String "foobar"]
#bytebuster beat me to it but I still post my solution. The technique is similar to #bytebuster.
Thanks for an interesting question.
In compilers I believe the preferred technique is to parse the text into an AST and on that run a type-checker. For this example a potentially simpler technique would be that parsing the type definitions returns a set of parsers for the values. These parsers are then applied on the rest of the string.
open FParsec
type Value =
| Integer of int
| String of string
type ValueParser = Parser<Value, unit>
let parseIntValue : Parser<Value, unit> =
pint32 |>> Integer
let parseStringValue : Parser<Value, unit> =
between
(skipChar '\'')
(skipChar '\'')
(manySatisfy (fun c -> c <> '\''))
<?> "string"
|>> String
let parseValueParser : Parser<ValueParser, unit> =
choice
[
skipString "int" >>% parseIntValue
skipString "str" >>% parseStringValue
]
let parseValueParsers : Parser<ValueParser list, unit> =
sepBy1
parseValueParser
(skipChar ',')
// Runs a list of parsers 'ps' separated by 'sep' parser
let sepByList (ps : Parser<'T, unit> list) (sep : Parser<unit, unit>) : Parser<'T list, unit> =
let rec loop adjust ps =
match ps with
| [] -> preturn []
| h::t ->
adjust h >>= fun v -> loop (fun pp -> sep >>. pp) t >>= fun vs -> preturn (v::vs)
loop id ps
let parseLine : Parser<Value list, unit> =
parseValueParsers .>> skipChar ':' >>= (fun vps -> sepByList vps (skipChar ',')) .>> eof
[<EntryPoint>]
let main argv =
let s = "int,str,int:4,'hello',3"
let r = run parseLine s
printfn "%A" r
0
Parsing int,str,int:4,'hello',3 yields Success: [Integer 4; String "hello";Integer 3].
Parsing int,str,str:4,'hello',3 (incorrect) yields:
Failure:
Error in Ln: 1 Col: 23
int,str,str:4,'hello',3
^
Expecting: string
I rewrote #FuleSnabel's sepByList as follows to help me understand it better. Does this look right?
let sepByList (parsers : Parser<'T, unit> list) (sep : Parser<unit, unit>) : Parser<'T list, unit> =
let rec loop adjust parsers =
parse {
match parsers with
| [] -> return []
| parser :: tail ->
let! value = adjust parser
let! values = loop (fun parser -> sep >>. parser) tail
return value :: values
}
loop id parsers

F#, FParsec, and Updating UserState

Okay, since my last question elicited no responses, I'm forging ahead in a different direction. Lol!
I can't find any examples beyond the official documentation on managing user state, or accessing the results of a prior parser.
N.b. This code does not compile.
namespace MultipartMIMEParser
open FParsec
open System.IO
type Header = { name : string
; value : string
; addl : (string * string) list option }
type Content = Content of string
| Post of Post list
and Post = { headers : Header list
; content : Content }
type private UserState = { Boundary : string }
with static member Default = { Boundary="" }
module internal P =
let ($) f x = f x
let undefined = failwith "Undefined."
let ascii = System.Text.Encoding.ASCII
let str cs = System.String.Concat (cs:char list)
let makeHeader ((n,v),nvps) = { name=n; value=v; addl=nvps}
let runP p s = match runParserOnStream p UserState.Default "" s ascii with
| Success (r,_,_) -> r
| Failure (e,_,_) -> failwith (sprintf "%A" e)
let blankField = parray 2 newline
let delimited d e =
let pEnd = preturn () .>> e
let part = spaces >>. (manyTill $ noneOf d $ (attempt (preturn () .>> pstring d) <|> pEnd)) |>> str
in part .>>. part
let delimited3 firstDelimiter secondDelimiter thirdDelimiter endMarker =
delimited firstDelimiter endMarker
.>>. opt (many (delimited secondDelimiter endMarker
>>. delimited thirdDelimiter endMarker))
// TODO: This is the parser I'm asking about.
let pHeader =
let includesBoundary s = undefined
let setBoundary b = { Boundary=b }
in delimited3 ":" ";" "=" blankField
|>> makeHeader
>>. fun stream -> if includesBoundary // How do I access the output from makeHeader here?
then stream.UserState <- setBoundary b // I need b to be read from the output of makeHeader.
Reply ()
else Reply ()
let pHeaders = manyTill pHeader $ attempt (preturn () .>> blankField)
// N.b. This is the mess I'm currently wrestling with. It does not compile, and is
// not sound yet.
let rec pContent boundary =
match boundary with
| "" -> // Content is text.
let line = restOfLine false
in pipe2 pHeaders (manyTill line $ attempt (preturn () .>> blankField))
$ fun h c -> { headers=h
; content=Content $ System.String.Join (System.Environment.NewLine,c) }
| _ -> // Content contains boundaries.
let b = "--"+boundary
let p = pipe2 pHeaders (pContent b) $ fun h c -> { headers=h; content=c }
in skipString b >>. manyTill p (attempt (preturn () .>> blankField))
let pStream = runP (pipe2 pHeaders pContent $ fun h c -> { headers=h; content=c })
type MParser (s:Stream) =
let r = P.pStream s
let findHeader name =
match r.headers |> List.tryFind (fun h -> h.name.ToLower() = name) with
| Some h -> h.value
| None -> ""
member p.Boundary =
let isBoundary ((s:string),_) = s.ToLower() = "boundary"
let header = r.headers
|> List.tryFind (fun h -> if h.addl.IsSome
then h.addl.Value |> List.exists isBoundary
else false)
in match header with
| Some h -> h.addl.Value |> List.find isBoundary |> snd
| None -> ""
member p.ContentID = findHeader "content-id"
member p.ContentLocation = findHeader "content-location"
member p.ContentSubtype = findHeader "type"
member p.ContentTransferEncoding = findHeader "content-transfer-encoding"
member p.ContentType = findHeader "content-type"
member p.Content = r.content
member p.Headers = r.headers
member p.MessageID = findHeader "message-id"
member p.MimeVersion = findHeader "mime-version"
A truncated example of the POST I am trying to parse follows:
content-type: Multipart/related; boundary="RN-Http-Body-Boundary"; type="multipart/related"
--RN-Http-Body-Boundary
Message-ID: <25845033.1160080657073.JavaMail.webmethods#exshaw>
Mime-Version: 1.0
Content-Type: multipart/related; type="application/xml";
boundary="----=_Part_235_11184805.1160080657052"
------=_Part_235_11184805.1160080657052
Content-Type: Application/XML
Content-Transfer-Encoding: binary
Content-Location: RN-Preamble
Content-ID: <1430586.1160080657050.JavaMail.webmethods#exshaw>
XML document begins here...
So basically, what you want to do in pHeader is to use the parser as a monad, rather than an applicative. Based on your code style you come from Haskell so I'll assume you know these words. Something like this then:
let pHeader =
let includesBoundary s = undefined
let setBoundary b = { Boundary=b }
in delimited3 ":" ";" "=" blankField
|>> makeHeader
>>= fun header stream ->
if includesBoundary header
then let b = undefined // some expression including header, if I understood correctly
stream.UserState <- setBoundary b
Reply ()
else Reply ()
Or you can write it in a computation expression (which would correspond to do-notation in Haskell):
let pHeader =
let includesBoundary s = undefined
let setBoundary b = { Boundary=b }
parse {
let! header =
delimited3 ":" ";" "=" blankField
|>> makeHeader
return! fun stream ->
if includesBoundary header
then let b = undefined // some expression including header, if I understood correctly
stream.UserState <- setBoundary b
Reply ()
else Reply ()
}

Resources