Use FParsec to parse a self-describing input - f#

I'm using FParsec to parse an input that describes its own format. For example, consider this input:
int,str,int:4,'hello',3
The first part of the input (before the colon) describes the format of the second part of the input. In this case, the format is int, str, int, which means that the actual data consists of three comma-separated values of the given types, so the result should be 4, "hello", 3.
What is the best way to parse something like this with FParsec?
I've pasted my best effort below, but I'm not happy with it. Is there a better way to do this that is cleaner, less stateful, and less reliant on the parse monad? I think this depends on smarter management of UserState, but I don't know how to do it. Thanks.
open FParsec
type State = { Formats : string[]; Index : int32 }
with static member Default = { Formats = [||]; Index = 0 }
type Value =
| Integer of int
| String of string
let parseFormat : Parser<_, State> =
parse {
let! formats =
sepBy
(pstring "int" <|> pstring "str")
(skipString ",")
|>> Array.ofList
do! updateUserState (fun state -> { state with Formats = formats })
}
let parseValue format =
match format with
| "int" -> pint32 |>> Integer
| "str" ->
between
(skipString "'")
(skipString "'")
(manySatisfy (fun c -> c <> '\''))
|>> String
| _ -> failwith "Unexpected"
let parseValueByState =
parse {
let! state = getUserState
let format = state.Formats.[state.Index]
do! setUserState { state with Index = state.Index + 1}
return! parseValue format
}
let parseData =
sepBy
parseValueByState
(skipString ",")
let parse =
parseFormat
>>. skipString ":"
>>. parseData
[<EntryPoint>]
let main argv =
let result = runParserOnString parse State.Default "" "int,str,int:4,'hello',3"
printfn "%A" result
0

There seem to be several problems with the original code, so I took my liberty to rewrite it from scratch.
First, several library functions that may appear useful in other FParsec-related projects:
/// Simple Map
/// usage: let z = Map ["hello" => 1; "bye" => 2]
let (=>) x y = x,y
let makeMap x = new Map<_,_>(x)
/// A handy construct allowing NOT to write lengthy type definitions
/// and also avoid Value Restriction error
type Parser<'t> = Parser<'t, UserState>
/// A list combinator, inspired by FParsec's (>>=) combinator
let (<<+) (p1: Parser<'T list>) (p2: Parser<'T>) =
p1 >>= fun x -> p2 >>= fun y -> preturn (y::x)
/// Runs all parsers listed in the source list;
/// All but the trailing one are also combined with a separator
let allOfSepBy separator parsers : Parser<'T list> =
let rec fold state =
function
| [] -> pzero
| hd::[] -> state <<+ hd
| hd::tl -> fold (state <<+ (hd .>> separator)) tl
fold (preturn []) parsers
|>> List.rev // reverse the list since we appended to the top
Now, the main code. The basic idea is to run parsing in three steps:
Parse out the keys (which are plain ASCII strings)
Map these keys to actual Value parsers
Run these parsers in order
The rest seems to be commented within the code. :)
/// The resulting type
type Output =
| Integer of int
| String of string
/// tag to parser mappings
let mappings =
[
"int" => (pint32 |>> Integer)
"str" => (
manySatisfy (fun c -> c <> '\'')
|> between (skipChar ''') (skipChar ''')
|>> String
)
]
|> makeMap
let myProcess : Parser<Output list> =
let pKeys = // First, we parse out the keys
many1Satisfy isAsciiLower // Parse one key; keys are always ASCII strings
|> sepBy <| (skipChar ',') // many keys separated by comma
.>> (skipChar ':') // all this with trailing semicolon
let pValues = fun keys ->
keys // take the keys list
|> List.map // find the required Value parser
// (NO ERROR CHECK for bad keys)
(fun p -> Map.find p mappings)
|> allOfSepBy (skipChar ',') // they must run in order, comma-separated
pKeys >>= pValues
Run on string: int,int,str,int,str:4,42,'hello',3,'foobar'
Returned: [Integer 4; Integer 42; String "hello"; Integer 3; String "foobar"]

#bytebuster beat me to it but I still post my solution. The technique is similar to #bytebuster.
Thanks for an interesting question.
In compilers I believe the preferred technique is to parse the text into an AST and on that run a type-checker. For this example a potentially simpler technique would be that parsing the type definitions returns a set of parsers for the values. These parsers are then applied on the rest of the string.
open FParsec
type Value =
| Integer of int
| String of string
type ValueParser = Parser<Value, unit>
let parseIntValue : Parser<Value, unit> =
pint32 |>> Integer
let parseStringValue : Parser<Value, unit> =
between
(skipChar '\'')
(skipChar '\'')
(manySatisfy (fun c -> c <> '\''))
<?> "string"
|>> String
let parseValueParser : Parser<ValueParser, unit> =
choice
[
skipString "int" >>% parseIntValue
skipString "str" >>% parseStringValue
]
let parseValueParsers : Parser<ValueParser list, unit> =
sepBy1
parseValueParser
(skipChar ',')
// Runs a list of parsers 'ps' separated by 'sep' parser
let sepByList (ps : Parser<'T, unit> list) (sep : Parser<unit, unit>) : Parser<'T list, unit> =
let rec loop adjust ps =
match ps with
| [] -> preturn []
| h::t ->
adjust h >>= fun v -> loop (fun pp -> sep >>. pp) t >>= fun vs -> preturn (v::vs)
loop id ps
let parseLine : Parser<Value list, unit> =
parseValueParsers .>> skipChar ':' >>= (fun vps -> sepByList vps (skipChar ',')) .>> eof
[<EntryPoint>]
let main argv =
let s = "int,str,int:4,'hello',3"
let r = run parseLine s
printfn "%A" r
0
Parsing int,str,int:4,'hello',3 yields Success: [Integer 4; String "hello";Integer 3].
Parsing int,str,str:4,'hello',3 (incorrect) yields:
Failure:
Error in Ln: 1 Col: 23
int,str,str:4,'hello',3
^
Expecting: string

I rewrote #FuleSnabel's sepByList as follows to help me understand it better. Does this look right?
let sepByList (parsers : Parser<'T, unit> list) (sep : Parser<unit, unit>) : Parser<'T list, unit> =
let rec loop adjust parsers =
parse {
match parsers with
| [] -> return []
| parser :: tail ->
let! value = adjust parser
let! values = loop (fun parser -> sep >>. parser) tail
return value :: values
}
loop id parsers

Related

How to parse a recursive left syntax rule with FParsec?

I usually use FParsec for LL grammars, but sometimes it happens that in a whole grammar only one element requires left recursive parsing (so the grammar is no longer LL). Currently I have such a situation, I have a large LL grammar implemented with FParsec, but a small grammar element is bothering me because it obviously cannot be parsed correctly.
The syntax element in question is an access to an array index à la F#, e.g. myArray.[index] where myArray can be any expression and index can be any expression too. It turns out that my function calls use square brackets, not parentheses, and my identifiers can be qualified with dots.
An example of correct syntax for an expression is: std.fold[fn, f[myArray.[0]], std.tail[myArray]].
The .[] syntax element is obviously left recursive, but perhaps there is a trick that allows me to parse it anyway? My minimal code is as follows:
open FParsec
type Name = string list
type Expr =
(* foo, Example.Bar.fizz *)
| Variable of Name
(* 9, 17, -1 *)
| Integer of int
(* foo[3, 2], Std.sqrt[2] *)
| FunCall of Name * Expr list
(* (a + b), (a + (1 - c)) *)
| Parens of Expr
(* myArray.[0], table.[index - 1] *)
| ArrayAccess of Expr * Expr
(* a + b *)
| Addition of Expr * Expr
let opp =
new OperatorPrecedenceParser<Expr, _, _>()
let pExpr = opp.ExpressionParser
let pName =
let id =
identifier (IdentifierOptions(isAsciiIdStart = isAsciiLetter, isAsciiIdContinue = isAsciiLetter))
sepBy1 id (skipChar '.')
let pVariable = pName |>> Variable
let pInt = pint32 |>> Integer
let pFunCall =
pipe4
pName
(spaces >>. skipChar '[')
(sepBy (spaces >>. pExpr) (skipChar ','))
(spaces >>. skipChar ']')
(fun name _ args _ -> FunCall(name, args))
let pArrayAccess =
pipe5
pExpr
(spaces >>. skipChar '.')
(spaces >>. skipChar '[')
(spaces >>. pExpr)
(spaces >>. skipChar ']')
(fun expr _ _ index _ -> ArrayAccess(expr, index))
let pParens =
between (skipChar '(') (skipChar ')') (spaces >>. pExpr)
opp.TermParser <-
choice [ attempt pFunCall
pVariable
pArrayAccess
pInt
pParens ]
.>> spaces
let addInfixOperator str prec assoc mapping =
opp.AddOperator
<| InfixOperator(str, spaces, prec, assoc, (), (fun _ leftTerm rightTerm -> mapping leftTerm rightTerm))
addInfixOperator "+" 6 Associativity.Left (fun a b -> Addition(a, b))
let startParser = runParserOnString (pExpr .>> eof) () ""
printfn "%A" <| startParser "std.fold[fn, f[myArray.[0]], std.tail[myArray]]"
One way to do this is as follows: instead of making a list of parsing choices that also lists pArrayAccess like above, which will at some point cause an infinite loop, one can modify pExpr to parse the grammar element in question as an optional element following an expression:
let pExpr =
parse {
let! exp = opp.ExpressionParser
let pArrayAccess =
between (skipString ".[") (skipString "]") opp.ExpressionParser
match! opt pArrayAccess with
| None -> return exp
| Some index -> return ArrayAccess(exp, index)
}
After testing, it turns out that this works very well if the following two conditions are not met:
The contents of the square brackets must not contain access to another array ;
An array cannot be accessed a second time in succession (my2DArray.[x].[y]).
This restricts usage somewhat. How can I get away with this? Is there a way to do this or do I have to change the grammar?
Finally, a solution to this problem is quite simple: just expect a list of array access. If the list is empty, then return the initial expression, otherwise fold over all the array accesses and return the result. Here is the implementation:
let rec pExpr =
parse {
let! exp = opp.ExpressionParser
let pArrayAccess =
between (skipString ".[") (skipString "]") pExpr
match! many pArrayAccess with
| [] -> return exp
| xs -> return List.fold
(fun acc curr -> ArrayAccess(acc, curr)) exp xs
}
This way of doing things meets my needs, so I'd be happy with it, if anyone passes by and wants something more general and not applicable with the proposed solution, then I refer to #Martin Freedman comment, using createParserForwardedToRef().

fparsec - limit number of characters that a parser is applied to

I have a problem where during the parsing of a stream I get to point where the next N characters need to be parsed by applying a specfic parser multiple times (in sequence).
(stripped down toy) Example:
17<tag><anothertag><a42...
^
|- I'm here
Let's say the 17 indicates that the next N=17 characters make up tags, so I need to repetetively apply my "tagParser" but stop after 17 chars and not consume the rest even if it looks like a tag because that has a different meaning and will be parsed by another parser.
I cannot use many or many1 because that would eat the stream beyond those N characters.
Nor can I use parray because I do not know how many successful applications of that parser are there within the N characters.
I was looking into manyMinMaxSatisfy but could not figure out how to make use of it in this case.
Is there a way to cut N chars of a stream and feed them to some parser? Or is there a way to invoke many applications but up to N chars?
Thanks.
You can use getPosition to make sure you don't go past the specified number of characters. I threw this together (using F# 6) and it seems to work, although simpler/faster solutions may be possible:
let manyLimit nChars p =
parse {
let! startPos = getPosition
let rec loop values =
parse {
let! curPos = getPosition
let nRemain = (startPos.Index + nChars) - curPos.Index
if nRemain = 0 then
return values
elif nRemain > 0 then
let! value = p
return! loop (value :: values)
else
return! fail $"limit exceeded by {-nRemain} chars"
}
let! values = loop []
return values |> List.rev
}
Test code:
let ptag =
between
(skipChar '<')
(skipChar '>')
(manySatisfy (fun c -> c <> '>'))
let parser =
parse {
let! nChars = pint64
let! tags = manyLimit nChars ptag
let! rest = restOfLine true
return tags, rest
}
run parser "17<tag><anothertag><a42..."
|> printfn "%A"
Output is:
Success: (["tag"; "anothertag"], "<a42...")
Quite low-level parser, that operates on raw Reply objects. It reads count of chars, creates substring to feed to tags parser and consumes rest. There's should be an easier way, but I don't have much experience with FParsec
open FParsec
type Tag = Tag of string
let pTag = // parses tag string and constructs 'Tag' object
skipChar '<' >>. many1Satisfy isLetter .>> skipChar '>'
|>> Tag
let pCountPrefixedTags stream =
let count = pint32 stream // read chars count
if count.Status = Ok then
let count = count.Result
// take exactly 'count' chars
let tags = manyMinMaxSatisfy count count (fun _ -> true) stream
if tags.Status = Ok then
// parse substring with tags
let res = run (many1 pTag) tags.Result
match res with
| Success (res, _, _) -> Reply(res)
| Failure (_, error, _) -> Reply(ReplyStatus.Error, error.Messages)
else
Reply(tags.Status, tags.Error)
else
Reply(count.Status, count.Error)
let consumeStream =
many1Satisfy (fun _ -> true)
run (pCountPrefixedTags .>>. consumeStream) "17<tag><anothertag><notTag..."
|> printfn "%A" // Success: ([Tag "tag"; Tag "anothertag"], "<notTag...")
You also can do this without going down to stream level.
open FParsec
let ptag =
between
(skipChar '<')
(skipChar '>')
(manySatisfy (fun c -> c <> '>'))
let tagsFromChars (l: char[]) =
let s = new System.String(l)
match run (many ptag) s with
| Success(result, _, _) -> result
| Failure(errorMsg, _, _) -> []
let parser =
parse {
let! nChars = pint32
let! tags = parray nChars anyChar |>> tagsFromChars
let! rest = restOfLine true
return tags, rest
}
run parser "17<tag><anothertag><a42..."
|> printfn "%A"

Parsing the signature of a function - Error with the arrow type - FParsec + indentation

I had already asked a question about how to parse the arrow type, this is not a duplicate, but rather an adaptation with the indentation based syntax.
Indeed, I would like to be able to analyze a syntax close to that of the ML family languages. I also introduced the syntax of the type signature of a function in Haskell, so this:
myFunction :: atype
My parser works very well for all kinds of signature types, except the arrow type when it is "alone":
foo :: a // ok
foo :: [a] // ok
foo :: (a, a) // ok
foo :: [a -> a] // ok
foo :: (a -> a, a) // ok
foo :: a -> a // error
Same for the creation of functions (to make it simple, I just expected a number as a value):
foo: a = 0 // ok
foo: [a] = 0 // ok
foo: (a, a) = 0 // ok
foo: [a -> a] = 0 // ok
foo: (a -> a, a) = 0 // ok
foo: a -> a = 0 // error
Without the indentation, all these cases work a priori.
I tried a module to parse the indentation other than the FParsec wiki, just to try and evaluate a little. It comes from there, and here is the necessary and sufficient module code for the question:
module IndentParser =
type Indentation =
| Fail
| Any
| Greater of Position
| Exact of Position
| AtLeast of Position
| StartIndent of Position
with
member this.Position = match this with
| Any | Fail -> None
| Greater p -> Some p
| Exact p -> Some p
| AtLeast p -> Some p
| StartIndent p -> Some p
type IndentState<'T> = { Indent : Indentation; UserState : 'T }
type CharStream<'T> = FParsec.CharStream<IndentState<'T>>
type IndentParser<'T, 'UserState> = Parser<'T, IndentState<'UserState>>
let indentState u = {Indent = Any; UserState = u}
let runParser p u s = runParserOnString p (indentState u) "" s
let runParserOnFile p u path = runParserOnFile p (indentState u) path System.Text.Encoding.UTF8
let getIndentation : IndentParser<_,_> =
fun stream -> match stream.UserState with
| {Indent = i} -> Reply i
let getUserState : IndentParser<_,_> =
fun stream -> match stream.UserState with
| {UserState = u} -> Reply u
let putIndentation newi : IndentParser<unit, _> =
fun stream ->
stream.UserState <- {stream.UserState with Indent = newi}
Reply(Unchecked.defaultof<unit>)
let failf fmt = fail << sprintf fmt
let acceptable i (pos : Position) =
match i with
| Any _ -> true
| Fail -> false
| Greater bp -> bp.Column < pos.Column
| Exact ep -> ep.Column = pos.Column
| AtLeast ap -> ap.Column <= pos.Column
| StartIndent _ -> true
let tokeniser p = parse {
let! pos = getPosition
let! i = getIndentation
if acceptable i pos then return! p
else return! failf "incorrect indentation at %A" pos
}
let indented<'a,'u> i (p : Parser<'a,_>) : IndentParser<_, 'u> = parse {
do! putIndentation i
do! spaces
return! tokeniser p
}
/// Allows to check if the position of the parser currently being analyzed (`p`)
/// is on the same line as the defined position (`pos`).
let exact<'a,'u> pos p: IndentParser<'a, 'u> = indented (Exact pos) p
/// Allows to check if the position of the parser currently being analyzed (`p`)
/// is further away than the defined position (`pos`).
let greater<'a,'u> pos p: IndentParser<'a, 'u> = indented (Greater pos) p
/// Allows to check if the position of the parser currently being analyzed (`p`)
/// is on the same OR line further than the defined position (`pos`).
let atLeast<'a,'u> pos p: IndentParser<'a, 'u> = indented (AtLeast pos) p
/// Simply check if the parser (`p`) exists, regardless of its position in the text to be analyzed.
let any<'a,'u> pos p: IndentParser<'a, 'u> = indented Any p
let newline<'u> : IndentParser<unit, 'u> = many (skipAnyOf " \t" <?> "whitespace") >>. newline |>> ignore
let rec blockOf p = parse {
do! spaces
let! pos = getPosition
let! x = exact pos p
let! xs = attempt (exact pos <| blockOf p) <|> preturn []
return x::xs
}
Now, here is the code I'm trying to fix for the problem I encountered:
module Parser =
open IndentParser
type Identifier = string
type Type =
| Typename of Identifier
| Tuple of Type list
| List of Type
| Arrow of Type * Type
| Infered
type Expression =
| Let of Identifier * Type * int
| Signature of Identifier * Type
type Program = Program of Expression list
// Utils -----------------------------------------------------------------
let private ws = spaces
/// All symbols granted for the "opws" parser
let private allowedSymbols =
['!'; '#'; '#'; '$'; '%'; '+'; '&'; '*'; '('; ')'; '-'; '+'; '='; '?'; '/'; '>'; '<'; '|']
/// Parse an operator and white spaces around it: `ws >>. p .>> ws`
let inline private opws str =
ws >>.
(tokeniser (pstring str >>?
(nextCharSatisfiesNot
(isAnyOf (allowedSymbols # ['"'; '''])) <?> str))) .>> ws
let private identifier =
(many1Satisfy2L isLetter
(fun c -> isLetter c || isDigit c) "identifier")
// Types -----------------------------------------------------------------
let rec typename = parse {
let! name = ws >>. identifier
return Type.Typename name
}
and tuple_type = parse {
let! types = between (opws "(") (opws ")") (sepBy (ws >>. type') (opws ","))
return Type.Tuple types
}
and list_type = parse {
let! ty = between (opws "[") (opws "]") type'
return Type.List ty
}
and arrow_type =
chainr1 (typename <|> tuple_type <|> list_type) (opws "->" >>% fun t1 t2 -> Arrow(t1, t2))
and type' =
attempt arrow_type <|>
attempt typename <|>
attempt tuple_type <|>
attempt list_type
// Expressions -----------------------------------------------------------------
let rec private let' = parse {
let! pos = getPosition
let! id = exact pos identifier
do! greater pos (opws ":")
let! ty = greater pos type'
do! greater pos (opws "=")
let! value = greater pos pint32
return Expression.Let(id, ty, value)
}
and private signature = parse {
let! pos = getPosition
let! id = exact pos identifier
do! greater pos (opws "::")
let! ty = greater pos type'
return Expression.Signature(id, ty)
}
and private expression =
attempt let'
and private expressions = blockOf expression <?> "expressions"
let private document = ws >>. expressions .>> ws .>> eof |>> Program
let private testType = ws >>. type' .>> ws .>> eof
let rec parse code =
runParser document () code
|> printfn "%A"
open Parser
parse #"
foo :: a -> a
"
Here is the error message obtained:
There is no reference to indentation in the error message, that's what troubles also, because if I implement an identical parser, except for indentation parsing, it works.
Could you put me on the right way?
EDIT
Here is the "fixed" code (the use of the function signature parser was missing + removal of unnecessary attempt):
open FParsec
// module IndentParser
module Parser =
open IndentParser
type Identifier = string
type Type =
| Typename of Identifier
| Tuple of Type list
| List of Type
| Arrow of Type * Type
| Infered
type Expression =
| Let of Identifier * Type * int
| Signature of Identifier * Type
type Program = Program of Expression list
// Utils -----------------------------------------------------------------
let private ws = spaces
/// All symbols granted for the "opws" parser
let private allowedSymbols =
['!'; '#'; '#'; '$'; '%'; '+'; '&'; '*'; '('; ')'; '-'; '+'; '='; '?'; '/'; '>'; '<'; '|']
/// Parse an operator and white spaces around it: `ws >>. p .>> ws`
let inline private opws str =
ws >>.
(tokeniser (pstring str >>?
(nextCharSatisfiesNot
(isAnyOf (allowedSymbols # ['"'; '''])) <?> str))) .>> ws
let private identifier =
(many1Satisfy2L isLetter
(fun c -> isLetter c || isDigit c) "identifier")
// Types -----------------------------------------------------------------
let rec typename = parse {
let! name = ws >>. identifier
return Type.Typename name
}
and tuple_type = parse {
let! types = between (opws "(") (opws ")") (sepBy (ws >>. type') (opws ","))
return Type.Tuple types
}
and list_type = parse {
let! ty = between (opws "[") (opws "]") type'
return Type.List ty
}
and arrow_type =
chainr1 (typename <|> tuple_type <|> list_type) (opws "->" >>% fun t1 t2 -> Arrow(t1, t2))
and type' =
attempt arrow_type <|>
typename <|>
tuple_type <|>
list_type
// Expressions -----------------------------------------------------------------
let rec private let' = parse {
let! pos = getPosition
let! id = exact pos identifier
do! greater pos (opws ":")
let! ty = greater pos type'
do! greater pos (opws "=")
let! value = greater pos pint32
return Expression.Let(id, ty, value)
}
and private signature = parse {
let! pos = getPosition
let! id = exact pos identifier
do! greater pos (opws "::")
let! ty = greater pos type'
return Expression.Signature(id, ty)
}
and private expression =
attempt let' <|>
signature
and private expressions = blockOf expression <?> "expressions"
let private document = ws >>. expressions .>> ws .>> eof |>> Program
let private testType = ws >>. type' .>> ws .>> eof
let rec parse code =
runParser document () code
|> printfn "%A"
open Parser
System.Console.Clear()
parse #"
foo :: a -> a
"
So, here are the new error messages:
and
At the moment, your code is failing on the :: signature because you haven't actually used your signature parser anywhere. You have defined expression as attempt let', but I think you meant to write attempt signature <|> attempt let'. That is why your test is failing on the second colon of ::, because it's matching the single colon of a let' and then not expecting the second colon.
Also, I think your chaining multiple attempt combinators together like attempt a <|> attempt b <|> attempt c is going to cause you problems somewhere, and that you should remove the final attempt, e.g., attempt a <|> attempt b <|> c. If you use attempt in all the possible choices, you'll end up with a parser that can succeed by parsing nothing, which is often not what you intended.
Update: I think I've found the cause and the solution.
Summary: In your opws parser, replace the line ws >>. with ws >>?.
Explanation: In all the sepBy variants (and chainr1 is a sepBy variant), FParsec expects that the separator parser will either succeed, or will fail without consuming input. (If the separator fails after consuming input, FParsec considers the entire sepBy-family parser to have failed in its entirety.) But your opws parser will consume whitespace, then fail if it doesn't find a correct operator. So when your arrow_type parser parses the string a -> a followed by a newline, the arrow after the first a is correctly matched, then it sees the second a, and then it tries to find another arrow. Since what follows next is at least one whitespace character (newlines count as whitespace), the opws "->" parser ends up consuming some input before it fails. (It fails because after that whitespace is the end of the file, not another -> token). This makes the chainr1 combinator fail, so arrow_type fails and your a -> a parser ends up being parsed as a single type a. (At which point the arrow is now unexpected).
By using >>? in your definition of opws, you ensure that if the second part of the parser fails, it will backtrack to before it matched any whitespace. That ensures that the separator parser will fail without matching input and without advancing the parse position in the character stream. Therefore, the chainr1 parser succeeds after parsing a -> a and you get the expected results.

Parsing an ML-like syntax based on indentation, and everything considered to be an instruction/expression

NOTE: Not long ago, I had already asked a similar question. This is not a duplication, but the clarifications to be requested did not fall within the scope of the subject itself. I therefore allow myself to open another position dealing with the analysis of an ML-like syntax based on indentation, and considering everything as an instruction / expression.
For example:
"Hello" is an expression,
let foo = 2 + 1 is an instruction using an expression (2 + 1),
print foo is an instruction, ...
In short, a syntax and semantics that is quite modular and dynamic. Like F#, or OCaml.
To do this, I use F#, with the API (available on nuget) FParsec. The FParsec wiki provides an example of a syntax based on indentation, so I have taken it up again. The module in the code below used is IndentationParserWithoutBacktracking.
The example code to be parsed uses an elementary indentation, not mixing "literal" and "instructions/expressions":
loop i 1 10
loop k 1 10
print k
print i
print j
A simple code, and without context (but this is not important at the moment).
My implementation allows codes such as:
let foo = a + b
let foo =
let a = 9
let b = 1
a + b
let foo = 7
let foo =
loop i 1 10
print i
For example. (The loop and print are there just for the tests...)
The problem I have been having for a long week now, and that I can't solve, is the fact that the indentation module asks me every time an instruction is expected in a parser for a new line... Here is a screenshot:
This applies to all the examples mentioned above. I don't really understand the problem, and therefore don't know how to solve it.
Here is the code tested for this question, it meets the minimum and functional code criteria, however, FParsec must be used:
open FParsec
// This module come from 'https://github.com/stephan-tolksdorf/fparsec/wiki/Parsing-indentation-based-syntax-with-FParsec'
// I used the second module: 'IndentationParserWithoutBacktracking'
module IndentationParserWithoutBacktracking =
let tabStopDistance = 8
type LastParsedIndentation() =
[<DefaultValue>]
val mutable Value: int32
[<DefaultValue>]
val mutable EndIndex: int64
type UserState =
{Indentation: int
// We put LastParsedIndentation into the UserState so that we
// can conveniently use a separate instance for each stream.
// The members of the LastParsedIndentation instance will be mutated
// directly and hence won't be affected by any stream backtracking.
LastParsedIndentation: LastParsedIndentation}
with
static member Create() = {Indentation = -1
LastParsedIndentation = LastParsedIndentation(EndIndex = -1L)}
type CharStream = CharStream<UserState>
type Parser<'t> = Parser<'t, UserState>
// If this function is called at the same index in the stream
// where the function previously stopped, then the previously
// returned indentation will be returned again.
// This way we can avoid backtracking at the end of indented blocks.
let skipIndentation (stream: CharStream) =
let lastParsedIndentation = stream.UserState.LastParsedIndentation
if lastParsedIndentation.EndIndex = stream.Index then
lastParsedIndentation.Value
else
let mutable indentation = stream.SkipNewlineThenWhitespace(tabStopDistance, false)
lastParsedIndentation.EndIndex <- stream.Index
lastParsedIndentation.Value <- indentation
indentation
let indentedMany1 (p: Parser<'t>) label : Parser<'t list> =
fun stream ->
let oldIndentation = stream.UserState.Indentation
let indentation = skipIndentation stream
if indentation <= oldIndentation then
Reply(Error, expected (if indentation < 0 then "newline" else "indented " + label))
else
stream.UserState <- {stream.UserState with Indentation = indentation}
let results = ResizeArray()
let mutable stateTag = stream.StateTag
let mutable reply = p stream // parse the first element
let mutable newIndentation = 0
while reply.Status = Ok
&& (results.Add(reply.Result)
newIndentation <- skipIndentation stream
newIndentation = indentation)
do
stateTag <- stream.StateTag
reply <- p stream
if reply.Status = Ok
|| (stream.IsEndOfStream && results.Count > 0 && stream.StateTag = stateTag)
then
if newIndentation < indentation || stream.IsEndOfStream then
stream.UserState <- {stream.UserState with Indentation = oldIndentation}
Reply(List.ofSeq results)
else
Reply(Error, messageError "wrong indentation")
else // p failed
Reply(reply.Status, reply.Error)
open IndentationParserWithoutBacktracking
let isBlank = fun c -> c = ' ' || c = '\t'
let ws = spaces
let ws1 = skipMany1SatisfyL isBlank "whitespace"
let str s = pstring s .>> ws
let keyword str = pstring str >>? nextCharSatisfiesNot (fun c -> isLetter c || isDigit c) <?> str
// AST
type Identifier = Identifier of string
// A value is just a literal or a data name, called here "Variable"
type Value =
| Int of int | Float of float
| Bool of bool | String of string
| Char of char | Variable of Identifier
// All is an instruction, but there are some differences:
type Instr =
// Arithmetic
| Literal of Value | Infix of Instr * InfixOp * Instr
// Statements (instructions needing another instructions)
| Let of Identifier * Instr list
| Loop of Identifier * Instr * Instr * Instr list
// Other - the "print" function, from the link seen above
| Print of Identifier
and InfixOp =
// Arithmetic
| Sum | Sub | Mul | Div
// Logic
| And | Or | Equal | NotEqual | Greater | Smaller | GreaterEqual | SmallerEqual
// Literals
let numberFormat = NumberLiteralOptions.AllowMinusSign ||| NumberLiteralOptions.AllowFraction |||
NumberLiteralOptions.AllowHexadecimal ||| NumberLiteralOptions.AllowOctal |||
NumberLiteralOptions.AllowBinary
let literal_numeric =
numberLiteral numberFormat "number" |>> fun nl ->
if nl.IsInteger then Literal (Int(int nl.String))
else Literal (Float(float nl.String))
let literal_bool =
(choice [
(stringReturn "true" (Literal (Bool true)))
(stringReturn "false" (Literal (Bool false)))
]
.>> ws) <?> "boolean"
let literal_string =
(between (pstring "\"") (pstring "\"") (manyChars (satisfy (fun c -> c <> '"')))
|>> fun s -> Literal (String s)) <?> "string"
let literal_char =
(between (pstring "'") (pstring "'") (satisfy (fun c -> c <> '''))
|>> fun c -> Literal (Char c)) <?> "character"
let identifier =
(many1Satisfy2L isLetter (fun c -> isLetter c || isDigit c) "identifier"
|>> Identifier) <?> "identifier"
let betweenParentheses p =
(between (str "(") (str ")") p) <?> ""
let variable = identifier |>> fun id -> Literal (Variable id)
let literal = (attempt literal_numeric <|>
attempt literal_bool <|>
attempt literal_char <|>
attempt literal_string <|>
attempt variable)
// Instressions and statements
let pInstrs, pInstrimpl = createParserForwardedToRef()
// `ploop` is located here to force `pInstrs` to be of the type `Instr list`, `ploop` requesting an instression list.
let ploop =
pipe4
(keyword "loop" >>. ws1 >>. identifier)
(ws1 >>. literal)
(ws1 >>. literal)
(pInstrs)
(fun id min max stmts -> Loop(id, min, max, stmts))
// `singlepInstr` allows to use only one Instression, used just after.
let singlepInstr =
pInstrs |>> fun ex -> ex.Head
let term =
(ws >>. singlepInstr .>> ws) <|>
(betweenParentheses (ws >>. singlepInstr)) <|>
(ws >>. literal .>> ws) <|>
(betweenParentheses (ws >>. literal))
let infixOperator (p: OperatorPrecedenceParser<_, _, _>) op prec map =
p.AddOperator(InfixOperator(op, ws, prec, Associativity.Left, map))
let ops =
// Arithmetic
[ "+"; "-"; "*"; "/"; "%" ] #
// Logical
[ "&&"; "||"; "=="; "!="; ">"; "<"; ">="; "<=" ]
let opCorrespondance op =
match op with
// Arithmetic operators
| "+" -> Sum | "-" -> Sub
| "*" -> Mul | "/" -> Div
// Logical operators
| "&&" -> And | "||" -> Or
| "==" -> Equal | "!=" -> NotEqual
| ">" -> Greater | "<" -> Smaller
| ">=" -> GreaterEqual | "<=" -> SmallerEqual
| _ -> failwith ("Unknown operator: " + op)
let opParser = new OperatorPrecedenceParser<Instr, unit, UserState>()
for op in ops do
infixOperator opParser op 1 (fun x y -> Infix(x, opCorrespondance op, y))
opParser.TermParser <- term
// Statements
(*
- let:
let <identifier> = <instruction(s) / value>
- print:
print <identifier>
- loop:
loop <identifier> <literal> <literal> <indented statements>
*)
let plet =
pipe2
(keyword "let" >>. ws1 >>. identifier)
(ws >>. str "=" >>. ws >>. pInstrs)
(fun id exp -> Let(id, exp))
let print =
keyword "print" >>. ws1 >>. identifier
|>> Print
let instruction =
print <|> ploop <|> plet <|>
opParser.ExpressionParser <|>
literal
pInstrimpl := indentedMany1 instruction "instruction"
let document = pInstrs .>> spaces .>> eof
let test str =
match runParserOnString document (UserState.Create()) "" str with
| Success(result, _, _) -> printfn "%A" result
| Failure(errorMsg, _, _) -> printfn "%s" errorMsg
System.Console.Clear()
let code = test #"
let foo = a + b
"
I would like to understand first of all why it doesn't work, but also to be able to find a solution to my problem, and that this solution can be extended to the potential syntax additions of the parser.
Awaiting a salutary answer, thank you.
In order to understand why your parser doesn't work, you need to isolate the issues.
If I understand you correctly, you want your let parser to support either a single instruction on the same line or indented instructions on subsequent lines, e.g:
let x = instruction
let b =
instruction
instruction
If you can't get your existing implementation to work, I'd recommend going back to the implementation on the Wiki and trying to just add support for the let statement.
For example, I made the Wiki parser accept simple let statements with the following modifications:
type Statement = Loop of Identifier * int * int * Statement list
| Print of Identifier
| Let of Identifier * Statement list
let ws = skipManySatisfy isBlank
let str s = pstring s .>> ws
let statement, statementRef = createParserForwardedToRef()
let indentedStatements = indentedMany1 statement "statement"
let plet = keyword "let" >>. pipe2 (ws1 >>. identifier)
(ws >>. str "=" >>. ws
>>. (indentedStatements
<|> (statement |>> fun s -> [s])))
(fun id exp -> Let(id, exp))
statementRef := print <|> loop <|> plet
Note that in the modified version statement is now the parser forwarded to a ref cell, not indentedStatements.
Note also that ws is not implemented with spaces, like in your parser. This is important because spaces also consumes newlines, which would prevent the indentedMany1 from seeing the newline and properly calculating the indentation.
The reason your parser produced an "Expecting: newline" error is that indentedMany1 needs a newline at the beginning of the indented sequence in order to be able to calculate the indentation. You would have to modify the implementation of indentedMany1 if you wanted to support e.g. the following indentation pattern:
let x = instruction
instruction
instruction

fparsec - combinator "many" complains and... why not parse block comments like this?

This question, first off, is not a duplicate of my question.
Actually I have 3 questions.
In the code below, I try to create a parser which parses possibly nested multiline block comments. In contrast to the cited other question, I try to solve the problem in a straightforward way without any recursive functions (see the accepted answer to the other post).
The first problem I ran into was that skipManyTill parser of FParsec also consumes the end parser from the stream. So I created skipManyTillEx (Ex for 'excluding endp' ;) ). The skipManyTillEx seems to work - at least for the one test case I also added to the fsx script.
Yet in the code, shown, now I get the "The combinator 'many' was applied to a parser that succeeds without consuming..." error. My theory is, that the commentContent parser is the line which produces this error.
Here, my questions:
Is there any reason, why the approach I have chosen cannot work? The solution in 1, which, unfortunately does not seem to compile on my system uses a recursive low level parser for (nested) multiline comments.
Can anyone see a problem with the way I implemented skipManyTillEx? The way I implemented it differs to some degree from the way skipManyTill is implemented, mostly in the aspect of how to control the parsing flow. In original skipManyTill, the Reply<_> of p and endp is tracked, along with the stream.StateTag. In my implementation, in contrast I did not see the need to use stream.StateTag, solely relying on the Reply<_> status code. In case of an unsuccessful parse, skipManyTillEx backtracks to the streams initial state and reports an error. Could possibly the backtracking code cause the 'many' error? What would I have to do instead?
(and that is the main question) - Does anyone see, how to fix the parser such, that this "many ... " error message goes away?
Here is the code:
#r #"C:\hgprojects\fparsec\Build\VS11\bin\Debug\FParsecCS.dll"
#r #"C:\hgprojects\fparsec\Build\VS11\bin\Debug\FParsec.dll"
open FParsec
let testParser p input =
match run p input with
| Success(result, _, _) -> printfn "Success: %A" result
| Failure(errorMsg, _, _) -> printfn "Failure %s" errorMsg
input
let Show (s : string) : string =
printfn "%s" s
s
let test p i =
i |> Show |> testParser p |> ignore
////////////////////////////////////////////////////////////////////////////////////////////////
let skipManyTillEx (p : Parser<_,_>) (endp : Parser<_,_>) : Parser<unit,unit> =
fun stream ->
let tryParse (p : Parser<_,_>) (stm : CharStream<unit>) : bool =
let spre = stm.State
let reply = p stream
match reply.Status with
| ReplyStatus.Ok ->
stream.BacktrackTo spre
true
| _ ->
stream.BacktrackTo spre
false
let initialState = stream.State
let mutable preply = preturn () stream
let mutable looping = true
while (not (tryParse endp stream)) && looping do
preply <- p stream
match preply.Status with
| ReplyStatus.Ok -> ()
| _ -> looping <- false
match preply.Status with
| ReplyStatus.Ok -> preply
| _ ->
let myReply = Reply(Error, mergeErrors preply.Error (messageError "skipManyTillEx failed") )
stream.BacktrackTo initialState
myReply
let ublockComment, ublockCommentImpl = createParserForwardedToRef()
let bcopenTag = "/*"
let bccloseTag = "*/"
let pbcopen = pstring bcopenTag
let pbcclose = pstring bccloseTag
let ignoreCommentContent : Parser<unit,unit> = skipManyTillEx (skipAnyChar) (choice [pbcopen; pbcclose] |>> fun x -> ())
let ignoreSubComment : Parser<unit,unit> = ublockComment
let commentContent : Parser<unit,unit> = skipMany (choice [ignoreCommentContent; ignoreSubComment])
do ublockCommentImpl := between (pbcopen) (pbcclose) (commentContent) |>> fun c -> ()
do test (skipManyTillEx (pchar 'a' |>> fun c -> ()) (pchar 'b') >>. (restOfLine true)) "aaaabcccc"
// do test ublockComment "/**/"
//do test ublockComment "/* This is a comment \n With multiple lines. */"
do test ublockComment "/* Bla bla bla /* nested bla bla */ more outer bla bla */"
let's take a look at your questions...
1. Is there any reason, why the approach I have chosen cannot work?
Your approach can definitely work, you just have to weed out the bugs.
2. Can anyone see a problem with the way I implemented skipManyTillEx?
No. Your implementation looks OK. It's just the combination of skipMany and skipManyTillEx that's the problem.
let ignoreCommentContent : Parser<unit,unit> = skipManyTillEx (skipAnyChar) (choice [pbcopen; pbcclose] |>> fun x -> ())
let commentContent : Parser<unit,unit> = skipMany (choice [ignoreCommentContent; ignoreSubComment])
skipMany in commentContent runs until ignoreCommentContent and ignoreSubComment both fail. But ignoreCommentContent is implemented using your skipManyTillEx, which is implemented in a way that it could succeed without consuming input. This means that the outer skipMany would not be able to determine when to stop because if no input is consumed, it doesn't know whether a subsequent parser has failed or simply didn't consume anything.
This is why it's required that every parser below a many parser has to consume input. Your skipManyTillEx might not, that's what the error message is trying to tell you.
To fix it, you have to implement a skipMany1TillEx, that consumes at least one element itself.
3. Does anyone see, how to fix the parser such, that this "many ... " error message goes away?
How about this approach?
open FParsec
open System
/// Type abbreviation for parsers without user state.
type Parser<'a> = Parser<'a, Unit>
/// Skips C-style multiline comment /*...*/ with arbitrary nesting depth.
let (comment : Parser<_>), commentRef = createParserForwardedToRef ()
/// Skips any character not beginning of comment end marker */.
let skipCommentChar : Parser<_> =
notFollowedBy (skipString "*/") >>. skipAnyChar
/// Skips anx mix of nested comments or comment characters.
let commentContent : Parser<_> =
skipMany (choice [ comment; skipCommentChar ])
// Skips C-style multiline comment /*...*/ with arbitrary nesting depth.
do commentRef := between (skipString "/*") (skipString "*/") commentContent
/// Prints the strings p skipped over on the console.
let printSkipped p =
p |> withSkippedString (printfn "Skipped: \"%s\" Matched: \"%A\"")
[
"/*simple comment*/"
"/** special / * / case **/"
"/*testing /*multiple*/ /*nested*/ comments*/ not comment anymore"
"/*not closed properly/**/"
]
|> List.iter (fun s ->
printfn "Test Case: \"%s\"" s
run (printSkipped comment) s |> printfn "Result: %A\n"
)
printfn "Press any key to exit..."
Console.ReadKey true |> ignore
By using notFollowedBy to only skip characters that are not part of a comment end marker (*/), there is no need for nested many parsers.
Hope this helps :)
Finally found a way to fix the many problem.
Replaced my custom skipManyTillEx with another custom function I called skipManyTill1Ex.
skipManyTill1Ex, in contrast to the previous skipManyTillEx only succeeds if it parsed 1 or more p successfully.
I expected the test for the empty comment /**/ to fail for this version but it works.
...
let skipManyTill1Ex (p : Parser<_,_>) (endp : Parser<_,_>) : Parser<unit,unit> =
fun stream ->
let tryParse (p : Parser<_,_>) (stm : CharStream<unit>) : bool =
let spre = stm.State
let reply = p stm
match reply.Status with
| ReplyStatus.Ok ->
stream.BacktrackTo spre
true
| _ ->
stream.BacktrackTo spre
false
let initialState = stream.State
let mutable preply = preturn () stream
let mutable looping = true
let mutable matchCounter = 0
while (not (tryParse endp stream)) && looping do
preply <- p stream
match preply.Status with
| ReplyStatus.Ok ->
matchCounter <- matchCounter + 1
()
| _ -> looping <- false
match (preply.Status, matchCounter) with
| (ReplyStatus.Ok, c) when (c > 0) -> preply
| (_,_) ->
let myReply = Reply(Error, mergeErrors preply.Error (messageError "skipManyTill1Ex failed") )
stream.BacktrackTo initialState
myReply
let ublockComment, ublockCommentImpl = createParserForwardedToRef()
let bcopenTag = "/*"
let bccloseTag = "*/"
let pbcopen = pstring bcopenTag
let pbcclose = pstring bccloseTag
let ignoreCommentContent : Parser<unit,unit> = skipManyTill1Ex (skipAnyChar) (choice [pbcopen; pbcclose] |>> fun x -> ())
let ignoreSubComment : Parser<unit,unit> = ublockComment
let commentContent : Parser<unit,unit> = skipMany (choice [ignoreCommentContent; ignoreSubComment])
do ublockCommentImpl := between (pbcopen) (pbcclose) (commentContent) |>> fun c -> ()
do test (skipManyTillEx (pchar 'a' |>> fun c -> ()) (pchar 'b') >>. (restOfLine true)) "aaaabcccc"
do test ublockComment "/**/"
do test ublockComment "/* This is a comment \n With multiple lines. */"
do test ublockComment "/* Bla bla bla /* nested bla bla */ more outer bla bla */"

Resources