I am creating a compiler and am trying to extract line information from the parser. I wish to attach this to the AST node as metadata so that any error at a later point can be reported easily. I was successfully able to extract the line information in the Lexer by using this:
(* Raised by the lexer; carries a ready-to-print message. *)
exception LexErr of string

(* Declared for parser errors; carries a message string. *)
exception ParseErr of string

(* [error msg start finish] renders [msg] behind a "(line L: char A..B)"
   prefix, where L is the line number of [start] and A, B are the offsets
   of [start] and [finish] from the beginning of their respective lines. *)
let error msg start finish =
  let column pos = pos.pos_cnum - pos.pos_bol in
  Printf.sprintf "(line %d: char %d..%d): %s"
    start.pos_lnum (column start) (column finish) msg

(* [lex_error lexbuf] raises [LexErr] with the current lexeme of [lexbuf]
   as the message, located at the lexeme's start and end positions. *)
let lex_error lexbuf =
  let message =
    error (lexeme lexbuf) (lexeme_start_p lexbuf) (lexeme_end_p lexbuf)
  in
  raise (LexErr message)
This generates the line number, char number for Lexer perfectly after using it in this manner:
(* Lexer entry point [read]. *)
rule read = parse
(* Lexing tokens *)
(* Catch-all case: whatever reaches this rule is reported via [lex_error]. *)
| _ { lex_error lexbuf }
For parser, I am using this method:
(* Exceptions carrying a formatted error message. *)
exception LexErr of string
exception ParseErr of string
(* [error msg start finish] formats [msg] behind a "(line L: char A..B)"
   prefix computed from the two positions: L is [start.pos_lnum], A and B
   are the offsets of [start] and [finish] from their line starts. *)
let error msg start finish =
Printf.sprintf "(line %d: char %d..%d): %s" start.pos_lnum
(start.pos_cnum -start.pos_bol) (finish.pos_cnum - finish.pos_bol) msg
(* [parse_error msg nterm] raises [ParseErr] with [msg] located at the
   positions returned by [rhs_start_pos nterm] and [rhs_end_pos nterm].
   NOTE(review): these accessors are presumably the ones from the stdlib
   [Parsing] module (ocamlyacc runtime); the accompanying answer states that
   they cannot be used with Menhir parsers, where $startpos or
   $symbolstartpos should be used instead. *)
let parse_error msg nterm =
raise (ParseErr (error msg (rhs_start_pos nterm) (rhs_end_pos nterm)))
My parser looks like this:
/* Grammar entry point: [program] produces an Ast.stmt. */
%start <Ast.stmt> program
%%
/* A program is a single statement followed by end of input. */
program:
| s = stmt; EOF { s }
;
/* A statement is a declaration (TINT expr) or an assignment (expr EQUALS expr). */
stmt:
| TINT; e = expr { Decl(e) }
| e1 = expr; EQUALS; e2 = expr { Assign(e1,e2) }
/* Error production: calls parse_error with a message and the index 1.
   NOTE(review): this is the call the question reports as failing with
   "Index out of bounds". */
| error { parse_error "wsorword" 1 }
;
/* Expressions: integer constants, variables and binary operations. */
expr:
| i = INT; { Const i }
| x = ID { Var x }
| e1 = expr; b = binop; e2 = expr; { Binop(e1,b,e2) }
;
/* Maps each operator token to its AST constructor. */
binop:
| SUM { Sum }
| SUB { Sub }
| MUL { Mul }
| DIV { Div }
;
On running this, if a parser error is detected, it throws the invalid_argument "Index out of bounds" exception. This is raised on the raise (ParseErr (error msg (rhs_start_pos nterm) (rhs_end_pos nterm))) line. I would ultimately like to create an AST node which contains this parser line information as its metadata, but I can't get past this exception. I am not sure if my method of implementation is wrong or if I'm making some other mistake. Would love some help on this.
The function rhs_start_pos nth can not be used with menhir parsers; in this case, you should use $symbolstartpos or $startpos.
Similarly, e = expr is not valid with ocamlyacc.
Thus, I am not sure which parser generator you are trying to use.
Related
What I would like to do
I would like to correctly parse minus floating-point numbers.
How should I fix my code?
What is not working
When I try to interpret - 5 as -5.000000, it shows me this error.
Fatal error: exception Stdlib.Parsing.Parse_error
1c1
< error: parse error at char=0, near token '-'
---
> - 5 = -5.000000
My source code
calc_ast.ml
(* Abstract syntax tree of arithmetic expressions over floats. *)
type expr =
  | Num of float            (* numeric literal *)
  | Plus of expr * expr     (* left operand, right operand *)
  | Times of expr * expr
  | Div of expr * expr
  | Minus of expr * expr
;;
calc_lex.ml
{
(* Token constructors (NUM, PLUS, ...) come from the generated parser. *)
open Calc_parse
;;
}
(* A single decimal digit.  The number rule below relies on this name,
   which the rules previously referenced without any definition. *)
let digit = ['0'-'9']

(* [lex lexbuf] returns the next token, skipping blanks, tabs and newlines. *)
rule lex = parse
| [' ' '\t' '\n' ] { lex lexbuf }
(* Integer or decimal literal with an optional leading minus sign.  This
   single rule covers both the integer and the decimal form. *)
| '-'? digit+ ('.' digit* )? as s { NUM(float_of_string s) }
| '+' { PLUS }
| '-' { MINUS }
| '*' { TIMES }
| '/' { DIV }
| '(' { LPAREN }
| ')' { RPAREN }
| eof { EOF }
calc_parse.mly
%{
%}
/* NUM carries the float value of a numeric literal. */
%token <float> NUM
%token PLUS TIMES EOF MINUS DIV LPAREN RPAREN
/* Entry point: [program] produces a Calc_ast.expr. */
%start program
%type <Calc_ast.expr> program
%%
/* A program is one compound expression followed by end of input. */
program :
| compound_expr EOF { $1 }
/* Parentheses are accepted only here, around a whole expression. */
compound_expr :
| expr { $1 }
| LPAREN expr RPAREN { $2 }
/* Additive level, left-recursive.  MINUS appears only as a binary operator. */
expr :
| mul { $1 }
| expr PLUS mul { Calc_ast.Plus($1, $3) }
| expr MINUS mul { Calc_ast.Minus($1, $3) }
/* Multiplicative level; the right operand is always a bare NUM. */
mul :
| NUM { Calc_ast.Num $1 }
| mul TIMES NUM { Calc_ast.Times($1, Calc_ast.Num $3) }
| mul DIV NUM { Calc_ast.Div($1, Calc_ast.Num $3) }
%%
calc.ml
open Calc_parse
(* Textual name of a token; NUM also shows its float payload. *)
let string_of_token = function
  | NUM value -> Printf.sprintf "NUM(%f)" value
  | PLUS -> "PLUS"
  | TIMES -> "TIMES"
  | MINUS -> "MINUS"
  | DIV -> "DIV"
  | LPAREN -> "LPAREN"
  | RPAREN -> "RPAREN"
  | EOF -> "EOF"
;;
(* Write the name of [t] on its own line of stdout, then return [t] unchanged. *)
let print_token t =
  let name = string_of_token t in
  print_string name;
  print_char '\n';
  t
;;
(* Tokenise the string [s], printing every token up to and including EOF. *)
let lex_string s =
  let lexbuf = Lexing.from_string s in
  let rec drain () =
    match print_token (Calc_lex.lex lexbuf) with
    | EOF -> ()
    | _ -> drain ()
  in
  drain ()
;;
(* Parse the string [s] into an expression tree.
   On [Parsing.Parse_error], prints the start offset and text of the last
   lexeme read to stdout, then re-raises the exception. *)
let parse_string s =
  let lexbuf = Lexing.from_string s in
  match program Calc_lex.lex lexbuf with
  | ast -> ast
  | exception (Parsing.Parse_error as exn) ->
    (* Locate the lexeme on which the parser gave up. *)
    let first = Lexing.lexeme_start lexbuf in
    let len = Lexing.lexeme_end lexbuf - first in
    Printf.printf "error: parse error at char=%d, near token '%s'\n"
      first (String.sub s first len);
    raise exn
;;
(* Evaluate an expression tree to a float using float arithmetic. *)
let rec eval_expr = function
  | Calc_ast.Num c -> c
  | Calc_ast.Plus (lhs, rhs) -> apply ( +. ) lhs rhs
  | Calc_ast.Minus (lhs, rhs) -> apply ( -. ) lhs rhs
  | Calc_ast.Times (lhs, rhs) -> apply ( *. ) lhs rhs
  | Calc_ast.Div (lhs, rhs) -> apply ( /. ) lhs rhs

(* Evaluate both operands and combine them with [op]. *)
and apply op lhs rhs = op (eval_expr lhs) (eval_expr rhs)
;;
(* Parse [s] and evaluate the resulting expression tree. *)
let eval_string s = eval_expr (parse_string s)
;;
(* Evaluate [s] and print "<s> = <value>" on stdout. *)
let eval_print_string s =
  s |> eval_string |> Printf.printf "%s = %f\n" s
;;
(* Read one line from stdin, trim it, then evaluate and print it. *)
let eval_print_stdin () =
  stdin |> input_line |> String.trim |> eval_print_string
;;
(* Program entry point; [argv] is accepted but not used. *)
let main argv = eval_print_stdin ()
;;
(* Run [main] unless the code is loaded in the interactive toplevel. *)
let () = if not !Sys.interactive then main Sys.argv
;;
As indicated in the comments, it's almost never a good idea for the lexical analyser to try to recognise the - as part of a numeric literal:
Since the lexical token must be a contiguous string, - 5 will not match. Instead, you'll get two tokens. So you need to handle that in the parser anyway.
On the other hand, if you don't put a space after the -, then 3-4 will be analysed as the two tokens 3 and -4, which is also going to lead to a syntax error.
A simple solution is to add a term non-terminal that recognises the unary negation operator:
/* Multiplicative level.  Operands are terms, which already yield
   Calc_ast.expr values, so they are passed through without wrapping. */
mul :
| term { $1 }
| mul TIMES term { Calc_ast.Times($1, $3) }
| mul DIV term { Calc_ast.Div($1, $3) }
/* Atomic level: literals, unary negation and parenthesised expressions.
   Every alternative returns a Calc_ast.expr; unary minus is encoded as
   the subtraction 0.0 - term. */
term :
| NUM { Calc_ast.Num $1 }
| MINUS term { Calc_ast.Minus(Calc_ast.Num 0.0, $2) }
| LPAREN expr RPAREN { $2 }
In the above, I also moved the handling of parentheses from the bottom to the top of the hierarchy, in order to make 4*(5+3) possible. With that change, you will no longer require compound_expr.
I want to handle some ambiguities in dypgen. I found something in the manual and would like to know how I can use it.
In the manual point 5.2 "Pattern matching on Symbols" there is an example:
expr:
| expr OP<"+"> expr { $1 + $2 }
| expr OP<"*"> expr { $1 * $2 }
OP is matched with "+" or "*", as I understand. I also find there:
The patterns can be any Caml patterns (but without the keyword when).
For instance this is possible:
expr: expr<(Function([arg1;arg2],f_body)) as f> expr
{ some action }
So I tried to put some other expressions in there, but I don't understand what happens. If I put printf in there, it outputs the value of the matched string. But if I put (fun x -> printf x) in there, which seems to me to be the same as printf, dypgen complains about a syntax error and points to the end of the expression. If I put Printf.printf in there, it complains about Syntax error: operator expected. And if I put (fun x -> Printf.printf x) there, it says: Lexing failed with message: lexing: empty token
What do these different error-messages mean?
In the end I would like to look up something in a hashtable, if the value is in there, but I don't know, if it is possible this way. Is it or isn't it possible?
EDIT: A minimal example derived from the forest-example from the dypgen-demos.
The grammarfile forest_parser.dyp contains:
{
open Parse_tree
let dyp_merge = Dyp.keep_all
}
%start main
%layout [' ' '\t']
%%
main : np "." "\n" { $1 }
np:
| sg {Noun($1)}
| pl {Noun($1)}
sg: word <Word("sheep"|"fish")> {Sg($1)}
sg: word <Word("cat"|"dog")> {Sg($1)}
pl: word <Word("sheep"|"fish")> {Pl($1)}
pl: word <Word("cats"|"dogs")> {Pl($1)}
/* OR try:
sg: word <printf> {Sg($1)}
pl: word <printf> {Pl($1)}
*/
word:
| (['A'-'Z' 'a'-'z']+) {Word($1)}
The forest.ml has the following print_forest-function now:
(* Print every tree of [forest] on its own line in bracketed form
   (for example "N [Sg [fish ] ]"), followed by one blank line. *)
let print_forest forest =
  let rec show = function
    | Word text -> print_string text
    | Noun sub -> bracket "N [" sub
    | Sg sub -> bracket "Sg [" sub
    | Pl sub -> bracket "Pl [" sub
  (* Print [label], the subtree, then the closing " ]". *)
  and bracket label sub =
    print_string label;
    show sub;
    print_string " ]"
  in
  List.iter (fun tree -> show tree; print_newline ()) forest;
  print_newline ()
And the parser_tree.mli contains:
(* Parse tree built by the forest grammar. *)
type tree =
(* A single word, as matched by the [word] rule. *)
| Word of string
(* A noun phrase wrapping an [Sg] or [Pl] subtree. *)
| Noun of tree
(* Singular reading of a word. *)
| Sg of tree
(* Plural reading of a word. *)
| Pl of tree
And then you can determine which grammatical number (singular or plural) fish, sheep, cat(s) etc. have.
sheep or fish can be singular and plural. cats and dogs cannot.
fish.
N [Sg [fish ] ]
N [Pl [fish ] ]
I know nothing about Dypgen so I tried to figure it out.
Let's see what I found out.
In the parser.dyp file you can define the lexer and the parser or you can use an external lexer. Here's what I did :
My ast looks like this :
parse_prog.mli
(* One parsed function call. *)
type f =
(* A call recognised as printf; the payload is the string built by the grammar action. *)
| Print of string
(* Any other call: module path, function name, string argument. *)
| Function of string list * string * string
(* A program is the list of its calls. *)
type program = f list
prog_parser.dyp
{
open Parse_prog
(* let dyp_merge = Dyp.keep_all *)
let string_buf = Buffer.create 10
}
%start main
%relation pf<pr
%lexer
let newline = '\n'
let space = [' ' '\t' '\r']
let uident = ['A'-'Z']['a'-'z' 'A'-'Z' '0'-'9' '_']*
let lident = ['a'-'z']['a'-'z' 'A'-'Z' '0'-'9' '_']*
rule string = parse
| '"' { () }
| _ { Buffer.add_string string_buf (Dyp.lexeme lexbuf);
string lexbuf }
main lexer =
newline | space + -> { () }
"fun" -> ANONYMFUNCTION { () }
lident -> FUNCTION { Dyp.lexeme lexbuf }
uident -> MODULE { Dyp.lexeme lexbuf }
'"' -> STRING { Buffer.clear string_buf;
string lexbuf;
Buffer.contents string_buf }
%parser
main : function_calls eof
{ $1 }
function_calls:
|
{ [] }
| function_call ";" function_calls
{ $1 :: $3 }
/* One function call.  Each alternative is tagged with a priority: the
   plain printf form gets pr, every other form gets pf (the file declares
   the relation pf<pr).
   NOTE(review): in the anonymous-function printf alternative the action
   uses $6, i.e. the second lident rather than the trailing STRING ($8);
   this matches the sample output "You want to print : x". Confirm that
   this is intended. */
function_call:
| printf STRING
{ Print $2 } pr
| "(" ANONYMFUNCTION lident "->" printf lident ")" STRING
{ Print $6 } pf
| nested_modules "." FUNCTION STRING
{ Function ($1, $3, $4) } pf
| FUNCTION STRING
{ Function ([], $1, $2) } pf
| "(" ANONYMFUNCTION lident "->" FUNCTION lident ")" STRING
{ Function ([], $5, $8) } pf
printf:
| FUNCTION<"printf">
{ () }
| MODULE<"Printf"> "." FUNCTION<"printf">
{ () }
nested_modules:
| MODULE
{ [$1] }
| MODULE "." nested_modules
{ $1 :: $3 }
This file is the most important. As you can see, if I have a function call printf "Test", my grammar is ambiguous: it can be reduced to either Print "Test" or Function ([], "printf", "Test"). However, as I realized, I can give priorities to my rules, so the rule with the higher priority is the one chosen for the first parse. (Try uncommenting let dyp_merge = Dyp.keep_all and you'll see all the possible combinations.)
And in my main :
main.ml
open Parse_prog
(* Print every string of [sl] on the formatter [fmt], each followed by a
   dot; prints nothing for the empty list. *)
let print_stlist fmt sl =
  let print_component name = Format.fprintf fmt "%s." name in
  List.iter print_component sl
(* [print_program tl] prints every parse result contained in [tl].
   Each element of [tl] is a pair whose first component is a list of calls;
   the second component is ignored.  Calls are described on stdout, and a
   separator line is written to stderr after each parse result.

   Two defects of the previous version are fixed:
   - the format strings ended in "#." instead of the [Format] directive
     "@." (flush and newline), so no line break was produced;
   - the per-result helper ignored its argument and walked the whole of
     [tl] again, printing everything once per element of [tl]. *)
let print_program tl =
  let print_call = function
    | Function (ml, f, p) ->
      Format.printf "I can't do anything with %a%s(\"%s\")@."
        print_stlist ml f p
    | Print s -> Format.printf "You want to print : %s@." s
  in
  let print_parse (calls, _) =
    List.iter print_call calls;
    Format.eprintf "------------@."
  in
  List.iter print_parse tl
(* Path of the file to parse: the first command-line argument. *)
let input_file = Sys.argv.(1)
(* Lexer buffer over the input file.  The parser module is [Prog_parser],
   the module generated from prog_parser.dyp; the previous code referred to
   [Forest_parser] and [Parser_prog], neither of which matches that file.
   [open_in] is used unqualified because [Pervasives] is deprecated. *)
let lexbuf = Dyp.from_channel (Prog_parser.pp ()) (open_in input_file)
(* All parse results returned by the [main] entry point. *)
let result = Prog_parser.main lexbuf
let () = print_program result
And, for example, for the following file :
test
printf "first print";
Printf.printf "nested print";
Format.eprintf "nothing possible";
(fun x -> printf x) "Anonymous print";
If I execute ./myexec test I will get the following output:
You want to print : first print
You want to print : nested print
I can't do anything with Format.eprintf("nothing possible")
You want to print : x
------------
So, TL;DR, the manual example was just here to show you that you can play with your defined tokens (I never defined the token PRINT, just FUNCTION) and match on them to get new rules.
I hope it's clear, I learned a lot with your question ;-)
[EDIT] So, I changed the parser to match what you wanted to watch :
{
open Parse_prog
(* let dyp_merge = Dyp.keep_all *)
let string_buf = Buffer.create 10
}
%start main
%relation pf<pp
%lexer
let newline = '\n'
let space = [' ' '\t' '\r']
let uident = ['A'-'Z']['a'-'z' 'A'-'Z' '0'-'9' '_']*
let lident = ['a'-'z']['a'-'z' 'A'-'Z' '0'-'9' '_']*
rule string = parse
| '"' { () }
| _ { Buffer.add_string string_buf (Dyp.lexeme lexbuf);
string lexbuf }
main lexer =
newline | space + -> { () }
"fun" -> ANONYMFUNCTION { () }
lident -> FUNCTION { Dyp.lexeme lexbuf }
uident -> MODULE { Dyp.lexeme lexbuf }
'"' -> STRING { Buffer.clear string_buf;
string lexbuf;
Buffer.contents string_buf }
%parser
main : function_calls eof
{ $1 }
function_calls:
|
{ [] } pf
| function_call <Function((["Printf"] | []), "printf", st)> ";" function_calls
{ (Print st) :: $3 } pp
| function_call ";" function_calls
{ $1 :: $3 } pf
function_call:
| nested_modules "." FUNCTION STRING
{ Function ($1, $3, $4) }
| FUNCTION STRING
{ Function ([], $1, $2) }
| "(" ANONYMFUNCTION lident "->" FUNCTION lident ")" STRING
{ Function ([], $5, $8) }
nested_modules:
| MODULE
{ [$1] }
| MODULE "." nested_modules
{ $1 :: $3 }
Here, as you can see, I don't handle the fact that my function is print when I parse it but when I put it in my functions list. So, I match on the algebraic type that was built by my parser. I hope this example is ok for you ;-) (but be warned, this is extremely ambiguous ! :-D)
I'm currently trying Rascal to create a small DSL. I tried to modify the Pico example, but I'm currently stuck. The following code parses examples like a = 3, b = 7 begin declare x : natural, field real # cells blubb; x := 5.7 end perfectly, but the implode function fails with the error message "Cannot find a constructor for PROGRAM". I tried various constructor declarations, but none seemed to fit. Is there a way to see what the expected constructor looks like?
Syntax:
module BlaTest::Syntax
import Prelude;
lexical Identifier = [a-z][a-z0-9]* !>> [a-z0-9];
lexical NaturalConstant = [0-9]+;
lexical IntegerConstant = [\-+]? NaturalConstant;
lexical RealConstant = IntegerConstant "." NaturalConstant;
lexical StringConstant = "\"" ![\"]* "\"";
layout Layout = WhitespaceAndComment* !>> [\ \t\n\r%];
// One layout element: a whitespace character, a %...% block comment,
// or a %% comment running to the end of the line.
// The comment alternatives carry the @category="Comment" tag; Rascal tags
// start with '@', not '#'.
lexical WhitespaceAndComment
= [\ \t\n\r]
| @category="Comment" "%" ![%]+ "%"
| @category="Comment" "%%" ![\n]* $
;
start syntax Program
= program: {ExaOption ","}* exadomain "begin" Declarations decls {Statement ";"}* body "end"
;
syntax Domain = "domain" "{" ExaOption ", " exaoptions "}"
;
syntax ExaOption = Identifier id "=" Expression val
;
syntax Declarations
= "declare" {Declaration ","}* decls ";" ;
syntax Declaration
= variable_declaration: Identifier id ":" Type tp
| field_declaration: "field" Type tp "#" FieldLocation fieldLocation Identifier id
;
syntax FieldLocation
= exacell: "cells"
| exanode: "nodes"
;
syntax Type
= natural:"natural"
| exareal: "real"
| string :"string"
;
syntax Statement
= asgStat: Identifier var ":=" Expression val
| ifElseStat: "if" Expression cond "then" {Statement ";"}* thenPart "else" {Statement ";"}* elsePart "fi"
| whileStat: "while" Expression cond "do" {Statement ";"}* body "od"
;
syntax Expression
= id: Identifier name
| stringConstant: StringConstant stringconstant
| naturalConstant: NaturalConstant naturalconstant
| realConstant: RealConstant realconstant
| bracket "(" Expression e ")"
> left conc: Expression lhs "||" Expression rhs
> left ( add: Expression lhs "+" Expression rhs
| sub: Expression lhs "-" Expression rhs
)
;
public start[Program] program(str s) {
return parse(#start[Program], s);
}
public start[Program] program(str s, loc l) {
return parse(#start[Program], s, l);
}
Abstract:
module BlaTest::Abstract
public data TYPE = natural() | string() | exareal();
public data FIELDLOCATION = exacell() | exanode();
public alias ExaIdentifier = str;
public data PROGRAM = program(list[OPTION] exadomain, list[DECL] decls, list[STATEMENT] stats);
public data DOMAIN
= domain_declaration(list[OPTION] options)
;
public data OPTION
= exaoption(ExaIdentifier name, EXP exp)
;
public data DECL
= variable_declaration(ExaIdentifier name, TYPE tp)
| field_declaration(TYPE tp, FIELDLOCATION fieldlocation, ExaIdentifier name)
;
public data EXP
= id(ExaIdentifier name)
| naturalConstant(int iVal)
| stringConstant(str sVal)
| realConstant(real rVal)
| add(EXP left, EXP right)
| sub(EXP left, EXP right)
| conc(EXP left, EXP right)
;
public data STATEMENT
= asgStat(ExaIdentifier name, EXP exp)
| ifElseStat(EXP exp, list[STATEMENT] thenpart, list[STATEMENT] elsepart)
| whileStat(EXP exp, list[STATEMENT] body)
;
// Source-location annotations for the abstract syntax types.
// Rascal annotation declarations are written Type@name, not Type#name.
anno loc TYPE@location;
anno loc PROGRAM@location;
anno loc DECL@location;
anno loc EXP@location;
anno loc STATEMENT@location;
anno loc OPTION@location;
public alias Occurrence = tuple[loc location, ExaIdentifier name, STATEMENT stat];
Load:
module BlaTest::Load
import IO;
import Exception;
import Prelude;
import BlaTest::Syntax;
import BlaTest::Abstract;
import BlaTest::ControlFlow;
import BlaTest::Visualize;
public PROGRAM exaload(str txt) {
PROGRAM p;
try {
p = implode(#PROGRAM, parse(#Program, txt));
} catch ParseError(loc l): {
println("Parse error at line <l.begin.line>, column <l.begin.column>");
}
return p; // return will fail in case of error
}
public Program exaparse(str txt) {
Program p;
try {
p = parse(#Program, txt);
} catch ParseError(loc l): {
println("Parse error at line <l.begin.line>, column <l.begin.column>");
}
return p; // return will fail in case of error
}
Thanks a lot,
Chris
Unfortunately the current implode facility depends on a hidden semantic assumption, namely that the non-terminals in the syntax definition have the same name as the types in the data definitions. So if the non-terminal is called "Program", it should not be called "PROGRAM" but "Program" in the data definition.
We are looking for a smoother way of integrating concrete and abstract syntax trees, but for now please decapitalize your data names.
I'm trying to get familiar with Happy parser generator for Haskell. Currently, I have an example from the documentation but when I compile the program, I get an error.
This is the code:
{
module Main where
import Data.Char
}
%name calc
%tokentype { Token }
%error { parseError }
%token
let { TokenLet }
in { TokenIn }
int { TokenInt $$ }
var { TokenVar $$ }
'=' { TokenEq }
'+' { TokenPlus }
'-' { TokenMinus }
'*' { TokenTimes }
'/' { TokenDiv }
'(' { TokenOB }
')' { TokenCB }
%%
Exp : let var '=' Exp in Exp { \p -> $6 (($2,$4 p):p) }
| Exp1 { $1 }
Exp1 : Exp1 '+' Term { \p -> $1 p + $3 p }
| Exp1 '-' Term { \p -> $1 p - $3 p }
| Term { $1 }
Term : Term '*' Factor { \p -> $1 p * $3 p }
| Term '/' Factor { \p -> $1 p `div` $3 p }
| Factor { $1 }
Factor
: int { \p -> $1 }
| var { \p -> case lookup $1 p of
Nothing -> error "no var"
Just i -> i }
| '(' Exp ')' { $2 }
{
parseError :: [Token] -> a
parseError _ = error "Parse error"
data Token
= TokenLet
| TokenIn
| TokenInt Int
| TokenVar String
| TokenEq
| TokenPlus
| TokenMinus
| TokenTimes
| TokenDiv
| TokenOB
| TokenCB
deriving Show
lexer :: String -> [Token]
lexer [] = []
lexer (c:cs)
| isSpace c = lexer cs
| isAlpha c = lexVar (c:cs)
| isDigit c = lexNum (c:cs)
lexer ('=':cs) = TokenEq : lexer cs
lexer ('+':cs) = TokenPlus : lexer cs
lexer ('-':cs) = TokenMinus : lexer cs
lexer ('*':cs) = TokenTimes : lexer cs
lexer ('/':cs) = TokenDiv : lexer cs
lexer ('(':cs) = TokenOB : lexer cs
lexer (')':cs) = TokenCB : lexer cs
lexNum cs = TokenInt (read num) : lexer rest
where (num,rest) = span isDigit cs
lexVar cs =
case span isAlpha cs of
("let",rest) -> TokenLet : lexer rest
("in",rest) -> TokenIn : lexer rest
(var,rest) -> TokenVar var : lexer rest
main = getContents >>= print . calc . lexer
}
I'm getting this error:
[1 of 1] Compiling Main ( gr.hs, gr.o )
gr.hs:310:24:
No instance for (Show ([(String, Int)] -> Int))
arising from a use of `print'
Possible fix:
add an instance declaration for (Show ([(String, Int)] -> Int))
In the first argument of `(.)', namely `print'
In the second argument of `(>>=)', namely `print . calc . lexer'
In the expression: getContents >>= print . calc . lexer
Do you know why and how can I solve it?
If you examine the error message
No instance for (Show ([(String, Int)] -> Int))
arising from a use of `print'
it's clear that the problem is that you are trying to print a function. And indeed, the value produced by the parser function calc is supposed to be a function which takes a lookup table of variable bindings and gives back a result. See for example the rule for variables:
{ \p -> case lookup $1 p of
Nothing -> error "no var"
Just i -> i }
So in main, we need to pass in a list for the p argument, for example an empty list. (Or you could add some pre-defined global variables if you wanted). I've expanded the point-free code to a do block so it's easier to see what's going on:
main = do
input <- getContents
let fn = calc $ lexer input
print $ fn [] -- or e.g. [("foo", 42)] if you wanted it pre-defined
Now it works:
$ happy Calc.y
$ runghc Calc.hs <<< "let x = 1337 in x * 2"
2674
I try to make a frontend for a kind of programs... there are 2 particularities:
1) When we meet a string beginning with =, I want to read the rest of the string as a formula instead of a string value. For instance, "123", "TRUE", "TRUE+123" are considered having string as type, while "=123", "=TRUE", "=TRUE+123" are considered having Syntax.formula as type. By the way,
(* in syntax.ml *)
and expression =
| E_formula of formula
| E_string of string
...
and formula =
| F_int of int
| F_bool of bool
| F_Plus of formula * formula
| F_RC of rc
and rc =
| RC of int * int
2) Inside the formula, some strings are interpreted differently from outside. For instance, in a command R4C5 := 4, R4C5, which is actually a variable, is considered an identifier, while in "=123+R4C5", which is to be translated to a formula, R4C5 is translated as RC (4,5): rc.
So I don't know how to realize this with 1 or 2 lexers, and 1 or 2 parsers.
At the moment, I try to realize all in 1 lexer and 1 parser. Here is part of code, which doesn't work, it still considers R4C5 as identifier, instead of rc:
(* in lexer.mll *)
let begin_formula = double_quote "="
let end_formula = double_quote
let STRING = double_quote ([^ "=" ])* double_quote
rule token = parse
...
| begin_formula { BEGIN_FORMULA }
| 'R' { R }
| 'C' { C }
| end_formula { END_FORMULA }
| lex_identifier as li
{ try Hashtbl.find keyword_table (lowercase li)
with Not_found -> IDENTIFIER li }
| STRING as s { STRING s }
...
(* in parser.mly *)
expression:
| BEGIN_FORMULA f = formula END_FORMULA { E_formula f }
| s = STRING { E_string s }
...
formula:
| i = INTEGER { F_int i }
| b = BOOL { F_bool b }
| f0 = formula PLUS f1 = formula { F_Plus (f0, f1) }
| rc { F_RC $1 }
rc:
| R i0 = INTEGER C i1 = INTEGER { RC (i0, i1) }
Could anyone help?
New idea: I am thinking of sticking with 1 lexer + 1 parser, and creating an entrypoint for formulas in the lexer, as we normally do for comments... here are some updates in lexer.mll and parser.mly:
(* in lexer.mll *)
rule token = parse
...
| begin_formula { formula lexbuf }
...
| INTEGER as i { INTEGER (int_of_string i) }
| '+' { PLUS }
...
and formula = parse
| end_formula { token lexbuf }
| INTEGER as i { INTEGER_F (int_of_string i) }
| 'R' { R }
| 'C' { C }
| '+' { PLUS_F }
| _ { raise (Lexing_error ("unknown in formula")) }
(* in parser.mly *)
expression:
| formula { E_formula f }
...
formula:
| i = INTEGER_F { F_int i }
| f0 = formula PLUS_F f1 = formula { F_Plus (f0, f1) }
...
I have done some tests, for instance to parse "=R4", the problem is that it can parse well R, but it considers 4 as INTEGER instead of INTEGER_F, it seems that formula lexbuf needs to be added from time to time in the body of formula entrypoint (Though I don't understand why parsing in the body of token entrypoint works without always mentioning token lexbuf). I have tried several possibilities: | 'R' { R; formula lexbuf }, | 'R' { formula lexbuf; R }, etc. but it didn't work... ... Could anyone help?
I think the simplest choice would be to have two different lexers and two different parsers; call the lexer&parser for formulas from inside the global parser. After the fact you can see how much is shared between the two grammars, and factorize things when possible.