I am trying to make a frontend for a kind of program, and there are two particularities:
1) When we meet a string beginning with =, I want to read the rest of the string as a formula instead of a string value. For instance, "123", "TRUE" and "TRUE+123" are considered to have type string, while "=123", "=TRUE" and "=TRUE+123" are considered to have type Syntax.formula. By the way,
(* in syntax.ml *)
and expression =
| E_formula of formula
| E_string of string
...
and formula =
| F_int of int
| F_bool of bool
| F_Plus of formula * formula
| F_RC of rc
and rc =
| RC of int * int
2) Inside a formula, some strings are interpreted differently than outside. For instance, in the command R4C5 := 4, R4C5 (which is actually a variable) is treated as an identifier, while in "=123+R4C5", which should be translated to a formula, R4C5 is translated to RC (4, 5) : rc.
So I don't know how to realize this with one or two lexers, and one or two parsers.
At the moment, I am trying to do everything in one lexer and one parser. Here is part of the code, which doesn't work: it still treats R4C5 as an identifier instead of an rc:
(* in lexer.mll *)
let begin_formula = double_quote "="
let end_formula = double_quote
let string_lit = double_quote [^ '=' '"']* double_quote  (* quoted string containing neither '=' nor a double quote *)
rule token = parse
...
| begin_formula { BEGIN_FORMULA }
| 'R' { R }
| 'C' { C }
| end_formula { END_FORMULA }
| lex_identifier as li
{ try Hashtbl.find keyword_table (lowercase li)
with Not_found -> IDENTIFIER li }
| string_lit as s { STRING s }
...
(* in parser.mly *)
expression:
| BEGIN_FORMULA f = formula END_FORMULA { E_formula f }
| s = STRING { E_string s }
...
formula:
| i = INTEGER { F_int i }
| b = BOOL { F_bool b }
| f0 = formula PLUS f1 = formula { F_Plus (f0, f1) }
| rc { F_RC $1 }
rc:
| R i0 = INTEGER C i1 = INTEGER { RC (i0, i1) }
Could anyone help?
New idea: I am thinking of sticking with one lexer + one parser, and creating an entrypoint for formulas in the lexer, as is normally done for comments... here are some updates in lexer.mll and parser.mly:
(* in lexer.mll *)
rule token = parse
...
| begin_formula { formula lexbuf }
...
| integer as i { INTEGER (int_of_string i) }
| '+' { PLUS }
...
and formula = parse
| end_formula { token lexbuf }
| integer as i { INTEGER_F (int_of_string i) }
| 'R' { R }
| 'C' { C }
| '+' { PLUS_F }
| _ { raise (Lexing_error ("unknown in formula")) }
(* in parser.mly *)
expression:
| f = formula { E_formula f }
...
formula:
| i = INTEGER_F { F_int i }
| f0 = formula PLUS_F f1 = formula { F_Plus (f0, f1) }
...
I have done some tests. For instance, when parsing "=R4", it lexes R correctly, but it lexes 4 as INTEGER instead of INTEGER_F; it seems that formula lexbuf needs to be re-entered from time to time in the body of the formula entrypoint (though I don't understand why lexing in the body of the token entrypoint works without always mentioning token lexbuf). I have tried several possibilities, such as | 'R' { R; formula lexbuf } and | 'R' { formula lexbuf; R }, but they didn't work... Could anyone help?
I think the simplest choice would be to have two different lexers and two different parsers, and to call the formula lexer and parser from inside the global parser. Afterwards you can see how much is shared between the two grammars, and factor things out where possible.
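For example, the dispatch could look like this (a minimal sketch, not from the original post; the module and entry-point names Formula_parser.main and Formula_lexer.token are hypothetical, and the main lexer is assumed to hand over the cell contents without the surrounding quotes):
(* hypothetical helper, callable from a semantic action of the global parser *)
let parse_cell (s : string) : Syntax.expression =
  if String.length s > 0 && s.[0] = '=' then
    (* drop the leading '=' and parse the rest with the dedicated formula parser *)
    let body = String.sub s 1 (String.length s - 1) in
    Syntax.E_formula (Formula_parser.main Formula_lexer.token (Lexing.from_string body))
  else
    Syntax.E_string s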
What I would like to do
I would like to correctly parse negative floating-point numbers.
How should I fix my code?
What is not working
When I try to interpret - 5 as -5.000000, I get this error:
Fatal error: exception Stdlib.Parsing.Parse_error
1c1
< error: parse error at char=0, near token '-'
---
> - 5 = -5.000000
My source code
calc_ast.ml
(* abstract syntax tree *)
type expr =
Num of float
| Plus of expr * expr
| Times of expr * expr
| Div of expr * expr
| Minus of expr * expr
;;
calc_lex.mll
{
open Calc_parse
;;
}
let digit = ['0' - '9']
rule lex = parse
| [' ' '\t' '\n' ] { lex lexbuf }
| '-'? ['0' - '9']+ as s { NUM(float_of_string s) }
| '-'? ['0' - '9']+ ('.' digit*)? as s { NUM(float_of_string s) }
| '+' { PLUS }
| '-' { MINUS }
| '*' { TIMES }
| '/' { DIV }
| '(' { LPAREN }
| ')' { RPAREN }
| eof { EOF }
calc_parse.mly
%{
%}
%token <float> NUM
%token PLUS TIMES EOF MINUS DIV LPAREN RPAREN
%start program
%type <Calc_ast.expr> program
%%
program :
| compound_expr EOF { $1 }
compound_expr :
| expr { $1 }
| LPAREN expr RPAREN { $2 }
expr :
| mul { $1 }
| expr PLUS mul { Calc_ast.Plus($1, $3) }
| expr MINUS mul { Calc_ast.Minus($1, $3) }
mul :
| NUM { Calc_ast.Num $1 }
| mul TIMES NUM { Calc_ast.Times($1, Calc_ast.Num $3) }
| mul DIV NUM { Calc_ast.Div($1, Calc_ast.Num $3) }
%%
calc.ml
open Calc_parse
(* token -> string *)
let string_of_token t =
match t with
NUM(s) -> Printf.sprintf "NUM(%f)" s
| PLUS -> "PLUS"
| TIMES -> "TIMES"
| MINUS -> "MINUS"
| DIV -> "DIV"
| LPAREN -> "LPAREN"
| RPAREN -> "RPAREN"
| EOF -> "EOF"
;;
(* print token t and return it *)
let print_token t =
Printf.printf "%s\n" (string_of_token t);
t
;;
(* apply lexer to string s *)
let lex_string s =
let rec loop b =
match print_token (Calc_lex.lex b) with
EOF -> ()
| _ -> loop b
in
loop (Lexing.from_string s)
;;
(* apply parser to string s;
show some info when a parse error happens *)
let parse_string s =
let b = Lexing.from_string s in
try
program Calc_lex.lex b (* main work *)
with Parsing.Parse_error as exn ->
(* handle parse error *)
let c0 = Lexing.lexeme_start b in
let c1 = Lexing.lexeme_end b in
Printf.fprintf stdout
"error: parse error at char=%d, near token '%s'\n"
c0 (String.sub s c0 (c1 - c0));
raise exn
;;
(* evaluate expression (AST tree) *)
let rec eval_expr e =
match e with
Calc_ast.Num(c) -> c
| Calc_ast.Plus(e0, e1)
-> (eval_expr e0) +. (eval_expr e1)
| Calc_ast.Minus(e0, e1)
-> (eval_expr e0) -. (eval_expr e1)
| Calc_ast.Times(e0, e1)
-> (eval_expr e0) *. (eval_expr e1)
| Calc_ast.Div(e0, e1)
-> (eval_expr e0) /. (eval_expr e1)
;;
(* evaluate string *)
let eval_string s =
let e = parse_string s in
eval_expr e
;;
(* evaluate string and print it *)
let eval_print_string s =
let y = eval_string s in
Printf.printf "%s = %f\n" s y
;;
let eval_print_stdin () =
let ch = stdin in
let s = input_line ch in
eval_print_string (String.trim s)
;;
let main argv =
eval_print_stdin ()
;;
if not !Sys.interactive then
main Sys.argv
;;
As indicated in the comments, it's almost never a good idea for the lexical analyser to try to recognise the - as part of a numeric literal:
Since the lexical token must be a contiguous string, - 5 will not match. Instead, you'll get two tokens. So you need to handle that in the parser anyway.
On the other hand, if you don't put a space after the -, then 3-4 will be analysed as the two tokens 3 and -4, which is also going to lead to a syntax error.
A simple solution is to add term to recognise the unary negation operator:
mul :
| term { $1 }
| mul TIMES term { Calc_ast.Times($1, $3) }
| mul DIV term { Calc_ast.Div($1, $3) }
term :
| NUM { Calc_ast.Num $1 }
| MINUS term { Calc_ast.Minus(Calc_ast.Num 0.0, $2) }
| LPAREN expr RPAREN { $2 }
In the above, term produces an expression (unary minus is represented as Calc_ast.Minus(Calc_ast.Num 0.0, e)), and the handling of parentheses has moved from compound_expr down into term, the innermost level of the hierarchy, so that 4*(5+3) parses correctly. With that change, you will no longer require compound_expr.
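The corresponding lexer change is not shown above; a sketch of what it could look like, with the optional leading '-' dropped from the number rules so that '-' always comes back as MINUS and unary negation is handled by term:
{
open Calc_parse
}
(* sketch: numeric literals are unsigned at the lexical level *)
rule lex = parse
| [' ' '\t' '\n' ] { lex lexbuf }
| ['0'-'9']+ ('.' ['0'-'9']*)? as s { NUM(float_of_string s) }
| '+' { PLUS }
| '-' { MINUS }
| '*' { TIMES }
| '/' { DIV }
| '(' { LPAREN }
| ')' { RPAREN }
| eof { EOF }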
Hello!
I'm trying to write a lexer and parser for a computer algebra system.
When I compile the code with my makefile, I get an error about the types of some values.
Here is the code of function.ml :
(****************************************
* Type definitions
****************************************)
type operator = Plus | Minus | Times | Div | Power;;
type var = string;; (* e.g. "x", "y", etc. *)
type power = var * float;; (* var, raised_to, e.g. ("x", 3.) -> x^3. *)
type monomial = float * power list;; (* coefficient, list of power terms, e.g. (2., [("x", 1.); ("y", 3.)]) -> 2xy^3 *)
type polynomial = monomial list;; (* sum of monomials, e.g. [(2., [("x", 2.); ("y", 3.)]); (-1., [])] -> 2x^2y^3 - 1 *)
type frac = polynomial * polynomial;; (* numerator, denominator *)
type exp = frac * frac;; (* base, exponent *)
type term = Poly of polynomial
| Frac of frac
| Exp of exp;;
type expr = Leaf of term
| Node of operator * expr list;;
type eqn = expr * expr;;
(****************************************
* Lexer/Parser definitions
****************************************)
type token = PLUS | MINUS | TIMES | DIV | POWER | LPAREN | RPAREN | EQUALS
| FLOAT of float
| VAR of var
| EOF;;
Here is the code of lexer.mll :
{
open Function
}
let numeric = ['0' - '9']
let letter = ['a' - 'z' 'A' - 'Z']
rule main = parse
| [' ' '\t' '\n'] { main lexbuf } (* skip over whitespace *)
| "+" { PLUS }
| "-" { MINUS }
| "*" { TIMES }
| "/" { DIV }
| "^" { POWER }
| "(" { LPAREN }
| ")" { RPAREN }
| "=" { EQUALS }
| ((numeric*) '.' (numeric+)) as n
{ FLOAT (float_of_string n) }
| (numeric+) as n
{ FLOAT (float_of_string n) }
| (letter numeric*) as n
{ VAR n }
| eof { EOF }
{
let lexer_main = main;;
let token_iterator_of_string s =
  let lbuf = Lexing.from_string s in
  fun () -> lexer_main lbuf;;
let token_list_of_string s =
  let lbuf = Lexing.from_string s in
  let rec token_list_aux () =
    let token = lexer_main lbuf in
    if token = EOF then
      []
    else
      token :: token_list_aux ()
  in token_list_aux ();;
}
Here is the code of parser.mly :
%{
open Function
%}
%token PLUS MINUS TIMES DIV POWER LPAREN RPAREN EQUALS EOF
%token <float> FLOAT
%token <Function.var> VAR
%start yacc_eqn
%start yacc_expr
%type <Function.eqn> yacc_eqn
%type <Function.expr> yacc_expr
%%
yacc_eqn:
exp EQUALS exp EOF { ($1, $3) }
yacc_expr:
exp EOF { $1 }
exp:
op1 { $1 }
op1:
op2 { $1 }
| op1 PLUS op2 { Node(Plus, [$1; $3]) }
| op1 MINUS op2 { Node(Minus, [$1; $3]) }
op2:
op3 { $1 }
| op2 TIMES op3 { Node(Times, [$1; $3]) }
| op2 DIV op3 { Node(Div, [$1; $3]) }
op3:
op4 { $1 }
| op3 op4 { Node(Times, [$1; $2]) }
op4:
leaf { $1 }
| op4 POWER leaf { Node(Power, [$1; $3]) }
leaf:
atom { $1 }
| LPAREN exp RPAREN { $2 }
atom:
VAR { Leaf(Poly(poly_of_var $1)) }
| FLOAT { Leaf(Poly(poly_of_float $1)) }
%%
let eqn_of_string s = yacc_eqn Lexer.lexer_main (Lexing.from_string s);;
let expr_of_string s = yacc_expr Lexer.lexer_main (Lexing.from_string s);;
let parse_eqn = eqn_of_string;;
let parse_expr = expr_of_string;;
The problem is that in the parser.mli generated by the makefile, the types of yacc_eqn and yacc_expr are:
val yacc_eqn :
(Lexing.lexbuf -> token) -> Lexing.lexbuf -> Function.eqn
val yacc_expr :
(Lexing.lexbuf -> token) -> Lexing.lexbuf -> Function.expr
And I get the following error:
The implementation parser.ml does not match the interface parser.cmi:
Values do not match:
val yacc_eqn :
(Lexing.lexbuf -> Function.token) -> Lexing.lexbuf -> Function.eqn
is not included in
val yacc_eqn :
(Lexing.lexbuf -> token) -> Lexing.lexbuf -> Function.eqn
I think the solution might be something like a cast, but I have strictly no idea how to do that... Can anyone help?
Thanks in advance!
I am creating a compiler and am trying to extract line information from the parser. I wish to attach this to the AST node as metadata so that any error at a later point can be reported easily. I was successfully able to extract the line information in the Lexer by using this:
open Lexing (* for lexeme, lexeme_start_p, lexeme_end_p and the pos_* fields *)

exception LexErr of string
exception ParseErr of string

let error msg start finish =
  Printf.sprintf "(line %d: char %d..%d): %s" start.pos_lnum
    (start.pos_cnum - start.pos_bol) (finish.pos_cnum - finish.pos_bol) msg

let lex_error lexbuf =
  raise (LexErr (error (lexeme lexbuf) (lexeme_start_p lexbuf) (lexeme_end_p lexbuf)))
This generates the line and character numbers for the lexer perfectly when used in this manner:
rule read = parse
(* Lexing tokens *)
| _ { lex_error lexbuf }
For the parser, I am using this method:
open Lexing   (* for the pos_* fields *)
open Parsing  (* for rhs_start_pos and rhs_end_pos *)

exception LexErr of string
exception ParseErr of string

let error msg start finish =
  Printf.sprintf "(line %d: char %d..%d): %s" start.pos_lnum
    (start.pos_cnum - start.pos_bol) (finish.pos_cnum - finish.pos_bol) msg

let parse_error msg nterm =
  raise (ParseErr (error msg (rhs_start_pos nterm) (rhs_end_pos nterm)))
My parser looks like this:
%start <Ast.stmt> program
%%
program:
| s = stmt; EOF { s }
;
stmt:
| TINT; e = expr { Decl(e) }
| e1 = expr; EQUALS; e2 = expr { Assign(e1,e2) }
| error { parse_error "wsorword" 1 }
;
expr:
| i = INT; { Const i }
| x = ID { Var x }
| e1 = expr; b = binop; e2 = expr; { Binop(e1,b,e2) }
;
binop:
| SUM { Sum }
| SUB { Sub }
| MUL { Mul }
| DIV { Div }
;
On running this, if a parser error is detected, it throws the Invalid_argument "index out of bounds" exception, raised at the raise (ParseErr (error msg (rhs_start_pos nterm) (rhs_end_pos nterm))) line. I would ultimately like to create an AST node that carries this parser line information as its metadata, but I can't get past this exception. I am not sure if my method of implementation is wrong or if I'm making some other mistake. Would love some help on this.
The function rhs_start_pos nth cannot be used with menhir parsers; in this case, you should use $symbolstartpos or $startpos.
Similarly, e = expr is not valid with ocamlyacc.
Thus, I am not sure which parser generator you are trying to use.
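If the grammar really is a menhir grammar (which the e = expr bindings suggest), the error production can use menhir's built-in position keywords directly. This is only a sketch, assuming the ParseErr and error helpers above are in scope via the parser's %{ ... %} header; $startpos and $endpos are Lexing.position values, which is exactly what error expects:
stmt:
  | TINT; e = expr               { Decl(e) }
  | e1 = expr; EQUALS; e2 = expr { Assign(e1,e2) }
  | error { raise (ParseErr (error "syntax error" $startpos $endpos)) }
  ;
The same $startpos/$endpos values can also be stored in the AST constructors of the non-error productions if you want location metadata on every node.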
In FSYACC it is common to have terminals that result in tuples. However, for convenience I want to use a record type instead. For example, if I have the following in my Abstract Syntax Tree (AbstractSyntaxTree.fsl):
namespace FS
module AbstractSyntaxTree =
type B = { x : int; y : int }
type Either =
| Record of B
| Tuple of int * string
type A =
| Int of int
| String of string
| IntTuple of Either
I'm not clear on the correct syntax in FSYACC (parser.fsy), because if I use:
%start a
%token <string> STRING
%token <System.Int32> INT
%token ATOMTOKEN TUPLETOKEN EOF
%type < A > a
%%
a:
| atomS { $1 }
| atomI { $1 }
| either { $1 }
atomI:
| ATOMTOKEN INT { Int($2) }
atomS:
| ATOMTOKEN STRING { String($2) }
either:
| TUPLETOKEN INT INT { Record {x=$2;y=$3} } // !!!
| TUPLETOKEN TUPLETOKEN INT STRING { Tuple( $3, $4) } // !!!
I would expect the type B and the Tuple to be inferred. However, FSYACC gives the following error for both of the lines marked with "!!!":
This expression was expected to have type A but here has type Either
What is the correct syntax for the "either" production on the last two lines?
Don't you mean IntTuple($2, $3) as opposed to B($2, $3)? I'd try IntTuple{x=$2; y=$3}
EDIT: this works:
module Ast
type B = { x : int; y : int }
type A =
| Int of int
| String of string
| IntTuple of B
and
%{
open Ast
%}
%start a
%token <string> STRING
%token <System.Int32> INT
%token ATOMTOKEN TUPLETOKEN
%type < Ast.A > a
%%
a:
| atom { $1 }
| tuple { $1 }
atom:
| ATOMTOKEN INT { Int($2) }
| ATOMTOKEN STRING { String($2) }
tuple:
| TUPLETOKEN INT INT { IntTuple {x = $2; y = $3} }
EDIT 2: Note that the line %type < Ast.A > a requires your non-terminal a to be of type Ast.A. Therefore, since you are using the non-terminal tuple directly in a, tuple also needs to be of type Ast.A. That is why you have to wrap the record in IntTuple: the syntax is IntTuple {x = $2; y = $3} rather than just {x = $2; y = $3}.
I would like to implement the following grammar in OCaml using the Menhir parser generator.
There are four different kinds of statement, which come one after another; however, any three of them can be missing. So a program contains at least one of these statements, but may contain more, in the specific order shown.
Here is the grammar:
main = A (B) (C) (D)
| (A) B (C) (D)
| (A) (B) C (D)
| (A) (B) (C) D
Is it possible to express it in a more concise representation?
Here is an example of parser.mly for this grammar:
%token <char> ACHAR BCHAR CCHAR DCHAR
%token EOF
%start <char option list> main
%type <char> a b c d
%%
main:
a option(b) option(c) option(d) { [Some($1); $2; $3; $4] }
| option(a) b option(c) option(d) { [$1; Some($2); $3; $4] }
| option(a) option(b) c option(d) { [$1; $2; Some($3); $4] }
| option(a) option(b) option(c) d { [$1; $2; $3; Some($4)] }
| EOF { [] }
a:
ACHAR { $1 } (* returns 'A' *)
b:
BCHAR { $1 } (* returns 'B' *)
c:
CCHAR { $1 } (* returns 'C' *)
d:
DCHAR { $1 } (* returns 'D' *)
For this case menhir produces warnings:
Warning: production option(a) -> a is never reduced.
Warning: production option(d) -> d is never reduced.
and inputs such as A B C D, A, A C, and B D are not matched. How can I improve the grammar/parser implementation to fix this?
Try this:
main:
a option(b) option(c) option(d) { [Some($1); $2; $3; $4] }
| b option(c) option(d) { [None; Some($1); $2; $3] }
| c option(d) { [None; None; Some($1); $2] }
| d { [None; None; None; Some($1)] }
I removed the last option, which matches the empty sequence, because it contradicts your requirement that at least one of a, b, c or d be present. If you are prepared to accept empty, you could just use
main:
option(a) option(b) option(c) option(d) { [$1; $2; $3; $4] }
although you might want to adjust the action to return [] in the case where all four options are None.
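For instance, that adjustment could look like this (a sketch, not part of the original answer):
main:
  option(a) option(b) option(c) option(d)
    { match $1, $2, $3, $4 with
      | None, None, None, None -> []
      | _ -> [$1; $2; $3; $4] }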
You can write a? instead of option(a).
Also if you want to return four elements, you should use a tuple instead of a list.
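Combining those two suggestions, a sketch of the parser could look like this (like the all-optional variant above, it also accepts empty input):
%token <char> ACHAR BCHAR CCHAR DCHAR
%token EOF
%start <char option * char option * char option * char option> main
%type <char> a b c d
%%

main:
  | x = a? y = b? z = c? w = d? EOF { (x, y, z, w) }

a: ACHAR { $1 }
b: BCHAR { $1 }
c: CCHAR { $1 }
d: DCHAR { $1 }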