How can I debug my flex/bison grammar? - parsing

This is a very silly problem. As far as I know there are no errors in the grammar rules, but it's not giving the right output. I have been staring at it, but the mistake is not visible to me.
What tools are available to me to help me see what is going on in a parse? My attempts to insert tracing code are a lot of work and don't seem to be helping me much.
parser.y
%{
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include "SymbolTable.h"
#include "SymbolInfo.h"
#include "ScopeTable.h"
int yyparse(void);
int yylex(void);
extern char* yytext;
extern FILE * yyin;
extern int tableSize;
FILE *logout;
extern int line_count;
extern char *arr[100];
extern char *final_arr[100];
SymbolTable *table;
/* Error callback: prints the message it is given on stderr,
   followed by a newline. */
void yyerror (const char *s)
{
    fprintf(stderr, "%s\n", s);
}
%}
%union {
class SymbolInfo* sym;
char *s;
float f;
}
%error-verbose
%verbose
%token COMMA INT ID SEMICOLON FLOAT VOID LCURL RCURL RETURN NOT IF FOR WHILE PRINTLN LPAREN RPAREN
%token CONST_INT CONST_FLOAT LTHIRD RTHIRD
%token ADDOP MULOP INCOP DECOP RELOP LOGICOP ASSIGNOP
%token <f> DOUBLE
//%expect 1
%precedence THEN
%precedence ELSE
%left "<" ">" "<=" ">=" "=" "!="
%left "+" "-"
%left "*" "/"
%left UMINUS
%%
start : program { printf("start -> program\n");
fprintf(logout,"%d : start -> program\n",line_count);
}
;
program : program unit {
printf("program -> program unit\n");
fprintf(logout,"%d : program -> program unit\n\n",line_count);
for(int j = 0; final_arr[j] != NULL; j++)
{
fprintf(logout,"%s",final_arr[j]);
}
fprintf(logout,"\n\n");
}
| unit {
printf("program -> unit\n");
fprintf(logout,"%d : program -> unit\n\n",line_count);
for(int j = 0; final_arr[j] != NULL; j++)
{
fprintf(logout,"%s",final_arr[j]);
}
fprintf(logout,"\n\n");
}
;
unit : var_dec {
printf("unit -> var_dec\n");
fprintf(logout,"%d : unit -> var_dec\n\n",line_count);
for(int j = 0; arr[j] != NULL; j++)
{
fprintf(logout,"%s",arr[j]);
}
fprintf(logout,"\n\n");
}
|func_declaration {
fprintf(logout,"%d : unit -> func_declaration\n\n",line_count);
for(int j = 0; arr[j] != NULL; j++)
{
fprintf(logout,"%s",arr[j]);
}
fprintf(logout,"\n\n");
}
|func_definition {
fprintf(logout,"%d : unit -> func_definition\n\n",line_count);
for(int j = 0; arr[j] != NULL; j++)
{
fprintf(logout,"%s",arr[j]);
}
fprintf(logout,"\n\n");
}
;
;
/* func_declaration: a function prototype, with or without a parameter list.
   Each action echoes the reduction on stdout, writes it to logout prefixed
   with line_count, and then dumps the strings stored in arr[] up to the
   first NULL entry. */
func_declaration : type_specifier ID LPAREN parameter_list RPAREN SEMICOLON {
    printf("func_declaration -> type_specifier id LPAREN parameter_list RPAREN SEMICOLON\n");
    fprintf(logout,"%d : func_declaration : type_specifier ID LPAREN parameter_list RPAREN SEMICOLON\n\n", line_count);
    for(int j = 0; arr[j] != NULL; j++)
    {
        fprintf(logout,"%s",arr[j]);
    }
    fprintf(logout,"\n\n");
}
| type_specifier ID LPAREN RPAREN SEMICOLON {
    printf("func_declaration -> type_specifier id LPAREN RPAREN SEMICOLON\n");
    /* Log the production that was actually reduced (no parameter_list),
       so the log file agrees with the stdout trace above. */
    fprintf(logout,"%d : func_declaration : type_specifier ID LPAREN RPAREN SEMICOLON\n\n", line_count);
    for(int j = 0; arr[j] != NULL; j++)
    {
        fprintf(logout,"%s",arr[j]);
    }
    fprintf(logout,"\n\n");
}
;
func_definition : type_specifier ID LPAREN parameter_list RPAREN compound_statement {
printf("func_definition -> type_specifier ID LPAREN parameter_list RPAREN compound_statement\n");
fprintf(logout,"%d : func_definition : type_specifier ID LPAREN parameter_list RPAREN compound_statement\n\n", line_count);
}
| type_specifier ID LPAREN RPAREN compound_statement {
printf("func_definition -> type_specifier id LPAREN RPAREN compound_statement\n");
fprintf(logout,"%d : func_definition : type_specifier ID LPAREN RPAREN compound_statement\n\n", line_count);
}
;
parameter_list : parameter_list COMMA type_specifier ID {
printf("parameter_list -> parameter_list COMMA type_specifier ID\n");
fprintf(logout,"%d : parameter_list : parameter_list COMMA type_specifier ID\n\n", line_count);
for(int j = 0; arr[j] != NULL; j++)
{
fprintf(logout,"%s",arr[j]);
}
fprintf(logout,"\n\n");
}
| parameter_list COMMA type_specifier {
printf("parameter_list -> parameter_list COMMA type_specifier\n");
fprintf(logout,"%d : parameter_list : parameter_list COMMA type_specifier\n\n", line_count);
}
| type_specifier ID {
printf("parameter_list -> type_specifier ID\n");
fprintf(logout,"%d : parameter_list : type_specifier ID\n\n", line_count);
for(int j = 0; arr[j] != NULL; j++)
{
fprintf(logout,"%s",arr[j]);
}
fprintf(logout,"\n\n");
}
| type_specifier {
printf("parameter_list -> type_specifier\n");
fprintf(logout,"%d : parameter_list : type_specifier \n\n", line_count);
}
;
compound_statement : LCURL statements RCURL {
printf("compound_statement -> LCURL statements RCURL\n");
fprintf(logout,"compound_statement : LCURL statements RCURL\n\n");
}
| LCURL RCURL
;
var_dec: type_specifier declaration_list SEMICOLON {
printf("var_dec -> type_specifier declaration_list SEMICOLON \n");
fprintf(logout,"%d : var_dec: type_specifier declaration_list SEMICOLON \n\n", line_count);
for(int j = 0; arr[j] != NULL; j++)
{
fprintf(logout,"%s",arr[j]);
}
fprintf(logout,"\n\n");
}
;
type_specifier : INT {printf("type_specifier -> INT\n");
fprintf(logout,"%d : type_specifier-> INT\n\n%s\n\n", line_count,yytext);
}
| FLOAT {printf("type_specifier ->FLOAT\n");
fprintf(logout,"%d : type_specifier-> FLOAT\n\n%s\n\n",line_count, yytext);
}
| VOID {printf("type_specifier -> VOID\n");
fprintf(logout,"%d : type_specifier-> VOID\n\n%s\n\n",line_count, yytext);
}
;
declaration_list : declaration_list COMMA ID {
printf("declaration_list -> declaration_list COMMA ID\n");
fprintf(logout,"%d : declaration_list -> declaration_list COMMA ID\n\n",line_count);
for(int j = 1; arr[j+1] != NULL; j++)
{
fprintf(logout,"%s",arr[j]);
}
fprintf(logout,"\n\n");
}
| declaration_list COMMA ID LTHIRD CONST_INT RTHIRD {
printf("declaration_list -> declaration_list COMMA ID LTHIRD CONST_INT RTHIRD\n");
fprintf(logout,"%d : declaration_list -> declaration_list COMMA ID LTHIRD CONST_INT RTHIRD\n",line_count);
for(int j = 1; arr[j+1] != NULL; j++)
{
fprintf(logout,"%s",arr[j]);
}
fprintf(logout,"\n\n");
}
|ID {
printf("declaration_list -> ID\n");
fprintf(logout,"%d : declaration_list -> ID\n\n",line_count);
for(int j = 1; arr[j+1] != NULL; j++)
{
fprintf(logout,"%s",arr[j]);
}
fprintf(logout,"\n\n");
}
|ID LTHIRD CONST_INT RTHIRD {
printf("declaration_list -> ID LTHIRD CONST_INT RTHIRD\n");
fprintf(logout,"%d : declaration_list -> ID LTHIRD CONST_INT RTHIRD\n",line_count);
for(int j = 1; arr[j+1] != NULL; j++)
{
fprintf(logout,"%s",arr[j]);
}
fprintf(logout,"\n\n");
}
;
statements : statement {
printf("statements -> statement\n");
fprintf(logout,"%d : statements : statement\n\n",line_count);
fprintf(logout, "%s\n\n",yytext);
}
| statements statement
;
statement : var_dec
| expression_statement
| compound_statement
| FOR LPAREN expression_statement expression_statement expression RPAREN statement
| IF LPAREN expression RPAREN statement
| WHILE LPAREN expression RPAREN statement
| PRINTLN LPAREN ID RPAREN SEMICOLON
| RETURN expression SEMICOLON {
printf("statement -> RETURN expression SEMICOLON\n");
fprintf(logout,"%d : statement : RETURN expression SEMICOLON\n\n",line_count);
fprintf(logout, "%s\n\n",yytext);
}
;
expression_statement : SEMICOLON
| expression SEMICOLON
;
variable : ID {
printf("variable -> ID\n");
fprintf(logout,"%d : variable : ID\n\n",line_count);
fprintf(logout, "%s\n\n",yytext);
}
| ID LTHIRD expression RTHIRD
;
expression : logic_expression {
printf("expression -> logic_expression\n");
fprintf(logout,"%d : expression : logic_expression\n\n",line_count);
fprintf(logout, "%s\n\n",yytext);
}
| variable ASSIGNOP logic_expression
;
logic_expression : rel_expression
| rel_expression LOGICOP rel_expression
;
rel_expression : simple_expression {
printf("rel_expression -> simple_expression \n");
fprintf(logout,"%d : rel_expression : simple_expression\n\n",line_count);
fprintf(logout, "%s\n\n",yytext);
}
| simple_expression RELOP simple_expression
;
simple_expression : term {
printf("simple_expression -> term\n");
fprintf(logout,"%d : simple_expression : term \n\n",line_count);
fprintf(logout, "%s\n\n",yytext);
}
| simple_expression ADDOP term {
printf("simple_expression -> simple_expression ADDOP term\n");
fprintf(logout,"simple_expression : simple_expression ADDOP term \n\n");
fprintf(logout, "%s\n\n",yytext);
}
;
term : unary_expression {
printf("term -> unary_expression\n");
fprintf(logout,"%d : term : unary_expression\n\n",line_count);
fprintf(logout, "%s\n\n",yytext);
}
| term MULOP unary_expression
;
unary_expression : ADDOP unary_expression
| NOT unary_expression
| factor {
printf("unary_expression -> factor\n");
fprintf(logout,"%d : unary_expression : factor\n\n",line_count);
fprintf(logout, "%s\n\n",yytext);
}
;
factor : variable {
printf("factor -> variable\n");
fprintf(logout,"%d : factor : variable\n\n",line_count);
fprintf(logout, "%s\n\n",yytext);
}
| ID LPAREN argument_list RPAREN
| LPAREN expression RPAREN
| CONST_INT
| CONST_FLOAT
| variable INCOP
| variable DECOP
;
argument_list : arguments
|
;
arguments : arguments COMMA logic_expression
| logic_expression
;
%%
/*
 * Entry point: parses the file named by argv[1] and writes the reduction
 * log to log.txt.
 *
 * Exits with status 1 when no input file is given, when the input file
 * cannot be opened, or when log.txt cannot be created. Otherwise returns 0
 * once yyparse() has finished.
 */
int main(int argc, char *argv[])
{
    FILE *fp;

    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <input-file>\n", argc > 0 ? argv[0] : "parser");
        exit(1);
    }
    if ((fp = fopen(argv[1], "r")) == NULL)
    {
        /* logout has not been opened yet, so the message must go to stderr. */
        fprintf(stderr, "cannot open file\n");
        exit(1);
    }
    logout = fopen("log.txt", "w");
    if (logout == NULL)
    {
        /* Every grammar action writes to logout; do not parse without it. */
        fprintf(stderr, "cannot open log.txt\n");
        fclose(fp);
        exit(1);
    }
    yyin = fp;
    yyparse();
    fclose(fp);
    fclose(logout);
    return 0;
}
input.txt
int var(int a, int b){
return a+b;
}
output I'm getting :
type_specifier -> INT
type_specifier -> INT
parameter_list -> type_specifier ID
type_specifier -> INT
parameter_list -> parameter_list COMMA type_specifier ID
variable -> ID
factor -> variable
unary_expression -> factor
term -> unary_expression
simple_expression -> term
rel_expression -> simple_expression
expression -> logic_expression
syntax error, unexpected ID, expecting SEMICOLON
expected output is :
type_specifier -> INT
type_specifier -> INT
parameter_list -> type_specifier ID
type_specifier -> INT
parameter_list -> parameter_list COMMA type_specifier ID
variable -> ID
factor -> variable
unary_expression -> factor
term -> unary_expression
simple_expression -> term
variable -> ID
factor -> variable
unary_expression -> factor
term -> unary_expression
simple_expression : simple_expression ADDOP term
rel_expression -> simple_expression
logic_expression : rel_expression
expression -> logic_expression
statement : RETURN expression SEMICOLON
statements : statement
compound_statement : LCURL statements RCURL
func_definition : type_specifier ID LPAREN parameter_list RPAREN compound_statement
unit : func_definition
program : program unit
start : program
Adding the flex file just in case
%option noyywrap
%{
#include<stdlib.h>
#include<stdio.h>
#include "y.tab.h"
#include "SymbolTable.h"
#include "SymbolInfo.h"
#include "ScopeTable.h"
void yyerror (char *);
extern YYSTYPE yylval;
extern SymbolTable *table;
extern FILE *logout;
char *arr[100];
char *final_arr[100];
int k; //final_arr count
int i = 0; //arr count
int line_count = 1;
%}
id [a-z]*
DOUBLE (([0-9]+(\.[0-9]*)?)|([0-9]*\.[0-9]+))
newline \n
%%
{newline} {
arr[i] = "\n",final_arr[k] = arr[i];
i++; k++;
line_count++;
}
[ \t]+ {}
(([0-9]+(\.[0-9]*)?)|([0-9]*\.[0-9]+)) {
yylval.f = atof(yytext);
return DOUBLE;
}
"int" {
    /* Clear arr[] and restart its index before storing the keyword.
       memset's second argument is a fill byte, so pass 0 rather than NULL. */
    memset(arr, 0, sizeof(arr)); i = 0;
    arr[i] = "int ";
    final_arr[k] = "int ";
    i++; k++;
    return INT;
}
"float" {
    /* Same reset as for "int": arr[] starts over, final_arr[] keeps growing. */
    memset(arr, 0, sizeof(arr)); i = 0;
    arr[i] = "float "; final_arr[k] = "float ";
    i++; k++;
    return FLOAT;
}
"void" {
    /* Same reset as for "int": arr[] starts over, final_arr[] keeps growing. */
    memset(arr, 0, sizeof(arr)); i = 0;
    arr[i] = "void "; final_arr[k] = "void ";
    i++; k++;
    return VOID;
}
";" {
arr[i] = ";";final_arr[k] = ";";
i++; k++;
return SEMICOLON;}
"," {
arr[i] = ","; final_arr[k] = ",";
i++; k++;
return COMMA;
}
"(" {
arr[i] = "(";final_arr[k] = "(";
i++; k++;
return LPAREN;}
")" {
arr[i] = ")";final_arr[k] = ")";
i++; k++;
return RPAREN;}
"{" {return LCURL;}
"}" {return RCURL;}
{id} {
yylval.s = strdup(yytext);
arr[i] = strdup(yytext); final_arr[k] = strdup(yytext);
k++; i++;
for(int j = 1; arr[j] != NULL; j++)
{
//fprintf(logout,"%s", arr[j]);
//fprintf(logout,"arr [%d] %s\n ",j,arr[j]);
}
//fprintf(logout,"\n\n");
return ID;
}
%%

You seem to have spent an awful lot of effort trying to implement a way of tracing what's going on in your parser, and to little effect since the problem here is simply a missing lexer keyword rule.
You would be much better off using the built-in debugging features of flex and bison. Then your grammar and lexer would be much simpler and easier to read, and the debugging output would be more complete (and would let you trace the behaviour through the state table).
Here's a quick summary. It's a snap, really.
Add --debug to your bison command. That will cause bison to generate code to trace your parse. (If you're lazy, you can use -t -- for trace -- which is the Posix standard command-line option, and should also work with yacc, byacc, btyacc, etc., etc.)
Add the following three lines at the beginning of main, assuming that main is in your .y file:
#ifdef YYDEBUG
yydebug = 1;
#endif
For additional bonus points, you could make this assignment conditional on some command line flag.
Once you do that, you will receive the following trace output:
... snip ... Pick up the trace at the ) at the end of the parameter list
Reading a token: Next token is token RPAREN ()
Shifting token RPAREN ()
Entering state 28
Reading a token: Next token is token LCURL ()
Shifting token LCURL ()
Entering state 25
Reading a token: Next token is token ID ()
Shifting token ID ()
Entering state 44
Reading a token: Next token is token ID ()
... snip ...
Note that two IDs were returned after the curly bracket, corresponding to the tokens return and a.
You can also enable tracing in flex with flex --debug (or -d). This causes the scanner to produce an output line of the form
--accepting rule at line 85 ("return")
for every accepted token (and some other lines). You need to check the line numbers against your source code, unfortunately, but in this case you might have noticed the similarity between the above and
--accepting rule at line 85 ("b")
For additional debugging simplicity, it's worth getting into the habit of writing your scanner in a way that it can be compiled independently of the parser. Then you can test your scanner by compiling it separately using the main() implementation in -lfl.
References and more debugging information:
Debugging Your Parser in the bison manual. The section on tracing includes a fully-worked example using one of the example parsers in the manual.
Also see Printing semantic values which documents the %printer declaration.
Debugging Options in the flex manual.

Related

Parser (Yacc) seems like it ignores tokens in grammar

While parsing the C-like example code, I have the following issue: it's as if some tokens, such as identifiers, are ignored by the grammar, causing a syntax error for no apparent reason.
Parser code :
%{
#include <stdio.h>
#include <stdlib.h>
int yylex();
void yyerror (char const *);
%}
%token T_MAINCLASS T_ID T_PUBLIC T_STATIC T_VOID T_MAIN T_PRINTLN T_INT T_FLOAT T_FOR T_WHILE T_IF T_ELSE T_EQUAL T_SMALLER T_BIGGER T_NOTEQUAL T_NUM T_STRING
%left '(' ')'
%left '+' '-'
%left '*' '/'
%left '{' '}'
%left ';' ','
%left '<' '>'
%%
PROGRAM : T_MAINCLASS T_ID '{' T_PUBLIC T_STATIC T_VOID T_MAIN '(' ')' COMP_STMT '}'
;
COMP_STMT : '{' STMT_LIST '}'
;
STMT_LIST : /* nothing */
| STMT_LIST STMT
;
STMT : ASSIGN_STMT
| FOR_STMT
| WHILE_STMT
| IF_STMT
| COMP_STMT
| DECLARATION
| NULL_STMT
| T_PRINTLN '(' EXPR ')' ';'
;
DECLARATION : TYPE ID_LIST ';'
;
TYPE : T_INT
| T_FLOAT
;
ID_LIST : T_ID ',' ID_LIST
|
;
NULL_STMT : ';'
;
ASSIGN_STMT : ASSIGN_EXPR ';'
;
ASSIGN_EXPR : T_ID '=' EXPR
;
EXPR : ASSIGN_EXPR
| RVAL
;
FOR_STMT : T_FOR '(' OPASSIGN_EXPR ';' OPBOOL_EXPR ';' OPASSIGN_EXPR ')' STMT
;
OPASSIGN_EXPR : /* nothing */
| ASSIGN_EXPR
;
OPBOOL_EXPR : /* nothing */
| BOOL_EXPR
;
WHILE_STMT : T_WHILE '(' BOOL_EXPR ')' STMT
;
IF_STMT : T_IF '(' BOOL_EXPR ')' STMT ELSE_PART
;
ELSE_PART : /* nothing */
| T_ELSE STMT
;
BOOL_EXPR : EXPR C_OP EXPR
;
C_OP : T_EQUAL | '<' | '>' | T_SMALLER | T_BIGGER | T_NOTEQUAL
;
RVAL : RVAL '+' TERM
| RVAL '-' TERM
| TERM
;
TERM : TERM '*' FACTOR
| TERM '/' FACTOR
| FACTOR
;
FACTOR : '(' EXPR ')'
| T_ID
| T_NUM
;
%%
/* Error callback: prints msg on stderr with a "C-like : " prefix and
   terminates the program with exit status 1. */
void yyerror (const char * msg)
{
    fprintf(stderr, "C-like : %s\n", msg);
    exit(1);
}
/* Runs the parser once and announces success when yyparse() returns 0. */
int main ()
{
    const int status = yyparse();

    if (status == 0) {
        printf("Compiled !!!\n");
    }
}
Part of Lexical Scanner code :
{Empty}+ { printf("EMPTY ") ; /* nothing */ }
"mainclass" { printf("MAINCLASS ") ; return T_MAINCLASS ; }
"public" { printf("PUBLIC ") ; return T_PUBLIC; }
"static" { printf("STATIC ") ; return T_STATIC ; }
"void" { printf("VOID ") ; return T_VOID ; }
"main" { printf("MAIN ") ; return T_MAIN ; }
"println" { printf("PRINTLN ") ; return T_PRINTLN ; }
"int" { printf("INT ") ; return T_INT ; }
"float" { printf("FLOAT ") ; return T_FLOAT ; }
"for" { printf("FOR ") ; return T_FOR ; }
"while" { printf("WHILE ") ; return T_WHILE ; }
"if" { printf("IF ") ; return T_IF ; }
"else" { printf("ELSE ") ; return T_ELSE ; }
"==" { printf("EQUAL ") ; return T_EQUAL ; }
"<=" { printf("SMALLER ") ; return T_SMALLER ; }
">=" { printf("BIGGER ") ; return T_BIGGER ; }
"!=" { printf("NOTEQUAL ") ; return T_NOTEQUAL ; }
{id} { printf("ID ") ; return T_ID ; }
{num} { printf("NUM ") ; return T_NUM ; }
{string} { printf("STRING ") ; return T_STRING ; }
{punct} { printf("PUNCT ") ; return yytext[0] ; }
<<EOF>> { printf("EOF ") ; return T_EOF; }
. { yyerror("lexical error"); exit(1); }
Example :
mainclass Example {
public static void main ( )
{
int c;
float x, sum, mo;
c=0;
x=3.5;
sum=0.0;
while (c<5)
{
sum=sum+x;
c=c+1;
x=x+1.5;
}
mo=sum/5;
println (mo);
}
}
Running all this stuff it showed up this output:
C-like : syntax error
MAINCLASS EMPTY ID
It seems like id is in wrong position although in grammar we have:
PROGRAM : T_MAINCLASS T_ID '{' T_PUBLIC T_STATIC T_VOID T_MAIN '(' ')' COMP_STMT '}'
Based on the "solution" proposed in OP's self answer, it's pretty clear that the original problem was that the generated header used to compile the scanner was not the same as the header generated by bison/yacc from the parser specification.
The generated header includes definitions of all the token types as small integers; in order for the scanner to communicate with the parser, it must identify each token with the correct token type. So the parser generator (bison/yacc) produces a header based on the parser specification (the .y file), and that header must be #included into the generated scanner so that scanner actions can use symbolic token type names.
If the scanner was compiled with a header file generated from some previous version of the parser specification, it is quite possible that the token numbers no longer correspond with what the parser is expecting.
The easiest way to avoid this problem is to use a build system like make, which will automatically recompile the scanner if necessary.
The easiest way to detect this problem is to use bison's built-in trace facility. Enabling tracing requires only a couple of lines of code, and saves you from having to scatter printf statements throughout your scanner and parser. The bison trace will show you exactly what is going on, so not only is it less work than adding printfs, it is also more precise. In particular, it reports every token which is passed to the parser (and, with a little more effort, you can get it to report the semantic values of those tokens as well). So if the parser is getting the wrong token code, you'll see that right away.
After many potentially helpful changes, the parser finally worked once I changed the order of these tokens.
From
%token T_MAINCLASS T_ID T_PUBLIC T_STATIC T_VOID T_MAIN T_PRINTLN T_INT T_FLOAT T_FOR T_WHILE T_IF T_ELSE T_EQUAL T_SMALLER T_BIGGER T_NOTEQUAL T_NUM T_STRING
TO
%token T_MAINCLASS T_PUBLIC T_STATIC T_VOID T_MAIN T_PRINTLN T_INT T_FLOAT T_FOR T_WHILE T_IF T_EQUAL T_ID T_NUM T_SMALLER T_BIGGER T_NOTEQUAL T_ELSE T_STRING
It looked as if the token being read was else, although the lexer was normally returning an id. Somehow this modification was the solution.

Parser to verify declarations of type int and float in C language

I'm trying to write a parser to verify the following declarations of type int and float in C language.
variables declarations, pointer variable declarations, array of any dimensions
float a , b , r = 5, area = r * r , * b;
int a , b , c , ** p ;
int x , mat [2][3];
This is my lex file
%{
#include "y.tab.h"
extern int yylval;
%}
%%
"int" return INT;
"float" return FLOAT;
[0-9]+ return NUM;
[_|a-z|A-Z]([_|a-z|A-Z|0-9])*{1,255} return NAME;
[+\-*/] return op;
[ \t\n];
. return yytext[0];
%%
This is my yacc file
%{
#include<stdio.h>
int yylex() ;
int yyerror();
%}
%token NUM NAME op INT FLOAT
%%
stmt_list: stmt | stmt_list stmt;
stmt: type id_list ';' { printf("Valid Declaration\n"); };
type: INT | FLOAT;
id_list: id ',' id_list | id ;
id: NAME'='expr | expr;
expr: expr op expr | POINT expr | expr MATRIX | '(' expr')' | NAME;
MATRIX: '[' NUM ']' | '[' NUM ']' MATRIX ;
POINT: '*' | '*'POINT;
%%
/* Runs the parser; the process always exits with status 0, whatever
   yyparse() returns. */
int main(){
    (void)yyparse();
    return 0;
}
/* Error callback: reports an invalid declaration on stdout and returns -1. */
int yyerror(){
    puts("Invalid Declaration");
    return -1;
}
Even if I enter "int a;" as input, I get "Invalid Declaration". I'm not able to figure out what I'm doing wrong.

How to manage semantic rule of declaration of variable in bison

I have to build a compiler that translates the Java language into Python. I'm using the Flex and Bison tools. I created the flex file and I defined the syntactic grammar in Bison for some restrictions that I have to implement (such as arrays, management of loops, management of a class, management of logical-arithmetic operators, etc.).
I'm having trouble understanding how to handle semantic rules. For example, I should handle the semantics for import statement and variable declaration, add the variable in the symbol table and then handle the translation.
This is the structure of the symbol table in the symboltable.h module:
/* Primary symbol table entry: one per scope, keyed by the scope name. */
struct symtable{
    char *scopename;            // key
    struct symtable2 *subtable; // secondary symbol table
    UT_hash_handle hh;          // makes the structure hashable
};
struct symtable2 // secondary symbol structure
{
char *name; // Name of the symbol (key)
char *element; // it can be a variable or an array
char *type; // Indicates the type assumed by the token
(int, float, char, bool)
char *value; // value assigned to the variable
int dim; // Array size, it is zero in the case of a variable.
UT_hash_handle hh; // to make the structure hash
};
And this is the add symbol function:
/*
 * Add a new symbol to the symbol table, or update the value of one that
 * already exists in the given scope.
 *
 *   name               symbol name; key in the scope's secondary table
 *   current_scopename  scope name; key in the primary table (symbols)
 *   element            stored in the entry's element field
 *   current_type       stored as the type; on update it must equal the
 *                      stored type or a semantic error is printed
 *   current_value      stored as the entry's value
 *   dim                stored in the entry's dim field
 *   nr                 line number shown in the semantic error message
 *
 * The strings are stored by pointer, not copied, so the caller must keep
 * them alive for as long as the table is in use.
 */
void add_symbol( char *name, char *current_scopename, char *element, char *current_type, char *current_value, int dim, int nr) {
    struct symtable *s;
    /* Entries are added with string keys (KEYPTR + strlen), so they must be
       looked up by string content as well, not with HASH_FIND_PTR. */
    HASH_FIND_STR(symbols, current_scopename, s);
    if (s == NULL) {
        s = (struct symtable *)malloc(sizeof *s);
        if (s == NULL) {
            fprintf(stderr, "add_symbol: out of memory\n");
            return;
        }
        s->scopename = current_scopename;
        s->subtable = NULL;
        HASH_ADD_KEYPTR(hh, symbols, s->scopename, strlen(s->scopename), s);
    }

    struct symtable2 *s2;
    /* Search the same table the symbol is inserted into below. */
    HASH_FIND_STR(s->subtable, name, s2);
    if (s2 == NULL) {
        s2 = (struct symtable2 *)malloc(sizeof *s2);
        if (s2 == NULL) {
            fprintf(stderr, "add_symbol: out of memory\n");
            return;
        }
        s2->name = name;
        s2->element = element;
        s2->type = current_type;
        s2->value = current_value;
        s2->dim = dim;
        HASH_ADD_KEYPTR(hh, s->subtable, s2->name, strlen(s2->name), s2);
    } else if (strcmp(s2->type, current_type) == 0) {
        s2->value = current_value;
    } else {
        printf("\033[01;31mRiga %i. [FATALE] SEMANTIC ERROR: assignment violates the primitive type of the variable.\033[00m\n", nr);
        printf("\n\n\033[01;31mParsing failed.\033[00m\n");
    }
}
This is a part of the bison file with the grammar to handle import statement and the variable declaration:
%{
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <symboltable.h>
file *f_ptr;
%}
%start program
%token NUMBER
%token ID
%token INT
%token FLOAT
%token DOUBLE
%token CHAR
%token IMPORT
%right ASSIGNOP
%left SCOR
%left SCAND
%left EQ NE
%left LT GT LE GE
%left ADD SUB
%left MULT DIV MOD
%right NOT
%left '(' ')' '[' ']'
%%
program
: ImportStatement GlobalVariableDeclarations
;
ImportStatement
: IMPORT LibraryName ';' { delete_file (); f_ptr = open_file (); fprintf(fptr, "import array \n"); }
;
LibraryName
: 'java.util.*'
;
GlobalVariableFieldDeclarations
: type GlobalVariableDeclarations ';'
;
GlobalVariableDeclarations
: GlobalVariableDeclaration
| GlobalVariableDeclarations ',' GlobalVariableDeclaration
;
GlobalVariableDeclaration
: VariableName
| VariableName ASSIGNOP VariableInitializer {if (typeChecking($1,$3)== 0) {$1= $3; $$=$1;}}
;
VariableName
: ID {$$ = $1 ;}
;
type
: INT
| CHAR
| FLOAT
| DOUBLE
| BOOLEAN
;
VariableInitializers
: VariableInitializer
| VariableInitializers ',' VariableInitializer
;
VariableInitializer
: ExpressionStatement
;
ExpressionStatement
: VariableName
| NUMBER
| ArithmeticExpression
| RelationalExpression
| BooleanExpression
;
ArithmeticExpression
: ExpressionStatement ADD ExpressionStatement
| ExpressionStatement SUB ExpressionStatement
| ExpressionStatement MULT ExpressionStatement
| ExpressionStatement DIV ExpressionStatement
| ExpressionStatement MOD ExpressionStatement
;
RelationalExpression
: ExpressionStatement GT ExpressionStatement
| ExpressionStatement LT ExpressionStatement
| ExpressionStatement GE ExpressionStatement
| ExpressionStatement LE ExpressionStatement
| ExpressionStatement EQ ExpressionStatement
| ExpressionStatement NE ExpressionStatement
;
BooleanExpression
: ExpressionStatement SCAND ExpressionStatement
| ExpressionStatement SCOR ExpressionStatement
| NOT ExpressionStatement
;
%%
/* Error callback: prints the message on stdout and returns 0.
   The function is declared int, so it must return a value. */
int yyerror (char *s)
{
    printf ("%s \n",s);
    return 0;
}
/* Runs the parser once and uses its result as the process exit status. */
int main (void) {
    int status = yyparse();
    return status;
}
/*
 * Check whether two symbols are both plain variables of the same type.
 *
 * Returns  0 when both symbols are variables and their types are equal,
 *          1 when both are variables but their types differ,
 *         -1 when either symbol is not found or is not a variable.
 *
 * NOTE(review): assumes symbols are identified by their name strings and
 * that find_symbol(scopename, name) returns a struct symtable2 * or NULL.
 * Confirm against symboltable.h and the semantic value type of ID.
 */
int typeChecking (char *variable1, char *variable2) {
    struct symtable2 *first = find_symbol (scopename, variable1);
    if (first == NULL) {
        printf("\n\n\033[01;31mVariable 1 not defined.\033[00m\n");
        return -1;
    }

    struct symtable2 *second = find_symbol (scopename, variable2);
    if (second == NULL) {
        printf("\n\n\033[01;31mVariable 2 not defined.\033[00m\n");
        return -1;
    }

    /* element and type are strings, so compare their contents. */
    if (strcmp(first->element, "variable") != 0 ||
        strcmp(second->element, "variable") != 0) {
        printf("\n\n\033[01;31m Different elements.\033[00m\n");
        return -1;
    }

    return strcmp(first->type, second->type) == 0 ? 0 : 1;
}
I am a beginner with the syntax of the bison for the management of semantics, on the following productions I have doubts about the relative semantic rule:
GlobalVariableFieldDeclarations
: type GlobalVariableDeclarations ';'
;
GlobalVariableDeclarations
: GlobalVariableDeclaration
| GlobalVariableDeclarations ',' GlobalVariableDeclaration
;
GlobalVariableDeclaration
: VariableName
| VariableName ASSIGNOP VariableInitializer {if (typeChecking($1,$3)== 0) {$1= $3; $$=$1;}}
;
VariableName
: ID {$$ = $1 ;}
;
Is it correct to manage semantics in this way for a GlobalVariableDeclaration production? And how can I insert the required parameter values, in the symbol table, via the add_symbol function? (Or better, how can I acquire the required parameters starting from productions to insert them in the add_symbol function that I have implemented?) Forgive me but I am a beginner, and many things about the semantics are not clear to me. I hope you have the patience to help me, I thank you in advance.
You should use Bison to build an AST, and then perform semantic analysis on the tree instead of in the grammar. Building an AST allows you to perform analysis on more complex data structures than just the grammar rules you built in Bison.
Once you have your AST for the input you can then make rules for how to convert that AST into a python program with the same syntax.
Here is an example of a Bison/Flex compiler for the Decaf language that might give you some ideas https://github.com/davidcox143/Decaf-Compiler

unclear how to add extra productions to bison grammar to create error messages

This is not homework, but it is from a book.
I'm given a following bison spec file:
%{
#include <stdio.h>
#include <ctype.h>
int yylex();
int yyerror();
%}
%token NUMBER
%%
command : exp { printf("%d\n", $1); }
; /* allows printing of the result */
exp : exp '+' term { $$ = $1 + $3; }
| exp '-' term { $$ = $1 - $3; }
| term { $$ = $1; }
;
term : term '*' factor { $$ = $1 * $3; }
| factor { $$ = $1; }
;
factor : NUMBER { $$ = $1; }
| '(' exp ')' { $$ = $2; }
;
%%
/* Runs the parser once; its return value becomes the exit status. */
int main() {
    int result = yyparse();
    return result;
}
/*
 * Hand-written scanner reading from stdin.
 * Returns NUMBER (with the value stored in yylval) for a run of digits,
 * 0 for a newline, and the character itself for anything else.
 */
int yylex() {
    int ch;

    /* eliminate blanks */
    do {
        ch = getchar();
    } while (ch == ' ');

    if (isdigit(ch)) {
        /* push the digit back so scanf can read the whole number */
        ungetc(ch, stdin);
        scanf("%d", &yylval);
        return NUMBER;
    }

    /* a newline makes the parse stop; other characters are returned as-is */
    return (ch == '\n') ? 0 : ch;
}
/* Allows for printing of an error message: writes s and a newline to
   stderr, then returns 0. */
int yyerror(char * s) {
    fprintf(stderr, "%s\n", s);
    return 0;
}
The task is to do the following:
Rewrite the spec to add the following useful error messages:
"missing right parenthesis," generated by the string (2+3
"missing left parenthesis," generated by the string 2+3)
"missing operator," generated by the string 2 3
"missing operand," generated by the string (2+)
The simplest solution that I was able to come up with is to do the following:
half_exp : exp '+' { $$ = $1; }
| exp '-' { $$ = $1; }
| exp '*' { $$ = $1; }
;
factor : NUMBER { $$ = $1; }
| '(' exp '\n' { yyerror("missing right parenthesis"); }
| exp ')' { yyerror("missing left parenthesis"); }
| '(' exp '\n' { yyerror("missing left parenthesis"); }
| '(' exp ')' { $$ = $2; }
| '(' half_exp ')' { yyerror("missing operand"); exit(0); }
;
exp : exp '+' term { $$ = $1 + $3; }
| exp '-' term { $$ = $1 - $3; }
| term { $$ = $1; }
| exp exp { yyerror("missing operator"); }
;
These changes work, however they lead to a lot of conflicts.
Here is my question.
Is there a way to rewrite this grammar in such a way so that it wouldn't generate conflicts?
Any help is appreciated.
Yes it is possible:
command : exp { printf("%d\n", $1); }
; /* allows printing of the result */
exp: exp '+' exp {
// code
}
| exp '-' exp {
// code
}
| exp '*' exp {
// code
}
| exp '/' exp {
// code
}
|'(' exp ')' {
// code
}
Bison allows Ambiguous grammars.
I don't see how you can rewrite the grammar to avoid conflicts. You just missed the point of terms, factors, etc. You use these when you want a left-recursive context-free grammar.
From this grammar:
E -> E+T
|T
T -> T*F
|F
F -> (E)
|num
Once you free it from left recursion you would go to:
E -> TE' { num , ( }
E' -> +TE' { + }
| eps { ) , EOI }
T -> FT' { ( , num }
T' -> *FT' { * }
|eps { + , ) , EOI }
F -> (E) { ( }
|num { num }
These sets alongside rules are showing what input character has to be in order to use that rule. Of course this is just example for simple arithmetic expressions for example 2*(3+4)*5+(3*3*3+4+5*6) etc.
If you want to learn more about this topic I suggest you to read about "left recursion context free grammar". There are some great books covering this topic and also covering how to get input sets.
But as I said above, all of this can be avoided because Bison allows Ambiguous grammars.

ANTLR 3 bug, mismatched input, but what's wrong?

I have the following problem:
My ANTLR 3 grammar compiles, but my simple testprogram doesn't work. The grammar is as follows:
grammar Rietse;
options {
k=1;
language=Java;
output=AST;
}
tokens {
COLON = ':' ;
SEMICOLON = ';' ;
OPAREN = '(' ;
CPAREN = ')' ;
COMMA = ',' ;
OCURLY = '{' ;
CCURLY = '}' ;
SINGLEQUOTE = '\'' ;
// operators
BECOMES = '=' ;
PLUS = '+' ;
MINUS = '-' ;
TIMES = '*' ;
DIVIDE = '/' ;
MODULO = '%' ;
EQUALS = '==' ;
LT = '<' ;
LTE = '<=' ;
GT = '>' ;
GTE = '>=' ;
UNEQUALS = '!=' ;
AND = '&&' ;
OR = '||' ;
NOT = '!' ;
// keywords
PROGRAM = 'program' ;
COMPOUND = 'compound' ;
UNARY = 'unary' ;
DECL = 'decl' ;
SDECL = 'sdecl' ;
STATIC = 'static' ;
PRINT = 'print' ;
READ = 'read' ;
IF = 'if' ;
THEN = 'then' ;
ELSE = 'else' ;
DO = 'do' ;
WHILE = 'while' ;
// types
INTEGER = 'int' ;
CHAR = 'char' ;
BOOLEAN = 'boolean' ;
TRUE = 'true' ;
FALSE = 'false' ;
}
#lexer::header {
package Eindopdracht;
}
#header {
package Eindopdracht;
}
// Parser rules
// Entry point: a program2 followed by end of input, rewritten under a PROGRAM root node.
program
: program2 EOF
-> ^(PROGRAM program2)
;
// One or more groups, each consisting of zero or more declarations followed by exactly one statement.
program2
: (declaration* statement)+
;
// Variable declaration; a leading STATIC yields an SDECL node, otherwise a DECL node.
// The SEMICOLON is not part of the rewritten tree.
declaration
: STATIC type IDENTIFIER SEMICOLON -> ^(SDECL type IDENTIFIER)
| type IDENTIFIER SEMICOLON -> ^(DECL type IDENTIFIER)
;
// The three built-in type keywords.
type
: INTEGER
| CHAR
| BOOLEAN
;
// Every statement form, including while and if, must end with a SEMICOLON
// (the '!' suffix keeps it out of the tree).
statement
: assignment_expr SEMICOLON!
| while_stat SEMICOLON!
| print_stat SEMICOLON!
| if_stat SEMICOLON!
| read_stat SEMICOLON!
;
// WHILE becomes the subtree root; parentheses and braces are dropped.
while_stat
: WHILE^ OPAREN! or_expr CPAREN! OCURLY! statement+ CCURLY! // while (expression) {statement+}
;
// PRINT becomes the subtree root over one or more comma-separated expressions.
print_stat
: PRINT^ OPAREN! or_expr (COMMA! or_expr)* CPAREN! // print(expression [, expression]...)
;
// READ becomes the subtree root over the identifiers.
// NOTE(review): the '+' requires at least two identifiers; confirm this is intended rather than '*'.
read_stat
: READ^ OPAREN! IDENTIFIER (COMMA! IDENTIFIER)+ CPAREN! // read(identifier, identifier [, identifier]...)
;
// IF becomes the subtree root; the ELSE branch is optional and the ELSE token itself is dropped.
if_stat
: IF^ OPAREN! or_expr CPAREN! comp_expr (ELSE! comp_expr)? // if (expression) compound else compound
;
// Zero or more BECOMES operators, each made a root.
// NOTE(review): the left-hand side is any or_expr, not restricted to an IDENTIFIER.
assignment_expr
: or_expr (BECOMES^ or_expr)*
;
// Logical OR over and_expr operands.
or_expr
: and_expr (OR^ and_expr)*
;
// Logical AND over compare_expr operands.
and_expr
: compare_expr (AND^ compare_expr)*
;
// At most one comparison operator (the trailing '?'), so comparisons do not chain.
compare_expr
: plusminus_expr ((LT|LTE|GT|GTE|EQUALS|UNEQUALS)^ plusminus_expr)?
;
// Additive level: any number of PLUS / MINUS operators over timesdivide_expr operands.
plusminus_expr
: timesdivide_expr ((PLUS | MINUS)^ timesdivide_expr)*
;
// Multiplicative level: any number of TIMES / DIVIDE / MODULO operators over unary_expr operands.
timesdivide_expr
: unary_expr ((TIMES | DIVIDE | MODULO)^ unary_expr)*
;
// A bare operand, or a single prefix operator applied to an operand and wrapped in a UNARY node.
unary_expr
: operand
| PLUS operand -> ^(UNARY PLUS operand)
| MINUS operand -> ^(UNARY MINUS operand)
| NOT operand -> ^(UNARY NOT operand)
;
// Atoms: boolean literals, character literal, identifier, number,
// or a parenthesized or_expr (parentheses dropped from the tree).
operand
: TRUE
| FALSE
| charliteral
| IDENTIFIER
| NUMBER
| OPAREN! or_expr CPAREN!
;
// Braced program2 body, rewritten under a COMPOUND root node without the braces.
comp_expr
: OCURLY program2 CCURLY -> ^(COMPOUND program2)
;
// Lexer rules
// NOTE(review): the lowercase name makes charliteral a parser rule even though it sits in this
// section, and it references LETTER, which is declared below as a lexer fragment.
charliteral
: SINGLEQUOTE! LETTER SINGLEQUOTE!
;
// A letter followed by any number of letters or digits.
IDENTIFIER
: LETTER (LETTER | DIGIT)*
;
// One or more decimal digits.
NUMBER
: DIGIT+
;
// Line comment from '//' up to a newline, sent to the hidden channel.
// The rule requires a terminating '\n'.
COMMENT
: '//' .* '\n'
{ $channel=HIDDEN; }
;
// Runs of whitespace, sent to the hidden channel.
WS
: (' ' | '\t' | '\f' | '\r' | '\n')+
{ $channel=HIDDEN; }
;
// Fragment rules: building blocks for the lexer rules above.
fragment DIGIT : ('0'..'9') ;
fragment LOWER : ('a'..'z') ;
fragment UPPER : ('A'..'Z') ;
fragment LETTER : LOWER | UPPER ;
// EOF
I then use the following java file to test programs:
package Package;
import java.io.FileInputStream;
import java.io.InputStream;
import org.antlr.runtime.ANTLRInputStream;
import org.antlr.runtime.CommonTokenStream;
import org.antlr.runtime.RecognitionException;
import org.antlr.runtime.tree.BufferedTreeNodeStream;
import org.antlr.runtime.tree.CommonTree;
import org.antlr.runtime.tree.CommonTreeNodeStream;
import org.antlr.runtime.tree.DOTTreeGenerator;
import org.antlr.runtime.tree.TreeNodeStream;
import org.antlr.stringtemplate.StringTemplate;
/**
 * Command-line driver that runs the generated Rietse lexer and parser over a program.
 *
 * Usage: {@code Rietse [inputFile]}. When no file argument is given, the program
 * is read from standard input.
 */
public class Rietse {
    /**
     * Lexes and parses the input, reporting any failure on standard error.
     *
     * @param args optional; {@code args[0]} is the path of the file to parse
     */
    public static void main(String[] args) {
        // Guard the index: reading args[0] from an empty array would throw before
        // the standard-input fallback below could ever be chosen.
        String inputFile = args.length > 0 ? args[0] : null;
        try {
            InputStream in = inputFile == null ? System.in : new FileInputStream(inputFile);
            RietseLexer lexer = new RietseLexer(new ANTLRInputStream(in));
            CommonTokenStream tokens = new CommonTokenStream(lexer);
            RietseParser parser = new RietseParser(tokens);
            // Start parsing at the grammar's entry rule; the returned value is not used here.
            parser.program();
        } catch (RietseException e) {
            System.err.print("ERROR: RietseException thrown by compiler: ");
            System.err.println(e.getMessage());
        } catch (RecognitionException e) {
            System.err.print("ERROR: recognition exception thrown by compiler: ");
            System.err.println(e.getMessage());
            e.printStackTrace();
        } catch (Exception e) {
            System.err.print("ERROR: uncaught exception thrown by compiler: ");
            System.err.println(e.getMessage());
            e.printStackTrace();
        }
    }
}
And finally, the test program itself:
print('a');
Now when I run this, I get the following errors:
line 1:7 mismatched input 'a' expecting LETTER
line 1:9 mismatched input ')' expecting LETTER
I have no idea what causes this bug. I have tried several changes, but nothing fixed it. Does anyone here know what's wrong with my code and how I can fix it?
Every bit of help is greatly appreciated, thanks in advance.
Greetings,
Rien
Using a rule:
// Lexer rule (uppercase name): a single letter or digit between single quotes, matched as one token.
// The quotes are not suppressed here, so they are part of the token text.
CHARLITERAL
: SINGLEQUOTE (LETTER | DIGIT) SINGLEQUOTE
;
and changing operand to:
// Atom alternatives; a character literal is taken from the CHARLITERAL lexer token.
operand
: TRUE
| FALSE
| CHARLITERAL
| IDENTIFIER
| NUMBER
| OPAREN! or_expr CPAREN!
;
will fix the problem. It does leave the single quotes in the text of the AST node, but that can optionally be fixed by changing the text of the node with the
setText(String);
method.
Turn charliteral into a lexer rule (rename it to CHARLITERAL). Right now, the string 'a' is tokenized like this: SINGLEQUOTE IDENTIFIER SINGLEQUOTE, so you're getting an IDENTIFIER instead of a LETTER.
I wonder how this code can compile at all given that you're using a fragment (LETTER) from a parser rule.

Resources