How can I implement #include constructs with Flex and YACC? - parsing

During parsing, if I encounter an include token I want to instruct YACC to open the file specified as input and to begin parsing this. Once this parsing is finished, I want to instruct YACC to return to the file and continue parsing directly after the include expression. I will restrict the include depth level to one.

The flex manual covers how to do this using yypush_buffer_state() and yypop_buffer_state().
Here is the section of the official manual on using multiple input buffers. There is some sample code.

It's normal to communicate between the lexical and syntactic phases of your processor.
So, recognize the syntax for an include directive in your parser (or, to make things easier, just recognize it in the lexer) and do the switching at the lexical level.
For example, here is a simple language that recognizes standard input lines containing ab or cd or .file. When it sees .someString it opens someString as an include file and then goes back to reading standard input.
%{
/*
 * ip.y: toy grammar demonstrating include handling. Each input line is
 * "ab", "cd", or '.' followed by a name; the last form switches the lexer
 * to the named file through start_include().
 */
#include <stdio.h>
#include <stdlib.h>
void start_include(char *); int yylex(void); void yyerror(void *);
/* Semantic values are C strings; yylex() stores the include name in yylval. */
#define YYSTYPE char *
%}
%%
all: all others | others;
/* 'Z' is the token yylex() returns at end of input; it ends the parse. */
others: include_rule | rule_1 | rule_2 | 'Z' { YYACCEPT; };
/* $1 is the value attached to the '.' token, i.e. the name to open. */
include_rule: '.' '\n' { start_include($1); };
rule_1: 'a' 'b' '\n' { printf("random rule 1\n"); };
rule_2: 'c' 'd' '\n' { printf("random rule 2\n"); };
%%
/* Stream the lexer currently reads from; NULL until yylex() or
 * start_include() assigns it. */
FILE *f = NULL;

/*
 * Redirect lexing to the file named by `s`.
 *
 * On failure the reason is reported with perror() and the program aborts,
 * as before. If an earlier include file is still open it is closed first so
 * that its stream is not leaked when `f` is overwritten.
 */
void start_include(char *s) {
    FILE *next = fopen(s, "r");

    if (next == NULL) {
        perror(s);
        abort();
    }
    if (f != NULL && f != stdin)
        fclose(f);
    f = next;
}
/*
 * Hand-written lexer: every character read is returned as its own token.
 *
 * Input comes from `f` (stdin until an include is started). When the current
 * stream hits EOF, an include file is closed and reading resumes on stdin;
 * if stdin is exhausted too, 'Z' is returned to signal end of input.
 *
 * For a '.' token the following whitespace-delimited word is read from the
 * same stream and handed to the parser through yylval.
 */
int yylex(void) {
    static char s[100];   /* static: yylval keeps pointing at it after return */
    int c;

    if (f == NULL)
        f = stdin;
    c = getc(f);
    if (c == EOF) {
        /* Done with the current input: release an include file, go back to stdin. */
        if (f != stdin)
            fclose(f);
        f = stdin;
        c = getc(f);
    }
    if (c == EOF)
        return 'Z';
    if (c == '.') {
        /* Read from the stream the '.' came from; the width bounds the copy into s. */
        if (fscanf(f, " %99s", s) != 1)
            s[0] = '\0';
        yylval = s;
    }
    return c;
}
And when we run it...
$ cat > toplevel
ab
.myinclude
ab
$ cat > myinclude
cd
cd
$ yacc ip.y && cc -Wall y.tab.c -ly && ./a.out < toplevel
random rule 1
random rule 2
random rule 2
random rule 1
$

Related

How do I use %empty correctly?

I'm new to bison/flex and I am trying to recognize patterns of 1-3 words of input.
My .l recognizes WORD as any series of lowercase characters, treats % and $ characters as tokens of their own ascii value, ignores whitespace, and recognizes everything else as an ERR token.
/*
 * Scanner for the 1-3 word statements: runs of lowercase letters become WORD
 * tokens (yylval.WORD is set to the yytext pointer itself, no copy is made),
 * '%' and '$' are returned as their own character codes, whitespace
 * (newlines included) is discarded, and any other character yields ERR.
 */
MISC [\%\$]
%%
[a-z]+ { yylval.WORD = yytext; return WORD; }
[ \t\r\n]+ {} //ignore whitespace
{MISC} return (int) yytext[0];
. return ERR; //unrecognized input
My .y tries to recognize sequences of 1-3 WORDs separated by the characters % and $. I want it so that even if I input just 1 WORD, I still get a complete statement. I don't include any rules for the ERR token, so that the parser raises a syntax error when an unrecognized character is received as input.
%{
/* Grammar for one statement: a WORD, optionally "% WORD", optionally "$ WORD". */
#include <stdio.h>
int yylex();
/* Error callback: prints the parser's message on stderr. */
void yyerror(char* s){
fprintf(stderr, "%s\n", s);
};
%}
%define api.value.type union
%token <char*> WORD
%nterm <char*> word1 word2 word3
/* ERR appears in no rule, so receiving it is a syntax error. */
%token ERR
%%
/* Prints the three parts, then leaves yyparse() directly via `return 0`. */
statement: word1 word2 word3 { printf("%s, %s, %s\n", $1, $2, $3); return 0; }
;
word1: WORD { $$ = $1; }
;
/* Optional second word; an absent part is reported as "nothing". */
word2: %empty { $$ = "nothing"; }
| '%' WORD { $$ = $2; }
;
/* Optional third word; an absent part is reported as "nothing". */
word3: %empty { $$ = "nothing";}
| '$' WORD { $$ = $2; }
;
%%
My main.c loops on yyparse(). Ideally I'm trying to parse only 1 line of input per iteration.
#include <unistd.h>
#include <stdio.h>
int yyparse();
int yylex();
extern FILE* yyin;
extern void yyrestart();
/*
 * Prompt/parse loop: print a prompt, run the parser once, report "success"
 * when yyparse() returns 0, then restart the scanner on yyin. The loop never
 * terminates on its own.
 */
int main() {
    for (;;) {
        printf("input: ");
        int status = yyparse();
        if (status == 0)
            printf("success\n");
        yyrestart(yyin);
    }
    return 0;
}
However, I am getting adverse output, and I can't explain what's causing it:
In: word word -> Out: word word, nothing, nothing
In: word word word -> Out: word word, nothing, nothing
In: word % word $ word -> Out: word % word $ word, word $ word, word
Additionally, if I input only 1 WORD, my command line hangs until it receives additional input. This additional input could be anything. Even characters that would normally be recognized as an ERR token and invoke a syntax error somehow get by.
In: word. -> Out: word., nothing, nothing
I want my parser to be able to run even if I give it only 1 word of input. I thought that with the inclusion of %empty for subrules word2 and word3 I would get this behavior, but I am not sure what I am doing wrong.

FLEX/YACC program not behaving as expected : can't grab int value from sequence of ints

I am trying to build a parser that takes a list of strings in the following format and performs either an addition or multiplication of all of its elements :
prod 5-6_
sum _
sum 5_
sum 5-6-7_
$
Should print the following to the screen :
prod = 30
sum = 0
sum = 5
sum = 18
What I am actually getting as output is this :
prod = 0
sum = 0
sum = 5
sum = 5
My lex file looks like this :
%{
/*
 * Scanner for the sum/prod list language. Token codes come from y.tab.h.
 * Every action below only returns a token code (or prints a message); none
 * of them assigns a semantic value to yylval.
 */
#include <iostream>
#include "y.tab.h"
using namespace std;
extern "C" int yylex();
%}
%option yylineno
/* Named patterns used by the rules section. */
digit [0-9]
integer {digit}+
operator "sum"|"prod"
%%
{integer} { return number; }
{operator} { return oper; }
"-" { return '-'; }
"_" { return '_'; }
"$" { return '$'; }
\n { ; }
[\t ]+ { ; }
. { cout << "unknown char" << endl; }
%%
and my yacc file looks like this :
/*
 * Grammar for the sum/prod list language: a run of expressions ended by '$'.
 * Each expression is an operator, an optional '-'-separated list of numbers,
 * and a closing '_'.
 */
%token oper
%token number
%token '-'
%token '_'
%token '$'
%start valid
%{
#include <iostream>
#include <string>
#include <cstdio>
#include <cstdlib>
using namespace std;
#define YYSTYPE int
extern FILE *yyin;
extern char yytext[];
extern "C" int yylex();
int yyparse();
extern int yyerror(char *);
/* Current operator: '+' when the operator text starts with 's', '*' otherwise
 * (assigned in the op rule). */
char op;
%}
%%
valid : expr_seq endfile {}
| {}
;
expr_seq : expr {}
| expr_seq expr {}
;
/* NOTE(review): the first alternative prints only the label; the value of
 * `sequence` ($2) is not written out by this action. */
expr : op sequence nl {if (op == '+') cout << "sum = " ; else cout << "prod = ";}
| op nl {if (op == '+') cout << "sum = 0"; else cout <<"prod = 1";}
;
/* Picks the operator from the first character of the scanner's current text. */
op : oper { if (yytext[0] == 's') op = '+'; else op = '*';}
;
/* $$ accumulates the sum or product; the first number is converted from
 * yytext, later ones are taken from the semantic value $3. */
sequence : number { $$ = atoi(yytext);}
| sequence '-' number { if (op == '+') $$ = $1 + $3; else $$ = $1 * $3;}
;
/* The '_' terminator ends the output line. */
nl : '_' { cout << endl;}
;
endfile : '$' {}
;
%%
/*
 * Entry point: parse the file named by the first command-line argument, or
 * standard input when no argument is given.
 *
 * A file that cannot be opened is reported and the program exits with
 * EXIT_FAILURE instead of continuing with a null yyin.
 */
int main(int argc, char *argv[])
{
    ++argv, --argc;   /* skip the program name */
    if (argc > 0) {
        yyin = fopen(argv[0], "r");
        if (yyin == NULL) {
            perror(argv[0]);
            return EXIT_FAILURE;
        }
    } else {
        yyin = stdin;
    }
    yyparse();
    return 0;
}
/*
 * Parser error callback: writes the message and the scanner's current line
 * number to standard error. Always returns 0.
 */
int yyerror(char * msg)
{
    extern int yylineno;   /* maintained by the scanner */

    /* The leading space keeps the message from running into "on line". */
    std::cerr << msg << " on line # " << yylineno << std::endl;
    return 0;
}
My reasoning for the yacc logic is as follows :
a file is valid only if it contains a sequence of expressions followed by the endfile symbol.
a sequence of expressions is a single expression or several expressions.
an expression is either an operator followed by a new line, OR an operator, followed by a list of numbers, followed by a new line symbol.
an operator is either 'sum' or 'prod'
a list of numbers is either a number or several numbers separated by the '-' symbol.
From my perspective this should work, but for some reason it doesn't interpret the sequence of numbers properly after the first element. Any tips would be helpful.
Thanks
You must not use yytext in your yacc actions. yytext is only valid during a scanner action, and the parser often reads ahead to the next token. (In fact, yacc always reads the next token. Bison sometimes doesn't, but it's not always easily predictable.)
You can associate a semantic value with every token (and non-terminal), and you can reference these semantic values using $1, $2, etc. in your yacc actions. You can even associate semantic values of different types to different grammar symbols. And if you use bison -- and you probably are using bison -- you can give grammar symbols names to make it easier to refer to their semantic values.
This is all explained in depth, with examples, in the bison manual.
The solution that worked was simply to change the following lines :
/* Rules as originally posted: the first number is converted from yytext,
 * later numbers are taken from the semantic value $3. */
sequence : number { $$ = atoi(yytext);}
| sequence '-' number { if (op == '+') $$ = $1 + $3; else $$ = $1 * $3;}
;
to this :
/* Revised rules: every number is converted from yytext when its rule is reduced.
 * NOTE(review): this depends on yytext still holding the number's text at that
 * moment, i.e. on the parser not having read the next token yet. */
sequence : number { $$ = atoi(yytext);}
| sequence '-' number { if (op == '+') $$ = $1 + atoi(yytext); else $$ = $1 * atoi(yytext);}
;

GNU Bison outputting error "syntax error, unexpected string, expecting ="

I've been trying to compile my Bison code and it seems to be that something is wrong with my code and yet I just can't figure out why or where.
Here is my bison code, I am running GNU Bison 2.3 on OSX.
The error I am receiving is:
romans.y:9.9-21: syntax error, unexpected string, expecting =
This is an error I do not appear to be receiving on my Linux machine but on the OSX machine
%{
// file created via echo
/* Grammar for lines of ARABIC_NUMERAL tokens; each line's value is printed. */
# include <stdio.h>
# include <stdlib.h>
int yyerror(char *s);
int yylex();
int yyparse();
%}
/* This spelling (no '=') is the line Bison 2.3 rejects with
 * "unexpected string, expecting =". */
%output "roman.tab.c"
%token ARABIC_NUMERAL;
%token EOL
%%
/* Zero or more lines; the value of each line is printed when its EOL arrives. */
calclist: /* nothing */ {}
| calclist arabic_numerals EOL { printf("%d\n", $2); }
;
/* NOTE(review): the second alternative has a single right-hand-side symbol,
 * yet its action refers to $2. */
arabic_numerals: ARABIC_NUMERAL
| ARABIC_NUMERAL { $$ = $$ + $2; }
;
/* ones:
| ONE {$$ = 1;}
| ONE ONE {$$ = 2;}
| ONE ONE ONE {$$ = 3;}
;
fives:
| FOUR {$$ = 4;}
| FIVE {$$ = 5;}
| FIVE ones { $$ = 5 +$2;}
;
tens:
| TEN {$$ = 10;}
| TEN TEN { $$ = 20;}
| TEN TEN TEN { $$ = 30;}
| TEN fives { $$ = 10 + $2}
| NINE { $$ = 9}
;
fifties:
| FIFTY { $$ = 50;}
|
:*/
%%
/*
 * Parser error callback: report the message and terminate the program.
 *
 * The return type is int to agree with the prototype in the grammar's
 * prologue (a void definition conflicts with it). The message goes to
 * stderr and the exit status is non-zero, so callers can detect the failure.
 * The function never returns because exit() does not.
 */
int yyerror(char *s)
{
    fprintf(stderr, "error: %s\n", s);
    exit(EXIT_FAILURE);
}
/* Entry point: run the parser once over its input and exit with status 0. */
int
main()
{
    /* To trace the parser, enable: yydebug = 1; */
    (void)yyparse();
    return 0;
}
I have based my code off a program given to me by my professor, which is the following. When I attempted to compile it myself, I have the exact same issue. Is it a problem with the version of bison on my system?
%{
/* Grammar that accepts lines made of an N token wrapped in any number of
 * balanced OP ... CP pairs. */
# include <stdio.h>
# include <stdlib.h>
void yyerror(char *s);
int yylex();
int yyparse();
%}
%output "brackets.c"
/* NOTE(review): presumably OP/CP are opening/closing brackets and N a number;
 * the scanner that produces them is not shown. */
%token OP CP N EOL
%%
/* Zero or more lines; a message is printed for every line that parses. */
calclist: /* nothing */ {}
| calclist expr EOL { printf("Input conforms to grammar\n"); }
;
//expr: N N N { }
//;
/* Either a bracketed expression or a bare N. */
expr: OP expr CP
| N
;
%%
/* Parser error callback: prints "error: <message>" on standard output and returns. */
void yyerror(char *s)
{
    printf("error: %s\n", s);
}
/* Entry point: run the parser once over its input and exit with status 0. */
int
main()
{
    /* To trace the parser, enable: yydebug = 1; */
    (void)yyparse();
    return 0;
}
You should update your bison version. The one which comes by default on OS X is ancient and lacking many features.
In that version (but not 2.4 or later) the syntax of the %output directive had an equals sign:
%output="roman.tab.c"
You could make that change but then your file won't work on your other machine, or on anybody else's machine, such as the ones at your school. You can also set the output filename when you run the bison command:
bison -d -o roman.tab.c roman.y
which avoids the need for the %output directive and will work on all versions of bison.
But on the whole, upgrading is probably your best option.
Note that updating Bison on macOS can be tricky. The default system Bison in the Xcode toolchain (/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/bison) is at 2.3 as of 10.14 Mojave, which, as @rici explained, does not support the %output "brackets.c" syntax (it expects %output="roman.tab.c", thus the = reference in the error message).
In order to update Bison in a way that it is both in your path and in your compiler's path, you need to forcefully symlink it after installing it via Homebrew (Homebrew requires Java 8 specifically in order to install Bison):
# Install a Java 8 JDK first.
# NOTE(review): the Java 8 requirement is asserted by the surrounding answer; confirm before relying on it.
brew cask install homebrew/cask-versions/adoptopenjdk8 # Homebrew Bison requires Java8
# Install the Homebrew builds of bazel, bison and flex.
brew install bazel bison flex
# So that the system can find the new brew Bison instead of the old system Bison.
brew link bison --force
# Append a PATH entry for Homebrew's bison to ~/.bash_profile so it is found before the system one.
echo 'export PATH="/usr/local/opt/bison/bin:$PATH"' >> ~/.bash_profile
# Point the linker at Homebrew's bison library directory (affects the current shell only).
export LDFLAGS="-L/usr/local/opt/bison/lib"

flex and bison issue for real numbers

I have been using flex and bison for making a small calculator. My files are the following:
bisonFile.y
%{
/*
 * Calculator grammar. No semantic type is declared, so values use yacc's
 * default int type and results are printed with %d.
 */
#include <stdio.h>
%}
/* declare tokens */
%token NUMBER
%token ADD SUB MUL DIV ABS
%token EOL
%%
/* Zero or more lines; each expression's value is printed at its EOL. */
calclist: /* nothing */
| calclist exp EOL { printf("= %d\n", $2); }
;
/* Additive level: left-recursive over factor. */
exp: factor
| exp ADD factor { $$ = $1 + $3; }
| exp SUB factor { $$ = $1 - $3; }
;
/* Multiplicative level: nested below exp, so MUL/DIV are grouped before ADD/SUB. */
factor: term
| factor MUL term { $$ = $1 * $3; }
| factor DIV term { $$ = $1 / $3; }
;
/* A number, or ABS applied to a term (absolute value). */
term: NUMBER
| ABS term { $$ = $2 >= 0? $2 : - $2; }
;
%%
/*
 * Entry point: run the parser once over standard input.
 *
 * The return type is spelled out (implicit int is not valid since C99) and
 * the function returns 0 explicitly. The command-line arguments are unused.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;
    yyparse();
    return 0;
}
/*
 * Parser error callback: prints "error: <message>" on stderr.
 *
 * Returns 0. The return type is spelled out (implicit int is not valid
 * since C99) and a value is actually returned, so a caller that reads the
 * result no longer reads an indeterminate value.
 */
int yyerror(char *s)
{
    fprintf(stderr, "error: %s\n", s);
    return 0;
}
flexFile.l
%{
/*
 * Scanner for the calculator; token codes come from the bison header.
 * The NUMBER pattern accepts an optional fractional part, but its action
 * converts the text with atoi() and stores the result in the int yylval.
 */
# include "f5.tab.h"
int yylval;
%}
/* token recognition and printing */
%{
/* Second file-scope declaration of yylval (the same line appears above). */
int yylval;
%}
%option noyywrap
%%
"+" { return ADD; }
"-" { return SUB; }
"*" { return MUL; }
"/" { return DIV; }
"|" { return ABS; }
[0-9]+("."[0-9]+)? { yylval = atoi(yytext); return NUMBER; } //part added
\n { return EOL; }
[ \t] { /* ignore whitespace */ }
. { printf("Mystery character %c\n", *yytext); }
%%
My program works fine with integer numbers, and it also recognizes real numbers, but the problem is that when I print the results of an operation it always return the answer as an integer number. Why is that?
Thanks
Your use of atoi in the production converts the string to an integer.
Using atof will convert it to a floating point number.
If you want to separate the two, you'll need to change the matching rule for integers, and add one for floating point.
Change "%d" → "%f" in the file “bisonFile.y”. This uses a floating point format for printing the result. The fixed line should read:
| calclist exp EOL { printf("= %f\n", $2); }
In the file “flexFile.l” remove both definitions int yylval. bison outputs
YYSTYPE yylval;
automatically. YYSTYPE is the type of the semantic values. Because you want a floating point calculator, this shall be double. Note that YYSTYPE defaults to int. To change that, YYSTYPE must be defined when compiling the C-codes (from bison and flex) (see below).
Finally, as already stated by MIS, replace atoi() → atof(). The edited line in flexFile.l should read:
[0-9]+("."[0-9]+)? { yylval = atof(yytext); return NUMBER; }
For a novice the dependencies between flex and bison sources might be confusing. A minimal Makefile documents how the example can be compiled. Line 2 sets the semantic type for the scanner and the parser consistently:
calc: calc.o l.o
calc.o l.o: CFLAGS+=-DYYSTYPE=double
l.o: l.c f5.tab.h
# One bison run produces both files. The output name is spelled out so the
# recipe is correct whichever of the two targets make decides to build.
calc.c f5.tab.h: bisonFile.y
	bison -o calc.c --defines=f5.tab.h $<
# $@ is the target (l.c); $< is only the first prerequisite, so the header,
# listed to force ordering, is not passed to flex as scanner input.
l.c: flexFile.l f5.tab.h
	flex -o $@ $<
clean::
	$(RM) calc calc.o calc.c f5.tab.h l.o l.c
That’ll do the trick.

Making YACC output an AST (token tree)

Is it possible to make YACC (or in my case MPPG) output an Abstract Syntax Tree (AST)?
All the stuff I'm reading suggests it's simple to make YACC do this, but I'm struggling to see how you know when to move up a node in the tree as you're building it.
Expanding on Hao's point and from the manual, you want to do something like the following:
Assuming you have your abstract syntax tree with function node which creates an object in the tree:
/* Addition: pass the values of the two operand expressions ($1 and $3) to
 * node() and make its result the value of the whole expression ($$). */
expr : expr '+' expr
{
$$ = node( '+', $1, $3 );
}
This code translates to: "When parsing an expression with a plus, take the left and right descendants $1/$3 and use them as arguments to node. Save the output to $$ (the return value of the expression)."
$$ (from the manual):
To return a value, the action normally
sets the pseudovariable ``$$'' to some
value.
Have you looked at the manual (search for "parse tree" to find the spot)? It suggests putting node creation in an action with your left and right descendants being $1 and $3, or whatever they may be. In this case, yacc would be moving up the tree on your behalf rather than your doing it manually.
The other answers propose modifying the grammar; this isn't doable when working with a C++ grammar (several hundred rules).
Fortunately, we can do it automatically, by redefining the debug macros.
In this code, we redefine YY_SYMBOL_PRINT, which is activated by YYDEBUG:
%{
#include <stdlib.h> /* malloc */
#include <string.h> /* memcpy */

/*
 * Parse-tree node. One node is filled in for every reduction by the
 * YY_SYMBOL_PRINT replacement defined in the `hack` rule below.
 */
typedef struct tree_t {
    struct tree_t **links; /* copies of the right-hand-side values */
    int nb_links;          /* number of entries in links */
    char *str;             /* reset to NULL on every reduction */
    const char *type;      /* the grammar rule */
} tree_t;

#define YYDEBUG 1
//int yydebug = 1;
tree_t *_C_treeRoot;
%}
/* Every semantic value is a tree node, reached as `.t` by the macro below. */
%union { tree_t t; }
%start program
%token IDENTIFIER
%token CONSTANT
%left '+' '-'
%left '*' '/'
%right '^'
%%
program: exprs {
        /* Copy the node off the parser's value stack: a pointer into the
         * stack would dangle once yyparse() returns. */
        _C_treeRoot = malloc(sizeof *_C_treeRoot);
        if (_C_treeRoot != NULL)
            *_C_treeRoot = $<t>1;
    }
|
| hack
;
exprs:
expr ';'
| exprs expr ';'
;
number:
IDENTIFIER
| '-' IDENTIFIER
| CONSTANT
| '-' CONSTANT
;
expr:
number
| '(' expr ')'
| expr '+' expr
| expr '-' expr
| expr '*' expr
| expr '/' expr
| expr '^' expr
;
/*
 * This rule exists only to carry preprocessor directives into the generated
 * parser: its action replaces YY_SYMBOL_PRINT so that the skeleton's debug
 * hook builds a node instead of printing.
 * For the rule being reduced (yyn) the macro records the number of
 * right-hand-side symbols (yyr2[yyn]) and the left-hand-side name
 * (yytname[yyr1[yyn]]), then stores a heap copy of each right-hand-side
 * value taken from the value stack.
 */
hack:
{
// called at each reduction in YYDEBUG mode
#undef YY_SYMBOL_PRINT
#define YY_SYMBOL_PRINT(A,B,C,D) \
do { \
    int n = yyr2[yyn]; \
    int i; \
    yyval.t.nb_links = n; \
    yyval.t.links = malloc(sizeof *yyval.t.links * yyval.t.nb_links); \
    yyval.t.str = NULL; \
    yyval.t.type = yytname[yyr1[yyn]]; \
    for (i = 0; i < n; i++) { \
        yyval.t.links[i] = malloc(sizeof (YYSTYPE)); \
        memcpy(yyval.t.links[i], &yyvsp[(i + 1) - n], sizeof(YYSTYPE)); \
    } \
} while (0)
}
;
%%
#include "lexer.c"
int yyerror(char *s) {
printf("ERROR : %s [ligne %d]\n",s, num_ligne);
return 0;
}
int doParse(char *buffer)
{
mon_yybuffer = buffer;
tmp_buffer_ptr = buffer;
tree_t *_C_treeRoot = NULL;
num_ligne = 1;
mon_yyptr = 0;
int ret = !yyparse();
/////////****
here access and print the tree from _C_treeRoot
***///////////
}
/* Printable name for each token code, indexed by the code itself. */
char *tokenStrings[300] = {NULL};
/* Backing storage for the single-character token names: 256 two-byte
 * strings (the character followed by a terminating NUL). This must be a
 * char buffer; an array of pointers cannot hold the characters. */
char charTokenStrings[512];

/*
 * Fill tokenStrings: codes 0..255 map to a one-character string holding that
 * character, CONSTANT and IDENTIFIER map to their names. Also fills the
 * externally defined space_string buffer with blanks.
 */
void initYaccTokenStrings()
{
    extern char space_string[256];
    int k;

    for (k = 0; k < 256; k++) {
        charTokenStrings[2*k] = (char)k;
        charTokenStrings[2*k+1] = 0;
        tokenStrings[k] = &charTokenStrings[2*k];
    }
    tokenStrings[CONSTANT] = "CONSTANT";
    tokenStrings[IDENTIFIER] = "IDENTIFIER";

    for (k = 0; k < 256; k++) {
        space_string[k] = ' ';
    }
}
The leaves are created just before the return statements in the flex lexer.

Resources