error : unrecognized rule in flex tool - flex-lexer

The error "line 32: unrecognized rule" is reported for this line of my code:
{OCTAL} printf("Kind= OCTAL, Word= %s, LineNumber= %d\n",yytext,LineNum);
and I think the problem is here:
OCTAL "\"(0|[1-3][0-7][0-7]|[1-7][0-7]|[1-7])
this is my code :
%{
int LineNum=1;
int count =0;
%}
%x OneLineComment MultipleLinesComment
DIGIT [0-9]
/* Optionally signed integer without leading zeros. Inside a character class
   '|' is a literal character, so the sign class is [+-], not [+|-]. */
INT [+-]?(0|[1-9][0-9]*)
OCTAL "\"(0|[1-3][0-7][0-7]|[1-7][0-7]|[1-7]) // i think the error is here !!
/* {INT}, a dot, optional fraction digits, then an optional exponent. The
   exponent marker (E or e) is required inside the exponent group so that
   input such as 1.5+3 is not swallowed as a single DOUBLE, and the sign
   class is [+-] because '|' is a literal inside brackets. */
DOUBLE {INT}"."{DIGIT}*([Ee][+-]?[1-9][0-9]*)?
KEYWORD int|double|while|if|else
IDENTIFIER [_A-Za-z]+[_A-Za-z0-9]*
Space " "
Tab "\t"
NewLine "\n"
%%
{INT} printf("Kind= INTEGER, Word= %s, LineNumber= %d\n",yytext,LineNum);
{OCTAL} printf("Kind= OCTAL, Word= %s, LineNumber= %d\n",yytext,LineNum); // the error line
{DOUBLE} printf("Kind= DOUBLE, Word= %s, LineNumber= %d\n",yytext,LineNum);
{KEYWORD} {
    /* Listed before {IDENTIFIER}, so reserved words win when both rules match the same text. */
    printf("Kind= KEYWORD, Word= %s, LineNumber= %d\n",yytext,LineNum);
}
{IDENTIFIER} printf("Kind= IDENTIFIER, Word= %s, LineNumber= %d\n",yytext,LineNum);
"//" BEGIN(OneLineComment);
<OneLineComment>.
<OneLineComment>"\n" {
printf("Kind= OneLineComment, LineNumber= %d\n",LineNum);
printf("Kind= WhiteSpace, Word= NewLine, LineNumber= %d\n",LineNum);
BEGIN(INITIAL);
LineNum++;
}
"/*" { count=LineNum;
BEGIN(MultipleLinesComment);
}
<MultipleLinesComment>.
<MultipleLinesComment>"\n" LineNum++;
<MultipleLinesComment>"*/" {
printf("Kind= MultipleLinesComment, LineNumber= %d,ToLineNumber= %d\n",count,LineNum);
count=0;
BEGIN(INITIAL);
}
{Space} printf("Kind= WhiteSpace, Word= Space, LineNumber= %d\n",LineNum);
{Tab} printf("Kind= WhiteSpace, Word= Tab, LineNumber= %d\n",LineNum);
{NewLine} {
printf("Kind= WhiteSpace, Word= NewLine, LineNumber= %d\n",LineNum);
LineNum++;
}
. printf("Kind= wrongToken, Word= %s, LineNumber= %d\n",yytext,LineNum);
%%
int yywrap()
{
return 1;
}
int main()
{
yylex();
return 0;
}
and when I write it in this way the error goes away :
OCTAL "\\"(0|[1-3][0-7][0-7]|[1-7][0-7]|[1-7])
What am i doing wrong?

\" tells flex to match a "
\\ tells flex to match a \
In your original code the OCTAL definition is invalid because it contains an unterminated string: with only a single backslash, \" escapes the second quotation mark instead of closing the string.
http://en.wikipedia.org/wiki/Escape_sequences_in_C

Related

How do I use %empty correctly?

I'm new to bison/flex and I am trying to recognize patterns of 1-3 words of input.
My .l recognizes WORD as any series of lowercase characters, treats % and $ characters as tokens of their own ascii value, ignores whitespace, and recognizes everything else as an ERR token.
MISC [\%\$]
%%
[a-z]+ { yylval.WORD = yytext; return WORD; }
[ \t\r\n]+ {} //ignore whitespace
{MISC} return (int) yytext[0];
. return ERR; //unrecognized input
My .y tries to recognize sequences of 1-3 WORDS separated by the characters % and $. I want it so that even if I input just 1 WORD, I still get a complete statement. I deliberately don't include any rules for the ERR token, so that the parser raises a syntax error when an unrecognized character is received as input.
%{
#include <stdio.h>
int yylex();
void yyerror(char* s){
fprintf(stderr, "%s\n", s);
};
%}
%define api.value.type union
%token <char*> WORD
%nterm <char*> word1 word2 word3
%token ERR
%%
statement: word1 word2 word3 { printf("%s, %s, %s\n", $1, $2, $3); return 0; }
;
word1: WORD { $$ = $1; }
;
word2: %empty { $$ = "nothing"; }
| '%' WORD { $$ = $2; }
;
word3: %empty { $$ = "nothing";}
| '$' WORD { $$ = $2; }
;
%%
My main.c loops on yyparse(). Ideally I'm trying to parse only 1 line of input per iteration.
#include <unistd.h>
#include <stdio.h>
int yyparse();
int yylex();
extern FILE* yyin;
extern void yyrestart();
int main() {
while(1) {
printf("input: ");
if (yyparse() == 0) {
printf("success\n");
};
yyrestart(yyin);
}
return 0;
}
However, I am getting adverse output, and I can't explain what's causing it:
In: word word -> Out: word word, nothing, nothing
In: word word word -> Out: word word, nothing, nothing
In: word % word $ word -> Out: word % word $ word, word $ word, word
Additionally, if I input only 1 WORD, my command line hangs until it receives additional input. This additional input could be anything. Even characters that would normally be recognized as an ERR token and invoke a syntax error somehow get by.
In: word. -> Out: word., nothing, nothing
I want my parser to be able to run even if I give it only 1 word of input. I thought that with the inclusion of %empty for subrules word2 and word3 I would get this behavior, but I am not sure what I am doing wrong.

FLEX/YACC program not behaving as expected : can't grab int value from sequence of ints

I am trying to build a parser that takes a list of strings in the following format and performs either an addition or multiplication of all of its elements :
prod 5-6_
sum _
sum 5_
sum 5-6-7_
$
Should print the following to the screen :
prod = 30
sum = 0
sum = 5
sum = 18
What I am actually getting as output is this :
prod = 0
sum = 0
sum = 5
sum = 5
My lex file looks like this :
%{
#include <iostream>
#include "y.tab.h"
using namespace std;
extern "C" int yylex();
%}
%option yylineno
digit [0-9]
integer {digit}+
operator "sum"|"prod"
%%
{integer} { return number; }
{operator} { return oper; }
"-" { return '-'; }
"_" { return '_'; }
"$" { return '$'; }
\n { ; }
[\t ]+ { ; }
. { cout << "unknown char" << endl; }
%%
and my yacc file looks like this :
%token oper
%token number
%token '-'
%token '_'
%token '$'
%start valid
%{
#include <iostream>
#include <string>
#include <cstdio>
#include <cstdlib>
using namespace std;
#define YYSTYPE int
extern FILE *yyin;
extern char yytext[];
extern "C" int yylex();
int yyparse();
extern int yyerror(char *);
char op;
%}
%%
valid : expr_seq endfile {}
| {}
;
expr_seq : expr {}
| expr_seq expr {}
;
expr : op sequence nl {if (op == '+') cout << "sum = " ; else cout << "prod = ";}
| op nl {if (op == '+') cout << "sum = 0"; else cout <<"prod = 1";}
;
op : oper { if (yytext[0] == 's') op = '+'; else op = '*';}
;
sequence : number { $$ = atoi(yytext);}
| sequence '-' number { if (op == '+') $$ = $1 + $3; else $$ = $1 * $3;}
;
nl : '_' { cout << endl;}
;
endfile : '$' {}
;
%%
int main(int argc, char *argv[])
{
    // Skip the program name so that argv[0] is the first real argument.
    ++argv, --argc;
    if (argc > 0) {
        yyin = fopen(argv[0], "r");
        if (yyin == NULL) {
            // Report an unreadable input file instead of starting the parser
            // without the requested input.
            perror(argv[0]);
            return 1;
        }
    } else {
        // No file given: read the expressions from standard input.
        yyin = stdin;
    }
    yyparse();
    return 0;
}
int yyerror(char * msg)
{
    // Line counter kept by the scanner (the lex file sets %option yylineno).
    extern int yylineno;
    // The leading space keeps the parser's message from running into "on line".
    cerr << msg << " on line # " << yylineno << endl;
    return 0;
}
My reasoning for the yacc logic is as follows :
a file is valid only if it contains a sequence of expressions followed by the endfile symbol.
a sequence of expressions is a single expression or several expressions.
an expression is either an operator followed by a new line, OR an operator, followed by a list of numbers, followed by a new line symbol.
an operator is either 'sum' or 'prod'
a list of numbers is either a number or several numbers separated by the '-' symbol.
From my perspective this should work, but for some reason it doesn't interpret the sequence of numbers properly after the first element. Any tips would be helpful.
Thanks
You must not use yytext in your yacc actions. yytext is only valid during a scanner action, and the parser often reads ahead to the next token. (In fact, yacc always reads the next token. Bison sometimes doesn't, but it's not always easily predictable.)
You can associate a semantic value with every token (and non-terminal), and you can reference these semantic values using $1, $2, etc. in your yacc actions. You can even associate semantic values of different types to different grammar symbols. And if you use bison -- and you probably are using bison -- you can give grammar symbols names to make it easier to refer to their semantic values.
This is all explained in depth, with examples, in the bison manual.
The solution that worked was simply to change the following lines :
sequence : number { $$ = atoi(yytext);}
| sequence '-' number { if (op == '+') $$ = $1 + $3; else $$ = $1 * $3;}
;
to this :
sequence : number { $$ = atoi(yytext);}
| sequence '-' number { if (op == '+') $$ = $1 + atoi(yytext); else $$ = $1 * atoi(yytext);}
;

yacc/lex parser not catching certain terminals

I built a scanner->parser meant to catch modified Java. When testing it, I noticed that codeBlock never triggers but varDecls triggers. I'm not entirely sure why this happens.
Here is my parser
%{
#include <stdio.h>
extern int yylex(void);
void yyerror(char *s) {
fprintf(stderr, "error: %s\n", s);
}
int yywrap(){ return 1;}
%}
%token INT FLOAT STRING
%union {
int ival;
float fval;
char *sval;
int bval;
}
%token <ival> NegInt
%token <ival> Int
%token <fval> Float
%token <bval> Bool
%token <sval> ifHeader
%token <sval> thenHeader
%token <sval> elseHeader
%token <sval> forHeader
%token <sval> whileHeader
%token <sval> ID
%token <sval> BinOperator
%token <sval> BoolOperator
%token <sval> Assignment
%token <sval> Quotation
%token <sval> LBracket
%token <sval> RBracket
%token <sval> LFBracket
%token <sval> RFBracket
%token <sval> Semi
%token <sval> LABracket
%token <sval> RABracket
%token <sval> Comma
%token <sval> String
%%
codeBlock: varDecls {printf("why doth this triggering?\n");}
| ifExprs {printf("codeBlock Statement \n");}
;
ifExprs: ifExprs ifStmt
| ifStmt
;
ifStmt: ifExpr | ifExprElse;
ifExprElse: ifExpr elseExpr;
ifExpr: ifHeader LBracket boolExpr RBracket thenExpr;
thenExpr: thenHeader LFBracket varDecls RFBracket;
elseExpr: elseHeader LFBracket varDecls RFBracket;
varDecls: varDecls varDecl
| varDecl
;
varDecl: ID Assignment numStmt Semi
| ID Assignment strExpr Semi
| ID Assignment boolExpr Semi
| ID Assignment Bool Semi {printf("why is this triggering?\n");}
;
boolExpr: ID BoolOperator ID
| ID BoolOperator numExpr
| ID BoolOperator Bool
;
strExpr: Quotation ID Quotation
;
numStmt: numStmt BinOperator numExpr
| numExpr
;
numExpr: LBracket numStmt RBracket
| Int
| Float
| NegInt
;
%%
int main(int argc, char* argv[]) {
yyparse();
}
Here is my scanner:
%{
#include "y.tab.h"
extern YYSTYPE yylval;
%}
%option yylineno
Digit [0-9]
Letter [a-zA-Z]
Word [a-z][a-zA-Z0-9_]*
%%
"-"{Digit}+ {
//printf("\n An assignment: %s \n", yytext);
yylval.ival = atoi(yytext);
return NegInt;
}
{Digit}+ {
//printf("\n An assignment: %s \n", yytext);
yylval.ival = atoi(yytext);
return Int;
}
{Digit}+"."{Digit}+ {
    //printf("\n A float: %s (%f)\n", yytext, atof(yytext));
    /* Float is declared as %token <fval>, so the value belongs in the fval
       member; assigning through ival truncated it to an integer and stored
       it in the wrong union member. */
    yylval.fval = atof(yytext);
    return Float;
}
True|False {
    //printf("\n A Boolean: %s \n", yytext);
    /* atoi() on "True" or "False" always yields 0. Only those two spellings
       reach this action, so the first character decides the value. */
    yylval.bval = (yytext[0] == 'T');
    return Bool;
}
if {
//printf("\n An assignment: %s \n", yytext);
yylval.sval = yytext;
return ifHeader;
}
then {
//printf("\n An assignment: %s \n", yytext);
yylval.sval = yytext;
return thenHeader;
}
else {
//printf("\n An assignment: %s \n", yytext);
yylval.sval = yytext;
return elseHeader;
}
for {
//printf("\n An assignment: %s \n", yytext);
yylval.sval = yytext;
return forHeader;
}
while {
//printf("\n An assignment: %s \n", yytext);
yylval.sval = yytext;
return whileHeader;
}
{Word} {
//printf("\n An Identifier: %s \n", yytext);
yylval.sval = yytext;
return ID;
}
"'" {
//printf("\n An Identifier: %s \n", yytext);
yylval.sval = yytext;
return Quotation;
}
"+"|"-"|"*"|"/" {
//printf("\n An Operator: %s \n", yytext);
yylval.sval = yytext;
return BinOperator;
}
"<"|">"|"!="|"<="|">="|"==" {
//printf("\n An comparison: %s \n", yytext);
yylval.sval = yytext;
return BoolOperator;
}
"=" {
//printf("\n An assignment: %s \n", yytext);
yylval.sval = yytext;
return Assignment;
}
"(" {
//printf("\n An assignment: %s \n", yytext);
yylval.sval = yytext;
return LBracket;
}
")" {
//printf("\n An assignment: %s \n", yytext);
yylval.sval = yytext;
return RBracket;
}
"{" {
//printf("\n An assignment: %s \n", yytext);
yylval.sval = yytext;
return LFBracket;
}
"}" {
//printf("\n An assignment: %s \n", yytext);
yylval.sval = yytext;
return RFBracket;
}
";" {
//printf("\n An assignment: %s \n", yytext);
yylval.sval = yytext;
return Semi;
}
"[" {
//printf("\n An assignment: %s \n", yytext);
yylval.sval = yytext;
return LABracket;
}
"]" {
//printf("\n An assignment: %s \n", yytext);
yylval.sval = yytext;
return RABracket;
}
"," {
//printf("\n An assignment: %s \n", yytext);
yylval.sval = yytext;
return Comma;
}
%
As you can see with my print statements, I was trying to trigger my parser to print out "why doth this triggering", but varDecls is triggered instead.
Additionally, varDecl also triggers. Is that supposed to happen?
Help would be very appreciated. Thank you

ANTLR4 parser detection

This is my first try with an ANTLR4-grammar. It should recognize a very easy statement, starting with the command 'label', followed by a colon, then an arbitrary text, ended by semicolon. But the parser does not recognize 'label' as description. Why?
grammar test;
prog: stat+;
stat:
description content
;
description:
'label' COLON
;
content:
TEXT
;
TEXT:
.*? ';'
;
STRING : '"' ('""'|~'"')* '"' ; // quote-quote is an escaped quote
COMMENT
: '//' (~('\n'|'\r'))*
;
COLON : ':' ;
ID: [a-zA-Z]+; // letters only; the range A-z would also admit [ \ ] ^ _ and the backtick
INT: [0-9]+;
NEWLINE: '\r'? '\n';
WS : [ \t\n\r]+ -> skip ;
An example for the code:
label:
this is an error;
wronglabel:YYY
this should be a error;
The error is:
line 1:0 mismatched input 'label: \nthis is an error;' expecting 'label'
(prog label: \nthis is an error; \n\n\nwronglabel:YYY\nthis should be a error; \n)
This works much better:
grammar test;
prog: stat+;
stat:
description content
;
description:
'label' COLON
;
content:
text
;
text:
.*? ';'
;
STRING : '"' ('""'|~'"')* '"' ; // quote-quote is an escaped quote
COMMENT
: '//' (~('\n'|'\r'))*
;
COLON : ':' ;
ID: [a-zA-Z]+; // letters only; the range A-z would also admit [ \ ] ^ _ and the backtick
NEWLINE: '\r'? '\n';
WS : [ \t\n\r]+ -> skip ;
It seems I mixed up lexer and parser rules:
lexer rule names have to start with an uppercase letter,
parser rule names with a lowercase letter.
So I changed the TEXT rule (a lexer rule) into a text rule (a parser rule).

can't find a simple error when parsing with YACC

I'm trying to make a very simple YACC parser for the Pascal language which just includes integer declarations, some basic expressions and if-else statements. However, I haven't been able to find the error for hours and it is driving me crazy. The terminal says "Error at line:0", but that seems impossible! I use flex and byacc for the parser. I will be very glad if you can help me. This is my lex file:
%{
#include <stdio.h>
#include <string.h>
#include "y.tab.h"
extern int yylval;
int linenum=0;
%}
digit [0-9]
letter [A-Za-z]
%%
if return IF;
then return THEN;
else return ELSE;
for return FOR;
while return WHILE;
PROGRAM return PROGRAM_SYM;
BEGIN return BEGIN_SYM;
VAR return VAR_SYM;
END return END_SYM;
INTEGER return INTEGER_SYM;
{letter}({letter}|{digit})* return identifier;
[0-9]+ return NUMBER;
[\<][\=] return CON_LE;
[\>][\=] return CON_GE;
[\=] return CON_EQ;
[\:][\=] return ASSIGNOP;
; return semiColon;
, return comma;
\n {linenum++;}
. return (int) yytext[0];
%%
and this is my Yacc file
%{
#include <stdio.h>
#include <string.h>
#include "y.tab.h"
extern FILE *yyin;
extern int linenum;
%}
%token PROGRAM_SYM VAR_SYM BEGIN_SYM END_SYM INTEGER_SYM NUMBER
%token identifier INTEGER ASSIGNOP semiColon comma THEN
%token IF ELSE FOR WHILE
%token CON_EQ CON_LE CON_GE GE LE
%left '*' '/'
%left '+' '-'
%start program
%%
program: PROGRAM_SYM identifier semiColon VAR_SYM dec_block BEGIN_SYM statement_list END_SYM '.'
;
dec_block:
dec_list semiColon;
dec_list:
dec_list dec
|
dec
;
dec:
int_dec_list
;
int_dec_list:
int_dec_list int_dec ':' type
|
int_dec ':' type
;
int_dec:
int_dec comma identifier
|
identifier
;
type:
INTEGER_SYM
;
statement_list:
statement_list statement
|
statement
;
statement:
assignment_list
|
expression_list
|
selection_list
;
assignment_list:
assignment_list assignment
|
assignment
;
assignment:
identifier ASSIGNOP expression_list
;
expression_list:
expression_list expression semiColon
|
expression semiColon
;
expression:
'(' expression ')'
|
expression '*' expression
|
expression '/' expression
|
expression '+' expression
|
expression '-' expression
|
factor
;
factor:
identifier
|
NUMBER
;
selection_list:
selection_list selection
|
selection
;
selection:
IF '(' logical_expression ')' THEN statement_list ELSE statement_list
;
logical_expression:
logical_expression '=' expression
|
logical_expression '>' expression
|
logical_expression '<' expression
;
%%
void yyerror(char *s){
fprintf(stderr,"Error at line: %d\n",linenum);
}
int yywrap(){
return 1;
}
int main(int argc, char *argv[])
{
    /* Parse the file named by the first command-line argument, then quit. */
    if (argc < 2) {
        /* Without an argument argv[1] is NULL and must not be passed to fopen(). */
        fprintf(stderr, "usage: parser <source-file>\n");
        return 1;
    }
    yyin = fopen(argv[1], "r");
    if (yyin == NULL) {
        /* Without this check a failed open goes unreported and the
           requested file is never parsed. */
        perror(argv[1]);
        return 1;
    }
    yyparse();
    fclose(yyin);
    return 0;
}
Finally, I get an error at the first line when I give it this input:
PROGRAM myprogram;
VAR
i:INTEGER;
i3:INTEGER;
j:INTEGER;
BEGIN
i := 3;
j := 5;
i3 := i+j*2;
i := j*20;
if(i>j)
then i3 := i+50+(45*i+(40*j));
else i3 := i+50+(45*i+(40*j))+i+50+(45*i+(30*j));
END.
For debugging grammars, YYDEBUG is your friend. Either stick #define YYDEBUG 1 in the %{..%} in the top of your .y file, or compile with -DYYDEBUG, and stick a yydebug = 1; in main before calling yyparse and you'll get a slew of info about what tokens the parser is seeing and what it is doing with them....
Your lexical analyzer returns blanks and tabs as tokens, but the grammar doesn't recognize them.
Add a lexer rule:
[ \t\r] { }
This gets you to line 6 instead of line 0 before you run into an error. You get that error because you don't allow semicolons between declarations:
dec_block:
dec_list semiColon;
dec_list:
dec_list dec
|
dec
;
dec:
int_dec_list
;
That should probably be:
dec_block:
dec_block dec
|
dec
;
dec:
int_dec_list semiColon
;
Doing this gets you to line 14 in the input.
Incidentally, one of the first things I did was to ensure that the lexical analyzer tells me what it is doing, by modifying the rules like this:
if { printf("IF\n"); return IF; }
In long term code, I'd make that diagnostic output selectable at run-time.
You have a general problem with where you expect semicolons. It is also not clear that you should allow an expression_list in the rule for statement (or, maybe, 'not yet' — that might be appropriate when you have function calls, but allowing 3 + 2 / 4 as a 'statement' is not very helpful).
This grammar gets to the end of the input:
%{
#include <stdio.h>
#include <string.h>
#include "y.tab.h"
extern FILE *yyin;
extern int linenum;
%}
%token PROGRAM_SYM VAR_SYM BEGIN_SYM END_SYM INTEGER_SYM NUMBER
%token identifier INTEGER ASSIGNOP semiColon comma THEN
%token IF ELSE FOR WHILE
%token CON_EQ CON_LE CON_GE GE LE
%left '*' '/'
%left '+' '-'
%start program
%%
program: PROGRAM_SYM identifier semiColon VAR_SYM dec_block BEGIN_SYM statement_list END_SYM '.'
;
dec_block:
dec_block dec
|
dec
;
dec:
int_dec_list semiColon
;
int_dec_list:
int_dec_list int_dec ':' type
|
int_dec ':' type
;
int_dec:
int_dec comma identifier
|
identifier
;
type:
INTEGER_SYM
;
statement_list:
statement_list statement
|
statement
;
statement:
assignment
|
selection
;
assignment:
identifier ASSIGNOP expression semiColon
;
expression:
'(' expression ')'
|
expression '*' expression
|
expression '/' expression
|
expression '+' expression
|
expression '-' expression
|
factor
;
factor:
identifier
|
NUMBER
;
selection:
IF '(' logical_expression ')' THEN statement_list ELSE statement_list
;
logical_expression:
expression '=' expression
|
expression '>' expression
|
expression '<' expression
;
%%
void yyerror(char *s){
fprintf(stderr,"Error at line: %d\n",linenum);
}
int yywrap(){
return 1;
}
int main(int argc, char *argv[])
{
    /* Parse the file named by the first command-line argument, then quit. */
    if (argc < 2) {
        /* Without an argument argv[1] is NULL and must not be passed to fopen(). */
        fprintf(stderr, "usage: parser <source-file>\n");
        return 1;
    }
    yyin = fopen(argv[1], "r");
    if (yyin == NULL) {
        /* Without this check a failed open goes unreported and the
           requested file is never parsed. */
        perror(argv[1]);
        return 1;
    }
    yyparse();
    fclose(yyin);
    return 0;
}
Key changes include removing assignment_list and expression_list, and modifying logical_expression so that the two sides of the expansion are expression, rather than the LHS being logical_expression (which then never had a primitive definition, leading to the problems with warnings).
There are still issues to resolve; the statement_list in the selection should be more restrictive to accurately reflect the grammar of Pascal. (You need a block there: either a single statement or a BEGIN, statement list, END.)

Resources