Cannot identify the error yylex() produces - flex-lexer

I'm trying to write a .l file for a Pascal-like language. When I compile it with g++ and run it, it crashes after roughly the 20th step of the scanning process; this happens with different input files, one with more definitions in it and one with fewer. I tried to get the error, but it only prints three zeros. Did I miss something somewhere?
This is the Utile.h file
#include <map>
#include <iterator>
#include <vector>
#include <iostream>
#include <fstream>
#include <string>
using namespace std;
// Global table of strings; presumably the symbol table ("TS") - TODO confirm.
std::vector<std::string> TS;

// Fixed-capacity list of (code, posTS) pairs.
struct FIP {
    int n;            // number of rows of elem currently in use
    int elem[20][2];  // elem[i][0] holds the code, elem[i][1] the posTS of entry i
};

// Appends the pair (code, posTS) to f and increments f.n.
// No capacity check is made: the caller must keep f.n below the 20 rows of elem.
void addFIP(int code, int posTS, FIP& f){
    int* const row = f.elem[f.n];
    row[0] = code;
    row[1] = posTS;
    ++f.n;
}
// Global FIP instance and the next position counter used by the scanner actions.
FIP fip;
int pozTS=0;

// Writes every entry of f to the file "FIP.txt", one "code posTS" pair per line,
// and prints the number of entries to standard output.
void printFIP(FIP& f){
    std::ofstream fipFile("FIP.txt");
    std::cout<<"nr elem fip"<<f.n<<std::endl;
    for(int i=0;i<f.n;i++) {
        // Index row i explicitly: f.elem[0] / f.elem[1] alone are whole rows, and
        // streaming them would print addresses instead of the stored values.
        fipFile<<f.elem[i][0]<<" "<<f.elem[i][1]<<'\n';
    }
    fipFile.close();
}
And this is my specs.l file
%{
#include "Utile.h"
%}
%option noyywrap
%option caseless
LETTER [A-Za-z]
NR_ZEC [0-9]
NR_NZ [1-9]
ZERO [0]
ID {LETTER}({LETTER}|{NR_ZEC})*
NR_BASE10 {NR_NZ}+{NR_ZEC}*|{ZERO}
NR_REAL {NR_BASE10}"."{NR_BASE10}*
DELIMIT [;.,:]
SIR_CAR [\"][^\n]*[\"]
CARACTER "'"[^\n]"'"
ERR_NR_START [0-9]+[a-zA-Z0-9]*
DOT "\."
COLON "\:"
SEMICOLON "\;"
COMMA "\,"
PLUS "\+"
%%
[ \t\n]
[0-9]+[a-zA-Z]+[a-zA-Z0-9]* {printf("Eroare - identificator incepe cu cifra %s \n", yytext);}
read {addFIP(19,-1,fip);printf("%s\n", yytext);}
write {addFIP(20,-1,fip);printf("%s\n", yytext);}
then {addFIP(21,-1,fip);printf("%s\n", yytext);}
variabiles {addFIP(22,-1,fip);printf("%s\n", yytext);}
"=" {addFIP(200,-1,fip);printf("%s\n", yytext);}
\( {addFIP(101,-1,fip);printf("%s\n", yytext);}
\) {addFIP(102,-1,fip);printf("%s\n", yytext);}
\; {addFIP(103,-1,fip);printf("%s\n", yytext);}
\, {addFIP(104,-1,fip);printf("%s\n", yytext);}
\. {addFIP(105,-1,fip);printf("%s\n", yytext);}
\: {addFIP(106,-1,fip);printf("%s\n", yytext);}
"+" {addFIP(300,-1,fip);printf("%s", yytext);}
\- {addFIP(301,-1,fip);printf("%s", yytext);}
integer {addFIP(29,-1,fip);printf("%s", yytext);}
real {addFIP(30,-1,fip);printf("%s", yytext);}
{ID} {addFIP(0,pozTS++,fip);printf("%s\n", yytext);}
{NR_BASE10} {
addFIP(1,pozTS++,fip);
printf("\n%d\n", 1);
}
{NR_REAL} {
addFIP(1,pozTS++,fip);
printf("\n%d\n", 1);
}
"'"[^\n]"'" {
addFIP(1,pozTS++,fip);
printf("\n%d\n", 1);
}
{SIR_CAR} {addFIP(1,pozTS++,fip);printf("\n%d\n", 1);}
. printf("Error %s\n", yytext);
%%
/* Error callback: writes the message s, followed by a newline, to stderr. */
void yyerror (char const *s) {
    fputs(s, stderr);
    fputc('\n', stderr);
}
extern FILE *yyin;

/*
 * Entry point: scans the file named by the first command-line argument.
 * Returns 0 after a complete scan, 1 when no file name was given or the
 * file could not be opened.
 */
int main(int argc, char *argv[])
{
    if (argc < 2) {
        fprintf(stderr, "usage: %s <input-file>\n", argc > 0 ? argv[0] : "scanner");
        return 1;
    }
    yyin = fopen(argv[1], "r");
    if (yyin == NULL) {
        /* Without this check yylex() would be handed a null stream. */
        perror(argv[1]);
        return 1;
    }
    yylex();
    /* Debug aid kept from the original: show the scanner's last text buffer. */
    std::cout<<yytext;
    fclose(yyin);
    return 0;
}
I chose to print yytext hoping that it would help me figure out where the problem is, but no luck.
Also, if it helps, I run it this way:
flex specs.l
g++ lex.yy.c
a.exe test.txt

Your FIP structure only has room for 20 entries and addFIP doesn't check to see if it is full before adding a new one. So after about 20 tokens you will start overwriting random memory.
Since you are using C++, why don't you just use a std::vector? You can just emplace_back the tokens, and you don't even need to keep track of how many there are since std::vector takes care of all the bookkeeping.
Having said that, there are very few reasons to create a vector of tokens. Usually you can just process the tokens one at a time.

Related

flex - Simple parser gives error: fatal flex scanner internal error--end of buffer missed

I'm trying to implement a simple parser that calculates addition, subtraction, multiplication and division using fractional numbers. Fractional numbers in this form: nominatorfdenominator like this 2f3 4f6 9f4 etc. Parser should run on REPL mode. To compile and run:
lex lexer.l
yacc -d parser.y
cc lex.yy.c y.tab.c -lm -o main
./main
flex code:
%{
#include "y.tab.h"
extern YYSTYPE yylval;
#include <math.h>
void to_int(char* num, int* arr);
%}
IDENTIFIER_ERROR [0-9][a-zA-Z0-9_]*
COMMENT ";;".*
VALUESTR \"(.*?)\"
%%
[ \t\f\v\n] { ; }
exit { return KW_EXIT; }
"+" { return OP_PLUS; }
"-" { return OP_MINUS; }
"/" { return OP_DIV; }
"*" { return OP_MULT; }
"(" { return OP_OP; }
")" { return OP_CP; }
(0)|([1-9]+"f"[1-9]*) { to_int(yytext, yylval.INT_ARR); return VALUEF; }
[a-zA-Z_]([a-zA-Z0-9_]*) { strcpy(yylval.STR, yytext); return IDENTIFIER; }
{COMMENT} { printf("%s: COMMENT\n", yytext); }
{IDENTIFIER_ERROR} { printf("%s: SYNTAX ERROR\n", yytext); exit(1); }
. { printf("%s: SYNTAX ERROR\n", yytext); exit(1); }
%%
// Takes a fractional number as a string ("<nominator>f<denominator>", e.g. "2f3")
// and converts it to: arr[0] = nominator, arr[1] = denominator.
// num: NUL-terminated text of the number; arr: receives the two integers.
void to_int(char* num, int* arr) {
char* nominator, *denominator;
// NOTE(review): nominator has not been pointed at any storage at this point, so
// this strcpy writes through an uninitialised pointer.
strcpy(nominator, num); // nominator contains whole number for now
// Split at the first 'f'; strtok_r stores the position of the remainder in denominator.
strtok_r(nominator, "f", &denominator);
//printf ("lex: NUMS parsed as: %s %s\n", nominator, denominator);
arr[0] = atoi(nominator);
arr[1] = atoi(denominator);
//printf("lex: nom: %d denom: %d\n", arr[0], arr[1]);
}
/* End-of-input hook called by the generated scanner; always returns 1.
   Per the flex manual, a non-zero result means there is no further input. */
int yywrap(){
return 1;
}
yacc file:
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
int yylex(void);
void yyerror(char *str);
void fractional_divide(int* num1, int* num2, int* RESULTF);
void fractional_multiply(int* num1, int* num2, int* RESULTF);
void fractional_sub(int* num1, int* num2, int* RESULTF);
void fractional_sum(int* num1, int* num2, int* RESULTF);
%}
%token KW_EXIT
%token OP_PLUS OP_MINUS OP_DIV OP_MULT OP_OP OP_CP OP_COMMA
%union{
int INTEGER;
int INT_ARR[2];
char STR[20];
};
%start START
%type<INT_ARR> EXP
%token <INT_ARR> VALUEF
%%
START : EXPLIST ;
EXPLIST : EXP | EXPLIST EXP ;
EXP: OP_OP OP_PLUS EXP EXP OP_CP { fractional_sum($3, $4, $$); printf("> %d%c%d\n", $$[0], 'f', $$[1]); }
| OP_OP OP_MINUS EXP EXP OP_CP { fractional_sub($3, $4, $$); printf("> %d%c%d\n", $$[0], 'f', $$[1]); }
| OP_OP OP_DIV EXP EXP OP_CP { fractional_divide($3, $4, $$); printf("> %d%c%d\n", $$[0], 'f', $$[1]); }
| OP_OP OP_MULT EXP EXP OP_CP { fractional_multiply($3, $4, $$); printf("> %d%c%d\n", $$[0], 'f', $$[1]); }
| VALUEF { $$[0] = $1[0]; $$[1] = $1[1];}
| KW_EXIT { printf("exiting...\n"); return 0; }
;
%%
/*
 * Rewrites both fractions in place so that they share one denominator.
 * num1 and num2 each point to two ints: [0] = numerator, [1] = denominator.
 * On return both denominators equal the product of the two original
 * denominators and each fraction keeps its original value.
 */
void equalize_denominators(int* num1, int* num2) {
    /* Capture both denominators first: scaling num1 changes num1[1], and
       num2 must be scaled by the ORIGINAL denominator of num1. */
    const int denom1 = num1[1];
    const int denom2 = num2[1];
    num1[0] *= denom2;
    num1[1] *= denom2;
    num2[0] *= denom1;
    num2[1] *= denom1;
}
/*
 * Adds two fractions ([0] = numerator, [1] = denominator) and stores the
 * result in RESULTF. When the denominators differ, equalize_denominators is
 * called first, which rewrites both operands in place. The result takes its
 * denominator from num2.
 */
void fractional_sum(int* num1, int* num2, int* RESULTF) {
    const int same_denominator = (num1[1] == num2[1]);
    if (!same_denominator) {
        equalize_denominators(num1, num2);
    }
    RESULTF[0] = num1[0] + num2[0];
    RESULTF[1] = num2[1];
}
/*
 * Subtracts num2 from num1 ([0] = numerator, [1] = denominator) and stores
 * the result in RESULTF. When the denominators differ, equalize_denominators
 * is called first, which rewrites both operands in place. The result takes
 * its denominator from num2.
 */
void fractional_sub(int* num1, int* num2, int* RESULTF) {
    const int same_denominator = (num1[1] == num2[1]);
    if (!same_denominator) {
        equalize_denominators(num1, num2);
    }
    RESULTF[0] = num1[0] - num2[0];
    RESULTF[1] = num2[1];
}
/*
 * Divides num1 by num2 ([0] = numerator, [1] = denominator) by multiplying
 * num1 with the reciprocal of num2; the result is stored in RESULTF and is
 * not reduced. The operands are left unchanged.
 */
void fractional_divide(int* num1, int* num2, int* RESULTF) {
    const int numerator = num1[0] * num2[1];
    RESULTF[0] = numerator;
    const int denominator = num1[1] * num2[0];
    RESULTF[1] = denominator;
}
/*
 * Multiplies two fractions ([0] = numerator, [1] = denominator) component by
 * component; the result is stored in RESULTF and is not reduced. The
 * operands are left unchanged.
 */
void fractional_multiply(int* num1, int* num2, int* RESULTF) {
    const int numerator = num1[0] * num2[0];
    RESULTF[0] = numerator;
    const int denominator = num1[1] * num2[1];
    RESULTF[1] = denominator;
}
/* Parser error callback: prints the message, prefixed with "yyerror: ", to stdout. */
void yyerror(char *str) {
    fprintf(stdout, "yyerror: %s\n", str);
}
/*
 * Entry point. The program accepts no command-line arguments: any extra
 * argument prints an error and exits with status 1; otherwise the parser is
 * run on standard input.
 */
int main(int argc, char *argv[]){
    if (argc != 1) {
        printf("Input error. Exiting...\n");
        exit(1);
    }
    yyparse();
    return 0;
}
sample output, first line is ok, but when I hit the enter after second line I get this error:
(+ 2f3 1f3)
result: 3f3
(* 2f1 2f6)
result: 4f6
fatal flex scanner internal error--end of buffer missed
That error message can occur in some specific circumstances involving the use of yymore() in the last token in the input, but probably the most common cause is memory corruption, which is what you've managed to do here.
It's likely that the issue is in to_int, where you do a strcpy whose destination is an uninitialised pointer:
void to_int(char* num, int* arr) {
char* nominator, *denominator;
strcpy(nominator, num); // FIXME nominator is uninitialised
It's actually not clear to me why you feel the need to make a copy of the argument num, since you are calling it with yytext. You're free to modify the contents of yytext as long as you don't write outside of its memory area. (The variable yyleng tells you how long yytext is.) Since strtok does not modify its argument outside of the contents area, it's safe to apply to yytext. But if you are going to copy num, you obviously have to copy it to an actual validly initialized pointer; otherwise chaos will ensue. (Which could include triggering random flex error messages.)
I didn't check your code thoroughly nor did I attempt to run it, so there may be other errors. In particular, I did notice a couple of problems with your token patterns:
(0)|([1-9]+"f"[1-9]*) does not allow 10f2 or 2f103, since you only allow integers written with digits 1 through 9. It also allows 2f, whose meaning is opaque to me, and your to_int function could blow up on it. (At best, it would end up with a denominator of 0, which is also an error.) I'd recommend using two patterns, one for integers and the other for fractions:
0|[1-9][0-9]*                { /* Plain integer n: stored as the fraction n/1. */
                               yylval.INT_ARR[0] = atoi(yytext);
                               yylval.INT_ARR[1] = 1;
                               return VALUEF;
                             }
(0|[1-9][0-9]*)f[1-9][0-9]*  { /* Fraction "<int>f<positive int>". The numerator
                                  alternatives are grouped so that the 'f' part
                                  applies to both of them. */
                               to_int(yytext, yylval.INT_ARR);
                               return VALUEF;
                             }
But you might want to add more meaningful error messages for illegal numbers like 03f2 and 3f0.
Although you don't use it anywhere, your pattern for character strings is incorrect, since (f)lex does not implement non-greedy matching. A better pattern would be \"[^"]*\" or \"[^"\n]*\" (which prohibits newlines inside strings); even better would be to allow backslash escapes with something like \"(\\.|[^"\\\n])*\". There are lots of other variants but that basically covers the general principle. (Some of us prefer ["] to \" but that's just stylistic; the meaning is the same.)
Also, it is bad style to call exit from a yylex action. It's better to arrange for some kind of error return. Similarly, you should not use a return statement from a yyparse action, since it leaves the parser's internal state inconsistent, and does not allow the parser to free the resources it has allocated. Use YYACCEPT (or YYABORT if you want to signal a syntax error). These are described in the documentation or any good guide.

cudaMemcpy invalid argument: in simple vector example

The following example:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <math.h>
// Exclusive upper bound of the random test values (used as rand() % N).
#define N 100
// Number of int elements in each vector.
#define t_num 256
// Fills a host vector with random values, then tries to copy it to device memory,
// printing the status string of each CUDA runtime call along the way.
int main(){
// NOTE(review): vector_one_g is declared as a host array of t_num ints rather than
// as an int* that could hold a device address.
int vector_one_h[t_num], vector_one_g[t_num];
// cudaMalloc is given the address of that array (cast to void**).
cudaError_t err = cudaMalloc((void**)&vector_one_g, t_num * sizeof(int));
printf("Cuda malloc vector swap one: %s \n", cudaGetErrorString(err));
printf("Device Vector: %p \n:" , vector_one_g);
// Fill the host vector with pseudo-random values in [0, N).
for(int m = 0; m < t_num; m++){
vector_one_h[m] = rand() % N;
}
// Host-to-device copy; the destination is whatever vector_one_g evaluates to.
err = cudaMemcpy(vector_one_g, vector_one_h, t_num * sizeof(int), cudaMemcpyHostToDevice);
printf("Cuda mem copy vector swap one: %s \n", cudaGetErrorString(err));
}
Will return:
Cuda malloc vector swap one: no error
Device Vector: 0x7ffcf028eea0
:Cuda mem copy vector swap one: invalid argument
So why is cudaMemcpy receiving an invalid argument?
From the documentation for cudaMemcpy() here I thought the problem may be that I need to give the second argument as the address, &vector_one_h, but placing that in the code returns the exact same error.
And also, while there are many posts about cudaMemcpy invalid arguments, I believe this is not a duplicate as most of the other questions have very complicated examples while this is a very simple and minimal example.
Try changing the first line to:
int vector_one_h[t_num], *vector_one_g;
BTW, prefixing an array name with an & has no effect. Array names are constant pointers by themselves, by the definition of C syntax.

How to reduce parser stack or 'unshift' the current token depending on what follows?

Given the following language described as:
formally: (identifier operator identifier+)*
in plain English: zero or more operations written as an identifier (the lvalue), then an operator, then one or more identifiers (the rvalue)
An example of a sequence of operations in that language would be, given the arbitrary operator #:
A # B C X # Y
Whitespace is not significant and it may also be written more clearly as:
A # B C
X # Y
How would you parse this with a yacc-like LALR parser ?
What I tried so far
I know how to parse explicitly delimited operations, say A # B C ; X # Y but I would like to know if parsing the above input is feasible and how. Hereafter is a (non-functional) minimal example using Flex/Bison.
lex.l:
%{
#include "y.tab.h"
%}
%option noyywrap
%option yylineno
%%
[a-zA-Z][a-zA-Z0-9_]* { return ID; }
# { return OP; }
[ \t\r\n]+ ; /* ignore whitespace */
. { return ERROR; } /* any other character causes parse error */
%%
yacc.y:
%{
#include <stdio.h>
extern int yylineno;
void yyerror(const char *str);
int yylex();
%}
%define parse.lac full
%define parse.error verbose
%token ID OP ERROR
%left OP
%start opdefs
%%
opright:
| opright ID
;
opdef: ID OP ID opright
;
opdefs:
| opdefs opdef
;
%%
void yyerror(const char *str) {
fprintf(stderr, "error#%d: %s\n", yylineno, str);
}
int main(int argc, char *argv[]) {
yyparse();
}
Build with: $ flex lex.l && yacc -d yacc.y --report=all --verbose && gcc lex.yy.c y.tab.c
The issue: I cannot get the parser to not include the next lvalue identifier to the rvalue of the first operation.
$ ./a.out
A # B C X # Y
error#1: syntax error, unexpected OP, expecting $end or ID
The above is always parsed as: reduce(A # B reduce(C X)) # Y
I get the feeling I have to somehow put a condition on the lookahead token that says that if it is the operator, the last identifier should not be shifted and the current stack should be reduced:
A # B C X # Y
^ * // ^: current, *: lookahead
-> reduce 'A # B C' !
-> shift 'X' !
I tried all kind of operator precedence arrangements but cannot get it to work.
I would be willing to accept a solution that does not apply to Bison as well.
A naïve grammar for that language is LALR(2), and bison does not generate LALR(2) parsers.
Any LALR(2) grammar can be mechanically modified to produce an LALR(1) grammar with a compatible parse tree, but I don't know of any automatic tool which does that.
It's possible but annoying to do the transformation by hand, but be aware that you will need to adjust the actions in order to recover the correct parse tree:
%{
/* Singly linked list of identifier names. */
typedef struct IdList { char* id; struct IdList* next; } IdList;
/* One operation: a left-hand side plus the list of right-hand-side identifiers. */
typedef struct Def { char* lhs; IdList* rhs; } Def;
/* Singly linked list of operations. */
typedef struct DefList { Def* def; struct DefList* next; } DefList;
%}
%union {
  Def* def;
  DefList* defs;
  char* id;
}
%type <def> ophead
%type <defs> opdefs
%token <id> ID
%%
/* The rhs of the most recent definition is only completed (and therefore only
   reversed into source order) when the next definition or the end of input is
   seen; the helper functions are presumably push-front lists, hence the
   reversals. */
prog  : opdefs         { $1->def->rhs = IdList_reverse($1->def->rhs);
                         DefList_show(DefList_reverse($1)); }
/* "ID # ID": start of a definition together with its first rvalue identifier. */
ophead: ID '#' ID      { $$ = Def_new($1);
                         $$->rhs = IdList_push($$->rhs, $3); }
opdefs: ophead         { $$ = DefList_push(NULL, $1); }
      | opdefs ID      { /* one more rvalue for the definition at the head */
                         $1->def->rhs = IdList_push($1->def->rhs, $2);
                         $$ = $1; }
      | opdefs ophead  { /* previous definition is complete: fix its rhs order */
                         $1->def->rhs = IdList_reverse($1->def->rhs);
                         $$ = DefList_push($1, $2); }
This precise problem is, ironically, part of bison itself, because productions do not require a ; terminator. Bison uses itself to generate a parser, and it solves this problem in the lexer rather than jumping through the loops as outlined above. In the lexer, once an ID is found, the scan continues up to the next non-whitespace character. If that is a :, then the lexer returns an identifier-definition token; otherwise, the non-whitespace character is returned to the input stream, and an ordinary identifier token is returned.
Here's one way of implementing that in the lexer:
%x SEEK_AT
%%
  /* Indented code before the first rule is copied to the top of yylex().
     deferred_eof records that end of input was reached while an ID token was
     still pending; the end-of-input token (0) is then delivered on the next
     call. */
  static int deferred_eof = 0;
  if (deferred_eof) { deferred_eof = 0; return 0; }

  /* Identifier: remember its text, then look ahead for '#' before deciding
     which token type to return (assumes YYSTYPE is char* - TODO confirm). */
[[:alpha:]][[:alnum:]_]*  yylval = strdup(yytext); BEGIN(SEEK_AT);
[[:space:]]+              ; /* ignore whitespace */
  /* Could be other rules here */
.                         return *yytext; /* Let the parser handle errors */

<SEEK_AT>{
[[:space:]]+              ; /* ignore whitespace */
"#"                       BEGIN(INITIAL); return ID_AT;
.                         BEGIN(INITIAL); yyless(0); return ID;
<<EOF>>                   BEGIN(INITIAL); deferred_eof = 1; return ID;
}
In the SEEK_AT start condition, we're only interested in #. If we find one, then the ID was the start of a def, and we return the correct token type. If we find anything else (other than whitespace), we return the character to the input stream using yyless, and return the ID token type. Note that yylval was already set from the initial scan of the ID, so there is no need to worry about it here.
The only complicated bit of the above code is the EOF handling. Once an EOF has been detected, it is not possible to reinsert it into the input stream, neither with yyless nor with unput. Nor is it legal to let the scanner read the EOF again. So it needs to be fully dealt with. Unfortunately, in the SEEK_AT start condition, fully dealing with EOF requires sending two tokens: first the already detected ID token, and then the 0 which yyparse will recognize as end of input. Without a push-parser, we cannot send two tokens from a single scanner action, so we need to register the fact of having received an EOF, and check for that on the next call to the scanner.
Indented code before the first rule is inserted at the top of the yylex function, so it can declare local variables and do whatever needs to be done before the scan starts. As written, this lexer is not re-entrant, but it is restartable because the persistent state is reset in the if (deferred_eof) action. To make it re-entrant, you'd only need to put deferred_eof in the yystate structure instead of making it a static local.
Following rici's useful comment and answer, here is what I came up with:
lex.l:
%{
#include "y.tab.h"
%}
%option noyywrap
%option yylineno
%%
[a-zA-Z][a-zA-Z0-9_]* { yylval.a = strdup(yytext); return ID; }
# { return OP; }
[ \t\r\n]+ ; /* ignore whitespace */
. { return ERROR; } /* any other character causes parse error */
%%
yacc.y:
%{
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
extern int yylineno;
void yyerror(const char *str);
int yylex();
#define STR_OP " # "
#define STR_SPACE " "
char *concat3(const char *, const char *, const char *);
struct oplist {
char **ops;
size_t capacity, count;
} my_oplist = { NULL, 0, 0 };
int oplist_append(struct oplist *, char *);
void oplist_clear(struct oplist *);
void oplist_dump(struct oplist *);
%}
%union {
char *a;
}
%define parse.lac full
%define parse.error verbose
%token ID OP END ERROR
%start input
%%
opbase: ID OP ID {
char *s = concat3($<a>1, STR_OP, $<a>3);
free($<a>1);
free($<a>3);
assert(s && "opbase: allocation failed");
$<a>$ = s;
}
;
ops: opbase {
$<a>$ = $<a>1;
}
| ops opbase {
int r = oplist_append(&my_oplist, $<a>1);
assert(r == 0 && "ops: allocation failed");
$<a>$ = $<a>2;
}
| ops ID {
char *s = concat3($<a>1, STR_SPACE, $<a>2);
free($<a>1);
free($<a>2);
assert(s && "ops: allocation failed");
$<a>$ = s;
}
;
input: ops {
int r = oplist_append(&my_oplist, $<a>1);
assert(r == 0 && "input: allocation failed");
}
;
%%
/*
 * Returns a newly malloc'ed string holding s1, s2 and s3 joined in that
 * order, or NULL when the allocation fails. The caller owns the result and
 * releases it with free().
 */
char *concat3(const char *s1, const char *s2, const char *s3) {
    const size_t total = strlen(s1) + strlen(s2) + strlen(s3);
    char *joined = malloc(total + 1);   /* +1 for the terminating NUL */
    if (joined != NULL)
        sprintf(joined, "%s%s%s", s1, s2, s3);
    return joined;
}
/*
 * Appends op to the list, growing the pointer array by 32 slots whenever it
 * is full. The pointer itself is stored; the string is not copied.
 * Returns 0 on success and 1 when the array could not be grown (the list is
 * left unchanged in that case).
 */
int oplist_append(struct oplist *oplist, char *op) {
    const size_t grow_by = 32;
    if (oplist->count == oplist->capacity) {
        const size_t new_capacity = oplist->capacity + grow_by;
        char **grown = realloc(oplist->ops, new_capacity * sizeof(char *));
        if (grown == NULL)
            return 1;
        oplist->ops = grown;
        oplist->capacity = new_capacity;
    }
    oplist->ops[oplist->count] = op;
    oplist->count += 1;
    return 0;
}
/*
 * Frees every stored string and the pointer array itself, leaving the list
 * empty and ready to be appended to again.
 */
void oplist_clear(struct oplist *oplist) {
    for (size_t i = 0; i < oplist->count; ++i)
        free(oplist->ops[i]);
    oplist->count = 0;
    if (oplist->capacity > 0) {
        free(oplist->ops);
        /* Reset the pointer as well: with capacity back at 0 the next
           oplist_append calls realloc(oplist->ops, ...), which must not be
           handed an already freed block. */
        oplist->ops = NULL;
        oplist->capacity = 0;
    }
}
/* Prints every stored string to stdout, one per line, prefixed with its index. */
void oplist_dump(struct oplist *oplist) {
    size_t index = 0;
    while (index < oplist->count) {
        printf("%2zu: '%s'\n", index, oplist->ops[index]);
        ++index;
    }
}
void yyerror(const char *str) {
fprintf(stderr, "error#%d: %s\n", yylineno, str);
}
int main(int argc, char *argv[]) {
yyparse();
oplist_dump(&my_oplist);
oplist_clear(&my_oplist);
}
Output with A # B C X # Y:
0: 'A # B C'
1: 'X # Y'

In Flex when is yylineno updated?

I want to use flex to get the current line number. It seems that flex has a global variable yylineno that keeps the current line number while scanning.
It is certain that yylineno will be incremented by 1 when \n is matched, but does ‘r$’, which matches a
string at the end of a line, change yylineno too? If not, are there any other situations in which yylineno is updated?
For example, I have a source file which is 71 lines in total
/*
Author: guanwanxian
date: 2014-12-29
*/
#include "cstdio"
#include "iostream"
#include "cmath"
#include "tchar.h"
using namespace std;
#define MAX 10000
//This is a struct to represent a Point in two-dimension plane
//Take a test
struct Point{
Point(double XPos_N,double YPos_N){
m_XPos=XPos_N;
m_YPos=YPos_N;
}
double CalDistanceWithAnotherPoint(Point& OtherPoint)
{
double Dis=sqrt((m_XPos-OtherPoint.m_XPos)*(m_XPos-OtherPoint.m_XPos)+(m_YPos-OtherPoint.m_YPos)*(m_YPos-OtherPoint.m_YPos));
return Dis;
}
double m_XPos;
double m_YPos;
};
//this is a function to print Hello World
void PrintHelloWorld()
{
for(int i=0;i<10;i++)
{
printf("Hello World\n");
}
}
/*
this is a function to calculate the sun of two integers
balabala
2014-12-31
*/
int CalSum(int x , int y)
{
int sum=x+y;
return sum;
}
/*
this is the Main function
this is the enterance of my program
this is just a test program
*/
int _tmain(int argc, _TCHAR* argv[])
{
int A=23;
int B=34;
int SumOfAB=CalSum(A,B);
_tprintf(_T("The sum of A and B is:%d \n"),SumOfAB);
PrintHelloWorld();
Point AP(0,0);
Point BP(2,3);
double DisBetAP_AND_BP=AP.CalDistanceWithAnotherPoint(BP);
_tprintf(_T("The distance between AP and BP is:%lf\n"),DisBetAP_AND_BP);
return 0;
}
And my flex file is :
%option noyywrap
%option yylineno
%{
#include <cstdlib>
#include <iostream>
#include "tchar.h"
#include "parser.hpp"
extern int SourceFileLength;//The size of input file
// this function will be generated using bison
extern int yyparse();
int DigitNum=0;
int CommentLineNum=0;
int ProgramLineNum=0;
%}
Digits [0-9]+
BinoOP [-+*/]
parenthesis [()]
%s IN_BLOCK_COMMENT
%s IN_SINGLELINE_COMMENT
%s NOT_COMMENT
%s IN_FUNCTION
%%
<INITIAL>{
"//" {
BEGIN(IN_SINGLELINE_COMMENT);
std::cout<< "enter single line comment\n";
}
"/*" {
std::cout<<"block line num: "<<yylineno<<std::endl;
BEGIN(IN_BLOCK_COMMENT);
std::cout<< "enter block comment\n";
}
([^\/\ \n][^\ \n]*)|(\/[^\*\/\ \n][^\ \n]*)|(\/) { std::cout << yytext <<std::endl;}
\n {std::cout << std::endl; ProgramLineNum++; }
<<EOF>> { std::cout<<"TotalLine: "<<yylineno<<std::endl; std::cout<<"current position: "<<ftell(yyin)<<std::endl; ProgramLineNum++; std::cout<<"File Size: "<<SourceFileLength<<std::endl; return(0);}
. {}
}
<IN_BLOCK_COMMENT>{
"*/" { BEGIN(INITIAL); std::cout << "leave block comment\n" << std::endl; CommentLineNum++; }
[^*\n]+ { std::cout << "BlockLine\n"; }//eat comment in chunks
"*" { std::cout << "\"*\" " << std::endl;}//eat the lone star
"\n" { std::cout <<std::endl; CommentLineNum++; ProgramLineNum++;}
}
<IN_SINGLELINE_COMMENT>{
.*$ { std::cout<<"curretn yyline: "<<yylineno<<std::endl; BEGIN(INITIAL); std::cout<< "SingleLine\n"; std::cout<< "leave single line comment\n"<<std::endl; CommentLineNum++; }//single-line comment, including the case where the line contains nothing but //
}
<NOT_COMMENT>{
}
<IN_FUNCTION>{
BEGIN(INITIAL);
}
The answer is 75 lines instead of 71 lines. That is because the pattern .*$ has been matched three times and the initial yylineno seems to be 1, so the answer is 1+71+3=75. Am I right?
does r$ which matches a string at end of line change yylineno too?
No.
Similarly, it is incorrect to increment CommentLineNum in the rule <IN_SINGLELINE_COMMENT>.*$. This rule does not consume a line terminator.

Error when using Flex to generate a scanner (unrecognised rule)

I am trying to generate a simple scanner using Flex. However, when using the following code, I get multiple "unrecognised rule" errors on lines 23,24 and 25. After studying some similar examples, I still can't find any formatting mistakes. Can someone please point me to the right direction?
%{
#include <stdio.h>
#include "mylang3.tab.h"
#include <stdlib.h>
#include <string.h>
#define OK 234
#define ILLEGAL 235
%}
digit [0-9]
letter [a-zA-Z]
invalid_id [char|else|if|class|new|return|void|while]
unsigned_int ({digit}+)
INTEGER ((+|-)?{unsigned_int})
all_chars [{letter}{digit}_]
ID ([({all_chars}{-}{digit})({all_chars})*]{-}{invalid_id})
special_char ["\n"|"\""|"\'"|"\0"|"\t"|"\\"]
CHARACTER '([[:print:]]{-}["']{+}{special_char})'
%%
[a-zA-Z]+ printf("I have found a word %s\n", yytext);
{ID} printf("I have found an id %s\n", yytext); //errors
{INTEGER} printf("I have found an integer %s\n",yytext); //errors
{CHARACTER} printf("I have found a char %s\n",yytext); //errors
char|else|if|class|new|return|void|while printf("I have found a reserved word %s\n",yytext);
"+"|"-"|"*"|"/"|"{"|"}"|"("|")"|"["|"]" printf("I have found an operator: %s\n", yytext );
[" " \t\n]+ /* eat up whitespace */
. printf( "Unrecognized character: %s\n", yytext );
%%
/*int main(int argc, char** argv){
int token;
int ok=1;
++argv, --argc;
if ( argc > 0 )
yyin = fopen( argv[0], "r" );
else
yyin = stdin;
yylex();
while((token =yylex())!=0){
if(token==ILLEGAL){ printf("Illegal sequence\n"); ok=0; }
}
if(ok==0) printf("Encountered lexical errors\n");
else printf("No lexical errors found\n");
return 0;
}*/
You can only use square brackets for characters, not for sequences of characters. So instead of e. g.
all_chars [{letter}{digit}_]
you'll have to write
all_chars ({letter}|{digit}|_)
And you shouldn't mix pipe signs and square brackets. [abc] means the same as (a|b|c), but [a|b|c] is wrong.

Resources