flex code:
1 %option noyywrap nodefault yylineno case-insensitive
2 %{
3 #include "stdio.h"
4 #include "tp.tab.h"
5 %}
6
7 %%
8 "{" {return '{';}
9 "}" {return '}';}
10 ";" {return ';';}
11 "create" {return CREATE;}
12 "cmd" {return CMD;}
13 "int" {yylval.intval = 20;return INT;}
14 [a-zA-Z]+ {yylval.strval = yytext;printf("id:%s\n" , yylval.strval);return ID;}
15 [ \t\n]
16 <<EOF>> {return 0;}
17 . {printf("mistery char\n");}
18
bison code:
1 %{
2 #include "stdlib.h"
3 #include "stdio.h"
4 #include "stdarg.h"
5 void yyerror(char *s, ...);
6 #define YYDEBUG 1
7 int yydebug = 1;
8 %}
9
10 %union{
11 char *strval;
12 int intval;
13 }
14
15 %token <strval> ID
16 %token <intval> INT
17 %token CREATE
18 %token CMD
19
20 %type <strval> col_definition
21 %type <intval> create_type
22 %start stmt_list
23
24 %%
25 stmt_list:stmt ';'
26 | stmt_list stmt ';'
27 ;
28
29 stmt:create_cmd_stmt {/*printf("create cmd\n");*/}
30 ;
31
32 create_cmd_stmt:CREATE CMD ID'{'create_col_list'}' {printf("%s\n" , $3);}
33 ;
34 create_col_list:col_definition
35 | create_col_list col_definition
36 ;
37
38 col_definition:create_type ID ';' {printf("%d , %s\n" , $1, $2);}
39 ;
40
41 create_type:INT {$$ = $1;}
42 ;
43
44 %%
45 extern FILE *yyin;
46
47 void
48 yyerror(char *s, ...)
49 {
50 extern yylineno;
51 va_list ap;
52 va_start(ap, s);
53 fprintf(stderr, "%d: error: ", yylineno);
54 vfprintf(stderr, s, ap);
55 fprintf(stderr, "\n");
56 }
57
58 int main(int argc , char *argv[])
59 {
60 yyin = fopen(argv[1] , "r");
61 if(!yyin){
62 printf("open file %s failed\n" ,argv[1]);
63 return -1;
64 }
65
66 if(!yyparse()){
67 printf("parse work!\n");
68 }else{
69 printf("parse failed!\n");
70 }
71
72 fclose(yyin);
73 return 0;
74 }
75
test input file:
create cmd keeplive
{
int a;
int b;
};
test output:
root#VM-Ubuntu203001:~/test/tpp# ./a.out t1.tp
id:keeplive
id:a
20 , a;
id:b
20 , b;
keeplive
{
int a;
int b;
}
parse work!
I have two questions:
1) Why does the action at line 38 print the token ';'? For instance, "20 , a;" and "20 , b;"
2) Why does the action at line 32 print "keeplive
{
int a;
int b;
}" instead of simply "keeplive"?
Short answer:
yylval.strval = yytext;
You can't use yytext like that. The string it points to is private to the lexer and will change as soon as the flex action finishes. You need to do something like:
yylval.strval = strdup(yytext);
and then you need to make sure you free the memory afterwards.
Longer answer:
yytext is actually a pointer into the buffer containing the input. In order to make yytext work as though it were a NUL-terminated string, the flex framework overwrites the character following the token with a NUL before it does the action, and then replaces the original character when the action terminates. So strdup will work fine inside the action, but outside the action (in your bison code), you now have a pointer to the part of the buffer starting with the token. And it gets worse later, since flex will read the next part of the source into the same buffer, and now your pointer is to random garbage. There are several possible scenarios, depending on flex options, but none of them are pretty.
So the golden rule: yytext is only valid until the end of the action. If you want to keep it, copy it, and then make sure you free the storage for the copy when you no longer need it.
In almost all the lexers I've written, the ID token actually finds the identifier in a symbol table (or puts it there) and returns a pointer into the symbol table, which simplifies memory management. But you still have essentially the same memory management issue with, for example, character string literals.
Related
flex code:
1 %option noyywrap nodefault yylineno case-insensitive
2 %{
3 #include "stdio.h"
4 #include "tp.tab.h"
5 %}
6
7 %%
8 "{" {return '{';}
9 "}" {return '}';}
10 ";" {return ';';}
11 "create" {return CREATE;}
12 "cmd" {return CMD;}
13 "int" {yylval.intval = 20;return INT;}
14 [a-zA-Z]+ {yylval.strval = yytext;printf("id:%s\n" , yylval.strval);return ID;}
15 [ \t\n]
16 <<EOF>> {return 0;}
17 . {printf("mistery char\n");}
18
bison code:
1 %{
2 #include "stdlib.h"
3 #include "stdio.h"
4 #include "stdarg.h"
5 void yyerror(char *s, ...);
6 #define YYDEBUG 1
7 int yydebug = 1;
8 %}
9
10 %union{
11 char *strval;
12 int intval;
13 }
14
15 %token <strval> ID
16 %token <intval> INT
17 %token CREATE
18 %token CMD
19
20 %type <strval> col_definition
21 %type <intval> create_type
22 %start stmt_list
23
24 %%
25 stmt_list:stmt ';'
26 | stmt_list stmt ';'
27 ;
28
29 stmt:create_cmd_stmt {/*printf("create cmd\n");*/}
30 ;
31
32 create_cmd_stmt:CREATE CMD ID'{'create_col_list'}' {printf("%s\n" , $3);}
33 ;
34 create_col_list:col_definition
35 | create_col_list col_definition
36 ;
37
38 col_definition:create_type ID ';' {printf("%d , %s\n" , $1, $2);}
39 ;
40
41 create_type:INT {$$ = $1;}
42 ;
43
44 %%
45 extern FILE *yyin;
46
47 void
48 yyerror(char *s, ...)
49 {
50 extern yylineno;
51 va_list ap;
52 va_start(ap, s);
53 fprintf(stderr, "%d: error: ", yylineno);
54 vfprintf(stderr, s, ap);
55 fprintf(stderr, "\n");
56 }
57
58 int main(int argc , char *argv[])
59 {
60 yyin = fopen(argv[1] , "r");
61 if(!yyin){
62 printf("open file %s failed\n" ,argv[1]);
63 return -1;
64 }
65
66 if(!yyparse()){
67 printf("parse work!\n");
68 }else{
69 printf("parse failed!\n");
70 }
71
72 fclose(yyin);
73 return 0;
74 }
75
test input file:
create cmd keeplive
{
int a;
int b;
};
test output:
root#VM-Ubuntu203001:~/test/tpp# ./a.out t1.tp
id:keeplive
id:a
20 , a;
id:b
20 , b;
keeplive
{
int a;
int b;
}
parse work!
I have two questions:
1) Why does the action at line 38 print the token ';'? For instance, "20 , a;" and "20 , b;"
2) Why does the action at line 32 print "keeplive
{
int a;
int b;
}" instead of simply "keeplive"?
Short answer:
yylval.strval = yytext;
You can't use yytext like that. The string it points to is private to the lexer and will change as soon as the flex action finishes. You need to do something like:
yylval.strval = strdup(yytext);
and then you need to make sure you free the memory afterwards.
Longer answer:
yytext is actually a pointer into the buffer containing the input. In order to make yytext work as though it were a NUL-terminated string, the flex framework overwrites the character following the token with a NUL before it does the action, and then replaces the original character when the action terminates. So strdup will work fine inside the action, but outside the action (in your bison code), you now have a pointer to the part of the buffer starting with the token. And it gets worse later, since flex will read the next part of the source into the same buffer, and now your pointer is to random garbage. There are several possible scenarios, depending on flex options, but none of them are pretty.
So the golden rule: yytext is only valid until the end of the action. If you want to keep it, copy it, and then make sure you free the storage for the copy when you no longer need it.
In almost all the lexers I've written, the ID token actually finds the identifier in a symbol table (or puts it there) and returns a pointer into the symbol table, which simplifies memory management. But you still have essentially the same memory management issue with, for example, character string literals.
I have the following text structure. The values below JTT JNX JNA JNO belong to previous line.
9 8 11 56507785 93
JTT JNX JNA JNO
76 98
9 8 60 3269557 58
9 8 53 7269558 150
JTT JNX JNA JNO
132 71 45-7705678
9 8 62 439559 82
I'd like to parse it in order to print the corresponding values in a single line like below:
H1 H2 H3 H4 H5 JTT JNX JNA JNO
9 8 11 56507785 93 76 98
9 8 60 3269557 58
9 8 53 7269558 150 132 71 45-7705678
9 8 62 439559 82
My issue is when I use awk with FS = space (default FS) then it takes JTT as first field and JTT has 9 spaces before, so I think should be use some technique that counts how may spaces are from left until JTT JNX JNA JNO and count number of spaces from beginning until the values below JTT JNX JNA JNO in order to positionate correctly each value. I mean, 76 and 132 below JTT header, 971 below JNX, 98 below JNA and 45-7705678 below JNO.
How can this be done in awk?
$ awk --version
GNU Awk 5.0.0, API: 2.0 (GNU MPFR 4.0.2, GNU MP 6.1.2)
Copyright (C) 1989, 1991-2019 Free Software Foundation.
$ uname -srv
CYGWIN_NT-6.1 3.0.7(0.338/5/3) 2019-04-30 18:08
Thanks in advance.
With GNU awk (which you have) for FIELDWIDTHS:
$ cat tst.awk
BEGIN {
OFS = ","
print "H1", "H2", "H3", "H4", "H5", "JTT", "JNX", "JNA", "JNO"
}
!NF || ($1 == "JTT") { next }
!/^ / {
if (NR>1) {
print rec
}
FS = " "
$0 = $0
$1 = $1
rec = $0
}
/^ / {
FIELDWIDTHS = "12 5 5 *"
$0 = $0
$1 = $1
for (i=1; i<=NF; i++) {
gsub(/^\s+|\s+$/,"",$i)
}
rec = rec OFS $0
}
END {
print rec
}
.
$ awk -f tst.awk file
H1,H2,H3,H4,H5,JTT,JNX,JNA,JNO
9,8,11,56507785,93,76,,98
9,8,60,3269557,58
9,8,53,7269558,150,132,71,,45-7705678
9,8,62,439559,82
$ awk -f tst.awk file | column -s, -t
H1 H2 H3 H4 H5 JTT JNX JNA JNO
9 8 11 56507785 93 76 98
9 8 60 3269557 58
9 8 53 7269558 150 132 71 45-7705678
9 8 62 439559 82
Replace OFS="," with OFS="\t" or otherwise massage to suit...
is it possible via ctags to get the function end line number as well
"ctags -x --c-kinds=f filename.c"
Above command lists the function definition start line-numbers. Wanted a way to get the function end line numbers.
Other Approaches:
awk 'NR > first && /^}$/ { print NR; exit }' first=$FIRST_LINE filename.c
This needs the code to be properly formatted
Example:
filename.c
1 #include<stdio.h>
2 #include<stdlib.h>
3 int main()
4 {
5 const char *name;
6
7 int a=0
8 printf("name");
9 printf("sssss: %s",name);
10
11 return 0;
12 }
13
14 void code()
15 {
16 printf("Code \n");
17 }
18
19 int code2()
20 {
21 printf("code2 \n");
22 return 1
23 }
24
Input: filename and the function start line no.
Example:
Input: filename.c 3
Output: 12
Input : filename.c 19
Output : 23
Is there any better/simple way of doing this ?
C/C++ parser of Universal-ctags(https://ctags.io) has end: field.
jet#localhost tmp]$ cat -n foo.c
1 int
2 main( void )
3 {
4
5 }
6
7 int
8 bar (void)
9 {
10
11 }
12
13 struct x {
14 int y;
15 };
16
[jet#localhost tmp]$ ~/var/ctags/ctags --fields=+ne -o - --sort=no foo.c
main foo.c /^main( void )$/;" f line:2 typeref:typename:int end:5
bar foo.c /^bar (void)$/;" f line:8 typeref:typename:int end:11
x foo.c /^struct x {$/;" s line:13 file: end:15
y foo.c /^ int y;$/;" m line:14 struct:x typeref:typename:int file:
awk to the rescue!
doesn't handle curly braces within comments but should handle blocks within functions, please give it a try...
$ awk -v s=3 'NR>=s && /{/ {c++}
NR>=s && /}/ && c && !--c {print NR; exit}' file
finds the matching brace for the first one after the specified start line number s.
#include<stdio.h>
#include<conio.h>
#define FIRST_PART 7
#define LAST_PART 5
#define ALL_PARTS FIRST_PART+LAST_PART
int main()
{
printf ("The Square root of all parts is %d", ALL_PARTS * ALL_PARTS) ;
getch();
return(0);
}
In the above code the FIRST_PART is defined as 7
LAST_PART is defined as 5
and ALL_PARTS is initialized as FIRST_PART+LAST_PART (which is ideally 12)
but when i am printing ALL_PARTS * ALL_PARTS is giving me 47 as the output!(But i thought answer would be 144)
Please can anyone explain me how ?
The answer should be 47
FIRST_PART + LAST_PART * FIRST_PART + LAST_PART
MULTIPLICATION HAS MORE PRECEDENCE
SO 7 + 5 * 7 + 5
7 + 35 + 5
47
I'm writing an assembler for a custom micro controller I'm working on. I've got the assembler to a point where it will assemble instructions down to binary.
However, I'm now having problems with getting labels to work. Currently, when my assembler encounters a new label, it stores the name of the label and the memory location its referring to. When an instruction references a label, the assembler looks up the label and replaces the label with the appropriate value.
This is fine and dandy, but what if the label is defined after the instruction referencing it? Because of this, I need to have my parser run over the code twice.
Here's what I currently have for my main function:
303 int main(int argc, char* argv[])
304 {
305
306 if(argc < 1 || strcmp(argv[1],"-h")==0 || 0==strcmp(argv[1],"--help"))
307 {
308 //printf("%s\n", usage);
309 return 1;
310 }
311 // redirect stdin to the file pointer
312 int stdin = dup(0);
313 close(0);
314
315 // pass 1 on the file
316 int fp = open(argv[1], O_RDONLY, "r");
317 dup2(fp, 0);
318
319 yyparse();
320
321 lseek(fp, SEEK_SET, 0);
322
323 // pass 2 on the file
324 if(secondPassNeeded)
325 {
326 fp = open(argv[1], O_RDONLY, "r");
327 dup2(fp, 0);
328 yyparse();
329 }
330 close(fp);
331
332 // restore stdin
333 dup2(0, stdin);
334
335 for(int i = 0; i < labels.size(); i++)
336 {
337 printf("Label: %s, Loc: %d\n", labels[i].name.c_str(), labels[i].memoryLoc);
338 }
339 return 0;
340 }
I'm using this inside a flex/bison configuration.
If that is all you need, you don't need a full two-pass assembler. If the label is not defined when you reference it, you simply output a stand-in address (say 0x0000) and have a data structure that lists all of the places with forward references and what symbol they refered to. At the end of the file (or block if you have local symbols), you simply go through that list and patch the addresses.