Mystery about my Mini-C parsing with Flex/Bison - parsing

I'm trying scanning and parsing the mini-C code with lex and yacc, and I got some troubles.
I set tokens in scanner.l File like below,
...
"const" return tconst
"int" return tint
[A-Za-z][A-Za-z0-9]* return tident
...
and I declare production rule in parser.y File
...
dcl_specification : dcl_specifiers { semantic(8); };
dcl_specifiers : dcl_specifier { semantic(9); }
| dcl_specifiers dcl_specifier { semantic(10); };
dcl_specifier : type_qualifier { semantic(11); }
| type_specifier { semantic(12); };
type_qualifier : tconst { semantic(13); };
type_specifier : tint { semantic(14); };
...
%%
...
void semantic(int n) {
printf("%d\n", n);
}
...
then, I want to parse the text const int max=100,
result is
13
11
9
8
syntax error
I expect 10 shows up because const int might be reduced by rule 10,
but I'm wrong.
why this happens and what should I do?
please let me know

Related

Incorrect initial capacity for ArrayList

I've been going through ziglearn and have made my way to ArrayList. I understand the example given there but when I try something a bit more complex I run into errors. Based on the error it seems like my array doesn't have valid memory when it goes to append the new element, but I've set the initial capacity to 10. What am I doing wrong?
const std = #import("std");
const page_allocator = std.heap.page_allocator;
const Allocator = std.mem.Allocator;
const C = struct {
list: std.ArrayList(A),
pub fn init(allocator: Allocator) !*C {
var a = try std.ArrayList(A).initCapacity(allocator, 10);
return &C{ .list = a };
}
pub fn info(self: *C) void {
std.log.info("len {} cap {}", .{ self.list.items.len, self.list.capacity });
}
pub fn addElement(self: *C, a: A) !*C {
try self.list.append(a);
return self;
}
};
const A = struct { e: []const u8 };
test "with arraylist" {
var foo = try C.init(page_allocator);
foo.info();
_ = try foo.addElement(.{ .e = "bar" });
}
Can see below the initial capacity is not 10, it changes with each run pointing to its uninitialized. Am I missing a step to initialize memory for the ArrayList?
[default] (info): len 0 cap 140728248984328
Segmentation fault at address 0x0
/home/john/zig/zig-linux-x86_64-0.10.0/lib/std/mem/Allocator.zig:159:30: 0x219c3c in reallocAdvancedWithRetAddr__anon_4653 (test)
return self.vtable.resize(self.ptr, buf, buf_align, new_len, len_align, ret_addr);
^
/home/john/zig/zig-linux-x86_64-0.10.0/lib/std/mem/Allocator.zig:356:43: 0x216faf in reallocAtLeast__anon_3534 (test)
return self.reallocAdvancedWithRetAddr(old_mem, old_alignment, new_n, .at_least, #returnAddress());
^
/home/john/zig/zig-linux-x86_64-0.10.0/lib/std/array_list.zig:353:89: 0x215697 in ensureTotalCapacityPrecise (test)
const new_memory = try self.allocator.reallocAtLeast(self.allocatedSlice(), new_capacity);
^
/home/john/zig/zig-linux-x86_64-0.10.0/lib/std/array_list.zig:338:55: 0x2170f6 in ensureTotalCapacity (test)
return self.ensureTotalCapacityPrecise(better_capacity);
^
/home/john/zig/zig-linux-x86_64-0.10.0/lib/std/array_list.zig:377:41: 0x21577c in addOne (test)
try self.ensureTotalCapacity(newlen);
^
/home/john/zig/zig-linux-x86_64-0.10.0/lib/std/array_list.zig:167:49: 0x213c16 in append (test)
const new_item_ptr = try self.addOne();
^
src/main.zig:154:29: 0x213b86 in addElement (test)
try self.list.append(a);
^
src/main.zig:165:27: 0x213ce7 in test.with arraylist (test)
_ = try foo.addElement(.{ .e = "bar" });
^
/home/john/zig/zig-linux-x86_64-0.10.0/lib/test_runner.zig:63:28: 0x2164f0 in main (test)
} else test_fn.func();
^
/home/john/zig/zig-linux-x86_64-0.10.0/lib/std/start.zig:596:22: 0x21463b in posixCallMainAndExit (test)
root.main();
^
/home/john/zig/zig-linux-x86_64-0.10.0/lib/std/start.zig:368:5: 0x214101 in _start (test)
#call(.{ .modifier = .never_inline }, posixCallMainAndExit, .{});
^
Your C.init function returns a pointer to C which is on the stack of the init function. This pointer becomes invalid as soon as the function exits.
Just don't return it as a pointer:
pub fn init(allocator: Allocator) !C {
var a = try std.ArrayList(A).initCapacity(allocator, 10);
return C{ .list = a };
}
Or, alternatively, you could put C into the allocator's memory:
pub fn init(allocator: Allocator) !*C {
var result = try allocator.create(C);
result.list = try std.ArrayList(A).initCapacity(allocator, 10);
return result;
}
But avoid doing this unless you have a good reason, as it limits what you can do with C.

check if condition is met before executing the action in JFlex

I am writing a lexical analyzer using JFlex. When the word co is matched, we have to ignore what comes after until the end of the line (because it's a comment). For the moment, I have a boolean variable that changes to true whenever this word is matched and if an identifier or an operator is matched after co until the end of the line, I simply ignore it because I have an if condition in my Identifier and Operator token identification.
I am wondering if there is better way to do this and get rid of this if statement that appears everywhere?
Here is the code:
%% // Options of the scanner
%class Lexer
%unicode
%line
%column
%standalone
%{
private boolean isCommentOpen = false;
private void toggleIsCommentOpen() {
this.isCommentOpen = ! this.isCommentOpen;
}
private boolean getIsCommentOpen() {
return this.isCommentOpen;
}
%}
Operators = [\+\-]
Identifier = [A-Z]*
EndOfLine = \r|\n|\r\n
%%
{Operators} {
if (! getIsBlockCommentOpen() && ! getIsCommentOpen()) {
// Do Code
}
}
{Identifier} {
if (! getIsBlockCommentOpen() && ! getIsCommentOpen()) {
// Do Code
}
}
"co" {
toggleIsCommentOpen();
}
. {}
{EndOfLine} {
if (getIsCommentOpen()) {
toggleIsCommentOpen();
}
}
One way to do this is to use states in JFlex. We say that every time the word co is matched, we enter in a state named COMMENT_STATE and we do nothing until the end of the line. After the end of the line, we exit the COMMENT_STATE state. So here is the code:
%% // Options of the scanner
%class Lexer
%unicode
%line
%column
%standalone
Operators = [\+\-]
Identifier = [A-Z]*
EndOfLine = \r|\n|\r\n
%xstate YYINITIAL, COMMENT_STATE
%%
<YYINITIAL> {
"co" {yybegin(COMMENT_STATE);}
}
<COMMENT_STATE> {
{EndOfLine} {yybegin(YYINITIAL);}
. {}
}
{Operators} {// Do Code}
{Identifier} {// Do Code}
. {}
{EndOfLine} {}
With this new approach, the lexer is more simpler and it's also more readable.

Highlight / parse PSI elements inside another PSI element

I developed a Custom Language plugin based on this this tutorial.
My plugin parses key/value language files with format like below. Values can contain some HTML tags like <br>, <i>, <b>, <span> and \n. So I want to highlight these tags as separate PSI elements inside green PSI elements (values) (see pic). How can I overwrite my rules to get this?
#Section header
KEY1 = First<br>Value
KEY2 = Second\nValue
Bnf rules I use
lngFile ::= item_*
private item_ ::= (property|header|COMMENT|CRLF)
property ::= (KEY? SEPARATOR VALUE?) | KEY {
mixin="someClass"
implements="someClass"
methods=[getKey getValue getName setName getNameIdentifier getPresentation]
}
header ::= HEADER {
mixin="someClass"
implements="someClass"
methods=[getName setName getNameIdentifier getPresentation]
}
Flex
%%
%class LngLexer
%implements FlexLexer
%unicode
%function advance
%type IElementType
%eof{ return;
%eof}
CRLF=\R
WHITE_SPACE=[\ \n\t\f]
FIRST_VALUE_CHARACTER=[^ \n\f\\] | "\\"{CRLF} | "\\".
VALUE_CHARACTER=[^\n\f\\] | "\\"{CRLF} | "\\".
END_OF_LINE_COMMENT=("//")[^\r\n]*
HEADER=("#")[^\r\n]*
SEPARATOR=[:=]
KEY_CHARACTER=[^:=\ \n\t\f\\] | "\\ "
%state WAITING_VALUE
%%
<YYINITIAL> {END_OF_LINE_COMMENT} { yybegin(YYINITIAL); return LngTypes.COMMENT; }
<YYINITIAL> {HEADER} { yybegin(YYINITIAL); return LngTypes.HEADER; }
<YYINITIAL> {KEY_CHARACTER}+ { yybegin(YYINITIAL); return LngTypes.KEY; }
<YYINITIAL> {SEPARATOR} { yybegin(WAITING_VALUE); return LngTypes.SEPARATOR; }
<WAITING_VALUE> {CRLF}({CRLF}|{WHITE_SPACE})+ { yybegin(YYINITIAL); return TokenType.WHITE_SPACE; }
<WAITING_VALUE> {WHITE_SPACE}+ { yybegin(WAITING_VALUE); return TokenType.WHITE_SPACE; }
<WAITING_VALUE> {FIRST_VALUE_CHARACTER}{VALUE_CHARACTER}* { yybegin(YYINITIAL); return LngTypes.VALUE; }
({CRLF}|{WHITE_SPACE})+ { yybegin(YYINITIAL); return TokenType.WHITE_SPACE; }
[^] { return TokenType.BAD_CHARACTER; }

How to match whitespace and comments with re2c

I started very recently to use bison for writing small compiler exercises. I am having some issues with white spaces ans comments. I was trying to debug the problem and I arrived to this source that looks like what I am looking for. I tried to chnage and erase some characters as advised but didn't work.
Also during compilation I have the following error: re2c: error: line 2963, column 0: can only difference char sets.
Below the part of the code:
yy::conj_parser::symbol_type yy::yylex(lexcontext& ctx)
{
const char* anchor = ctx.cursor;
ctx.loc.step();
// Add a lambda function to avoid repetition
auto s = [&](auto func, auto&&... params) { ctx.loc.columns(ctx.cursor - anchor); return func(params..., ctx.loc); };
%{ /* Begin re2c lexer : Tokenization process starts */
re2c:yyfill:enable = 0;
re2c:define:YYCTYPE = "char";
re2c:define:YYCURSOR = "ctx.cursor";
"return" { return s(conj_parser::make_RETURN); }
"while" | "for" { return s(conj_parser::make_WHILE); }
"var" { return s(conj_parser::make_VAR); }
"if" { return s(conj_parser::make_IF); }
// Identifiers
[a-zA-Z_] [a-zA-Z_0-9]* { return s(conj_parser::make_IDENTIFIER, std::string(anchor, ctx.cursor)); }
// String and integers:
"\""" [^\"]* "\"" { return s(conj_parser::make_STRINGCONST, std::string(anchor+1, ctx.cursor-1)); }
[0-9]+ { return s(conj_parser::make_NUMCONST, std::stol(std::string(anchor, ctx.cursor))); }
// Whitespace and comments:
"\000" { return s(conj_parser::make_END); }
"\r\n" | [\r\n] { ctx.loc.lines(); return yylex(ctx); }
"//" [^\r\n]* { return yylex(ctx); }
[\t\v\b\f ] { ctx.loc.columns(); return yylex(ctx); }
Thank you very much for pointing in the right direction or shading some lights on why this error could be solved.
You really should mention which line is line 2963. Perhaps it is here, because there seems to be an extra quotation mark in that line.
"\""" [^\"]* "\""
^

How to get function definition/signature as a string in Clang?

How can I get a function's signature (or at least the entire definition?) as a string using Clang/Libclang, assuming I have its CXCursor or so?
I think that the definition may be somehow obtainable by using the cursor's extents, but I don't really know how (what function to use).
You can use this simple code to get prototype of a function ( name, return type, count of arguments and arguments[name,data type]).
string Convert(const CXString& s)
{
string result = clang_getCString(s);
clang_disposeString(s);
return result;
}
void print_function_prototype(CXCursor cursor)
{
// TODO : Print data!
auto type = clang_getCursorType(cursor);
auto function_name = Convert(clang_getCursorSpelling(cursor));
auto return_type = Convert(clang_getTypeSpelling(clang_getResultType(type)));
int num_args = clang_Cursor_getNumArguments(cursor);
for (int i = 0; i < num_args; ++i)
{
auto arg_cursor = clang_Cursor_getArgument(cursor, i);
auto arg_name = Convert(clang_getCursorSpelling(arg_cursor));
if (arg_name.empty())
{
arg_name = "no name!";
}
auto arg_data_type = Convert(clang_getTypeSpelling(clang_getArgType(type, i)));
}
}
CXChildVisitResult functionVisitor(CXCursor cursor, CXCursor /* parent */, CXClientData /* clientData */)
{
if (clang_Location_isFromMainFile(clang_getCursorLocation(cursor)) == 0)
return CXChildVisit_Continue;
CXCursorKind kind = clang_getCursorKind(cursor);
if ((kind == CXCursorKind::CXCursor_FunctionDecl || kind == CXCursorKind::CXCursor_CXXMethod || kind == CXCursorKind::CXCursor_FunctionTemplate || \
kind == CXCursorKind::CXCursor_Constructor))
{
print_function_prototype(cursor);
}
return CXChildVisit_Continue;
}
You could try using clang_Cursor_getMangling() and demangle the result in order to get the complete definition.
I'm using the following for a project I am working on and it works great for the definition. TL&DR clang_getCursorPrettyPrinted with the policy TerseOutput set to true.
std::string getStdString(const CXString &s)
{
std::string rv = clang_getCString(s);
clang_disposeString(s);
return rv;
}
bool isFunctionImplementation(CXCursor &cursor,std::string &decl,std::string &filename,unsigned &lineno)
{
std::string cs = getStdString(clang_getCursorPrettyPrinted(cursor,nullptr));
if (cs.find('{') == std::string::npos) // Just a declaration, not the "meat" of the function, so we dont care
return false;
clang::LangOptions lo;
struct clang::PrintingPolicy pol(lo);
pol.adjustForCPlusPlus();
pol.TerseOutput = true;
pol.FullyQualifiedName = true;
decl = getStdString(clang_getCursorPrettyPrinted(cursor,&pol));
CXSourceLocation location = clang_getCursorLocation( cursor );
CXFile f;
lineno = 0;
filename = "(None)";
clang_getSpellingLocation(location,&f,&lineno,nullptr,nullptr);
if (lineno)
{
filename = getStdString(clang_File_tryGetRealPathName(f));
}
return isAllowedDirectory(filename);
}
This one checks if the function call is "meat" or just a definition. Obviously you can adjust as needed, including writing your own isAllowedDirectory function. Just pass the cursor, two strings and an unsigned into this as you walk your AST when you hit a declaration type.
I use the following, shorter way with clang 10 (though its using a matcher, not a cursor):
The 2 helper functions are general helpers to get the code snippet as a string.
// helper function 1: find position of end of token
SourceLocation
end_of_the_end(SourceLocation const & start_of_end, SourceManager & sm)
{
using namespace clang;
LangOptions lopt;
return Lexer::getLocForEndOfToken(start_of_end, 0, sm, lopt);
}
// helper function 2:
std::string getSymbolString(clang::SourceManager & sm,
const clang::SourceRange & range)
{
return std::string(sm.getCharacterData(range.getBegin()),
sm.getCharacterData(end_of_the_end(range.getEnd(), sm)));
}
The actual code snippet to get the function declaration string is:
// ... in run() of a matcher:
virtual void run(corct::result_t const & result) override
{
using namespace clang;
FunctionDecl * f_decl = const_cast<FunctionDecl *>(
result.Nodes.getNodeAs<FunctionDecl>(fd_bd_name_));
if(f_decl) {
SourceManager & sm(result.Context->getSourceManager());
FunctionDecl * f_decl = const_cast<FunctionDecl *>(
result.Nodes.getNodeAs<FunctionDecl>(fd_bd_name_));
auto full_decl_string =
getSymbolString(sm, f_decl->DeclaratorDecl::getSourceRange());
}
}
This will output inline bool test2(std::string const & str, std::vector<std::string> & sss) for the following function:
inline bool test2(std::string const & str, std::vector<std::string> & sss)
{
return true;
}

Resources