This question already has an answer here:
Closed 10 years ago.
Possible Duplicate:
How to output the AST built using ANTLR?
Is there a way to transform my lexer and parser definition files (.g)
to a CommonTree? (During the process where my parsergenerator is created)
I don't want to use my generated parser. I want ANTLR's own parser (the one that parses lexer and parser definition files) to give me a CommonTree object.
Sure, it's available here: http://www.antlr.org/grammar/ANTLR/
Posted below to prevent possible link-rot:
/*
[The "BSD licence"]
Copyright (c) 2005-2007 Terence Parr
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. The name of the author may not be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** ANTLR v3 grammar written in ANTLR v3 with AST construction */
grammar ANTLRv3;
options {
output=AST;
ASTLabelType=CommonTree;
}
tokens {
DOC_COMMENT;
PARSER;
LEXER;
RULE;
BLOCK;
OPTIONAL;
CLOSURE;
POSITIVE_CLOSURE;
SYNPRED;
RANGE;
CHAR_RANGE;
EPSILON;
ALT;
EOR;
EOB;
EOA; // end of alt
ID;
ARG;
ARGLIST;
RET;
LEXER_GRAMMAR;
PARSER_GRAMMAR;
TREE_GRAMMAR;
COMBINED_GRAMMAR;
INITACTION;
LABEL; // $x used in rewrite rules
TEMPLATE;
SCOPE='scope';
SEMPRED;
GATED_SEMPRED; // {p}? =>
SYN_SEMPRED; // (...) => it's a manually-specified synpred converted to sempred
BACKTRACK_SEMPRED; // auto backtracking mode syn pred converted to sempred
FRAGMENT='fragment';
TREE_BEGIN='^(';
ROOT='^';
BANG='!';
RANGE='..';
REWRITE='->';
}
// Parser-wide state: the kind of grammar being parsed. It is assigned in
// grammarDef (LEXER_GRAMMAR, PARSER_GRAMMAR, TREE_GRAMMAR or COMBINED_GRAMMAR)
// and read again in ebnf when deciding how to rewrite syntactic predicates.
@members {
    int gtype;
}
grammarDef
: DOC_COMMENT?
( 'lexer' {gtype=LEXER_GRAMMAR;} // pure lexer
| 'parser' {gtype=PARSER_GRAMMAR;} // pure parser
| 'tree' {gtype=TREE_GRAMMAR;} // a tree parser
| {gtype=COMBINED_GRAMMAR;} // merged parser/lexer
)
g='grammar' id ';' optionsSpec? tokensSpec? attrScope* action*
rule+
EOF
-> ^( {adaptor.create(gtype,$g)}
id DOC_COMMENT? optionsSpec? tokensSpec? attrScope* action* rule+
)
;
tokensSpec
: TOKENS tokenSpec+ '}' -> ^(TOKENS tokenSpec+)
;
tokenSpec
: TOKEN_REF
( '=' (lit=STRING_LITERAL|lit=CHAR_LITERAL) -> ^('=' TOKEN_REF $lit)
| -> TOKEN_REF
)
';'
;
attrScope
: 'scope' id ACTION -> ^('scope' id ACTION)
;
/** Match stuff like @parser::members {int i;}
 *  The optional scope name and the action name become children of the '@' root,
 *  followed by the ACTION token holding the code.
 */
action
    : '@' (actionScopeName '::')? id ACTION -> ^('@' actionScopeName? id ACTION)
    ;
/** Sometimes the scope names will collide with keywords; allow them as
* ids for action scopes.
*/
actionScopeName
: id
| l='lexer' -> ID[$l]
| p='parser' -> ID[$p]
;
optionsSpec
: OPTIONS (option ';')+ '}' -> ^(OPTIONS option+)
;
option
: id '=' optionValue -> ^('=' id optionValue)
;
optionValue
: id
| STRING_LITERAL
| CHAR_LITERAL
| INT
| s='*' -> STRING_LITERAL[$s] // used for k=*
;
rule
scope {
String name;
}
: DOC_COMMENT?
( modifier=('protected'|'public'|'private'|'fragment') )?
id {$rule::name = $id.text;}
'!'?
( arg=ARG_ACTION )?
( 'returns' rt=ARG_ACTION )?
throwsSpec? optionsSpec? ruleScopeSpec? ruleAction*
':' altList ';'
exceptionGroup?
-> ^( RULE id {modifier!=null?adaptor.create(modifier):null} ^(ARG $arg)? ^(RET $rt)?
optionsSpec? ruleScopeSpec? ruleAction*
altList
exceptionGroup?
EOR["EOR"]
)
;
/** Match stuff like @init {int i;}
 *  Rule-level actions carry no scope name, only the action id and its code.
 */
ruleAction
    : '@' id ACTION -> ^('@' id ACTION)
    ;
throwsSpec
: 'throws' id ( ',' id )* -> ^('throws' id+)
;
ruleScopeSpec
: 'scope' ACTION -> ^('scope' ACTION)
| 'scope' id (',' id)* ';' -> ^('scope' id+)
| 'scope' ACTION
'scope' id (',' id)* ';'
-> ^('scope' ACTION id+ )
;
block
: lp='('
( (opts=optionsSpec)? ':' )?
a1=alternative rewrite ( '|' a2=alternative rewrite )*
rp=')'
-> ^( BLOCK[$lp,"BLOCK"] optionsSpec? (alternative rewrite?)+ EOB[$rp,"EOB"] )
;
/** The alternatives of a rule body, rewritten as a BLOCK subtree whose last
 *  child is an EOB node.
 */
altList
@init {
    // must create root manually as it's used by invoked rules in real antlr tool.
    // leave here to demonstrate use of {...} in rewrite rule
    // it's really BLOCK[firstToken,"BLOCK"]; set line/col to previous ( or : token.
    CommonTree blkRoot = (CommonTree)adaptor.create(BLOCK,input.LT(-1),"BLOCK");
}
    : a1=alternative rewrite ( '|' a2=alternative rewrite )*
        -> ^( {blkRoot} (alternative rewrite?)+ EOB["EOB"] )
    ;
/** A single alternative: one or more elements, or nothing at all.
 *  An empty alternative is rewritten as ALT containing an EPSILON node, both
 *  positioned at the token preceding the alternative.
 */
alternative
@init {
    Token firstToken = input.LT(1);
    Token prevToken = input.LT(-1); // either : or | I think
}
    : element+ -> ^(ALT[firstToken,"ALT"] element+ EOA["EOA"])
    | -> ^(ALT[prevToken,"ALT"] EPSILON[prevToken,"EPSILON"] EOA["EOA"])
    ;
exceptionGroup
: ( exceptionHandler )+ ( finallyClause )?
| finallyClause
;
exceptionHandler
: 'catch' ARG_ACTION ACTION -> ^('catch' ARG_ACTION ACTION)
;
finallyClause
: 'finally' ACTION -> ^('finally' ACTION)
;
element
: elementNoOptionSpec
;
elementNoOptionSpec
: id (labelOp='='|labelOp='+=') atom
( ebnfSuffix -> ^( ebnfSuffix ^(BLOCK["BLOCK"] ^(ALT["ALT"] ^($labelOp id atom) EOA["EOA"]) EOB["EOB"]))
| -> ^($labelOp id atom)
)
| id (labelOp='='|labelOp='+=') block
( ebnfSuffix -> ^( ebnfSuffix ^(BLOCK["BLOCK"] ^(ALT["ALT"] ^($labelOp id block) EOA["EOA"]) EOB["EOB"]))
| -> ^($labelOp id block)
)
| atom
( ebnfSuffix -> ^( ebnfSuffix ^(BLOCK["BLOCK"] ^(ALT["ALT"] atom EOA["EOA"]) EOB["EOB"]) )
| -> atom
)
| ebnf
| ACTION
| SEMPRED ( '=>' -> GATED_SEMPRED | -> SEMPRED )
| treeSpec
( ebnfSuffix -> ^( ebnfSuffix ^(BLOCK["BLOCK"] ^(ALT["ALT"] treeSpec EOA["EOA"]) EOB["EOB"]) )
| -> treeSpec
)
;
atom: range ( (op='^'|op='!') -> ^($op range) | -> range )
| terminal
| notSet ( (op='^'|op='!') -> ^($op notSet) | -> notSet )
| RULE_REF ( arg=ARG_ACTION )? ( (op='^'|op='!') )?
-> {$arg!=null&&op!=null}? ^($op RULE_REF $arg)
-> {$arg!=null}? ^(RULE_REF $arg)
-> {$op!=null}? ^($op RULE_REF)
-> RULE_REF
;
notSet
: '~'
( notTerminal -> ^('~' notTerminal)
| block -> ^('~' block)
)
;
treeSpec
: '^(' element ( element )+ ')' -> ^(TREE_BEGIN element+)
;
/** Matches EBNF blocks (and token sets via block rule).
 *  After the rule completes, the root token of the resulting tree is given the
 *  line and column of the first token of the construct.
 */
ebnf
@init {
    Token firstToken = input.LT(1);
}
@after {
    $ebnf.tree.getToken().setLine(firstToken.getLine());
    $ebnf.tree.getToken().setCharPositionInLine(firstToken.getCharPositionInLine());
}
    : block
      ( op='?' -> ^(OPTIONAL[op] block)
      | op='*' -> ^(CLOSURE[op] block)
      | op='+' -> ^(POSITIVE_CLOSURE[op] block)
      | '=>' // syntactic predicate
          -> {gtype==COMBINED_GRAMMAR &&
              Character.isUpperCase($rule::name.charAt(0))}?
             // if lexer rule in combined, leave as pred for lexer
             ^(SYNPRED["=>"] block)
          // in real antlr tool, text for SYN_SEMPRED is predname
          -> SYN_SEMPRED
      | -> block
      )
    ;
range!
: c1=CHAR_LITERAL RANGE c2=CHAR_LITERAL -> ^(CHAR_RANGE[$c1,".."] $c1 $c2)
;
terminal
: ( CHAR_LITERAL -> CHAR_LITERAL
// Args are only valid for lexer rules
| TOKEN_REF
( ARG_ACTION -> ^(TOKEN_REF ARG_ACTION)
| -> TOKEN_REF
)
| STRING_LITERAL -> STRING_LITERAL
| '.' -> '.'
)
( '^' -> ^('^' $terminal)
| '!' -> ^('!' $terminal)
)?
;
notTerminal
: CHAR_LITERAL
| TOKEN_REF
| STRING_LITERAL
;
/** A '?', '*' or '+' suffix, rewritten as the matching imaginary node
 *  (OPTIONAL, CLOSURE, POSITIVE_CLOSURE) derived from the operator token.
 */
ebnfSuffix
@init {
    Token op = input.LT(1);
}
    : '?' -> OPTIONAL[op]
    | '*' -> CLOSURE[op]
    | '+' -> POSITIVE_CLOSURE[op]
    ;
// R E W R I T E S Y N T A X
/** Zero or more predicated rewrites followed by one final unpredicated
 *  rewrite; the whole rule may also match nothing.
 */
rewrite
@init {
    Token firstToken = input.LT(1);
}
    : (rew+='->' preds+=SEMPRED predicated+=rewrite_alternative)*
      rew2='->' last=rewrite_alternative
      -> ^($rew $preds $predicated)* ^($rew2 $last)
    |
    ;
rewrite_alternative
options {backtrack=true;}
: rewrite_template
| rewrite_tree_alternative
| /* empty rewrite */ -> ^(ALT["ALT"] EPSILON["EPSILON"] EOA["EOA"])
;
rewrite_tree_block
: lp='(' rewrite_tree_alternative ')'
-> ^(BLOCK[$lp,"BLOCK"] rewrite_tree_alternative EOB[$lp,"EOB"])
;
rewrite_tree_alternative
: rewrite_tree_element+ -> ^(ALT["ALT"] rewrite_tree_element+ EOA["EOA"])
;
rewrite_tree_element
: rewrite_tree_atom
| rewrite_tree_atom ebnfSuffix
-> ^( ebnfSuffix ^(BLOCK["BLOCK"] ^(ALT["ALT"] rewrite_tree_atom EOA["EOA"]) EOB["EOB"]))
| rewrite_tree
( ebnfSuffix
-> ^(ebnfSuffix ^(BLOCK["BLOCK"] ^(ALT["ALT"] rewrite_tree EOA["EOA"]) EOB["EOB"]))
| -> rewrite_tree
)
| rewrite_tree_ebnf
;
rewrite_tree_atom
: CHAR_LITERAL
| TOKEN_REF ARG_ACTION? -> ^(TOKEN_REF ARG_ACTION?) // for imaginary nodes
| RULE_REF
| STRING_LITERAL
| d='$' id -> LABEL[$d,$id.text] // reference to a label in a rewrite rule
| ACTION
;
/** A parenthesised rewrite block with an EBNF suffix. After the rule
 *  completes, the root token of the tree is given the line and column of the
 *  first token of the construct.
 */
rewrite_tree_ebnf
@init {
    Token firstToken = input.LT(1);
}
@after {
    $rewrite_tree_ebnf.tree.getToken().setLine(firstToken.getLine());
    $rewrite_tree_ebnf.tree.getToken().setCharPositionInLine(firstToken.getCharPositionInLine());
}
    : rewrite_tree_block ebnfSuffix -> ^(ebnfSuffix rewrite_tree_block)
    ;
rewrite_tree
: '^(' rewrite_tree_atom rewrite_tree_element* ')'
-> ^(TREE_BEGIN rewrite_tree_atom rewrite_tree_element* )
;
/** Build a tree for a template rewrite:
^(TEMPLATE (ID|ACTION) ^(ARGLIST ^(ARG ID ACTION) ...) )
where ARGLIST is always there even if no args exist.
ID can be "template" keyword. If first child is ACTION then it's
an indirect template ref
-> foo(a={...}, b={...})
-> ({string-e})(a={...}, b={...}) // e evaluates to template name
-> {%{$ID.text}} // create literal template from string (done in ActionTranslator)
-> {st-expr} // st-expr evaluates to ST
*/
rewrite_template
: // -> template(a={...},...) "..." inline template
id lp='(' rewrite_template_args ')'
( str=DOUBLE_QUOTE_STRING_LITERAL | str=DOUBLE_ANGLE_STRING_LITERAL )
-> ^(TEMPLATE[$lp,"TEMPLATE"] id rewrite_template_args $str)
| // -> foo(a={...}, ...)
rewrite_template_ref
| // -> ({expr})(a={...}, ...)
rewrite_indirect_template_head
| // -> {...}
ACTION
;
/** -> foo(a={...}, ...) */
rewrite_template_ref
: id lp='(' rewrite_template_args ')'
-> ^(TEMPLATE[$lp,"TEMPLATE"] id rewrite_template_args)
;
/** -> ({expr})(a={...}, ...) */
rewrite_indirect_template_head
: lp='(' ACTION ')' '(' rewrite_template_args ')'
-> ^(TEMPLATE[$lp,"TEMPLATE"] ACTION rewrite_template_args)
;
rewrite_template_args
: rewrite_template_arg (',' rewrite_template_arg)*
-> ^(ARGLIST rewrite_template_arg+)
| -> ARGLIST
;
rewrite_template_arg
: id '=' ACTION -> ^(ARG[$id.start] id ACTION)
;
id : TOKEN_REF -> ID[$TOKEN_REF]
| RULE_REF -> ID[$RULE_REF]
;
// L E X I C A L R U L E S
SL_COMMENT
: '//'
( ' $ANTLR ' SRC // src directive
| ~('\r'|'\n')*
)
'\r'? '\n'
{$channel=HIDDEN;}
;
ML_COMMENT
: '/*' {if (input.LA(1)=='*') $type=DOC_COMMENT; else $channel=HIDDEN;} .* '*/'
;
CHAR_LITERAL
: '\'' LITERAL_CHAR '\''
;
STRING_LITERAL
: '\'' LITERAL_CHAR LITERAL_CHAR* '\''
;
fragment
LITERAL_CHAR
: ESC
| ~('\''|'\\')
;
DOUBLE_QUOTE_STRING_LITERAL
: '"' (ESC | ~('\\'|'"'))* '"'
;
DOUBLE_ANGLE_STRING_LITERAL
: '<<' .* '>>'
;
fragment
ESC : '\\'
( 'n'
| 'r'
| 't'
| 'b'
| 'f'
| '"'
| '\''
| '\\'
| '>'
| 'u' XDIGIT XDIGIT XDIGIT XDIGIT
| . // unknown, leave as it is
)
;
fragment
XDIGIT :
'0' .. '9'
| 'a' .. 'f'
| 'A' .. 'F'
;
INT : '0'..'9'+
;
ARG_ACTION
: NESTED_ARG_ACTION
;
fragment
NESTED_ARG_ACTION :
'['
( options {greedy=false; k=1;}
: NESTED_ARG_ACTION
| ACTION_STRING_LITERAL
| ACTION_CHAR_LITERAL
| .
)*
']'
{setText(getText().substring(1, getText().length()-1));}
;
ACTION
: NESTED_ACTION ( '?' {$type = SEMPRED;} )?
;
fragment
NESTED_ACTION :
'{'
( options {greedy=false; k=2;}
: NESTED_ACTION
| SL_COMMENT
| ML_COMMENT
| ACTION_STRING_LITERAL
| ACTION_CHAR_LITERAL
| .
)*
'}'
;
fragment
ACTION_CHAR_LITERAL
: '\'' (ACTION_ESC|~('\\'|'\'')) '\''
;
fragment
ACTION_STRING_LITERAL
: '"' (ACTION_ESC|~('\\'|'"'))* '"'
;
fragment
ACTION_ESC
: '\\\''
| '\\' '"' // ANTLR doesn't like: '\\"'
| '\\' ~('\''|'"')
;
TOKEN_REF
: 'A'..'Z' ('a'..'z'|'A'..'Z'|'_'|'0'..'9')*
;
RULE_REF
: 'a'..'z' ('a'..'z'|'A'..'Z'|'_'|'0'..'9')*
;
/** Match the start of an options section. Don't allow normal
* action processing on the {...} as it's not a action.
*/
OPTIONS
: 'options' WS_LOOP '{'
;
TOKENS
: 'tokens' WS_LOOP '{'
;
/** Reset the file and line information; useful when the grammar
* has been generated so that errors are shown relative to the
* original file like the old C preprocessor used to do.
*/
fragment
SRC : 'src' ' ' file=ACTION_STRING_LITERAL ' ' line=INT
;
WS : ( ' '
| '\t'
| '\r'? '\n'
)+
{$channel=HIDDEN;}
;
fragment
WS_LOOP
: ( WS
| SL_COMMENT
| ML_COMMENT
)*
;
Related
I am creating parser and lexer rules for the Decaf programming language in ANTLR4. I'm trying to parse a test file and keep getting an error; there must be something wrong in the grammar, but I can't figure it out.
My test file looks like:
class Program {
int i[10];
}
The error is : line 2:8 mismatched input '10' expecting INT_LITERAL
And here is the full Decaf.g4 grammar file
grammar Decaf;
/*
LEXER RULES
-----------
Lexer rules define the basic syntax of individual words and symbols of a
valid Decaf program. Lexer rules follow regular expression syntax.
Complete the lexer rules following the Decaf Language Specification.
*/
CLASS : 'class';
INT : 'int';
RETURN : 'return';
VOID : 'void';
IF : 'if';
ELSE : 'else';
FOR : 'for';
BREAK : 'break';
CONTINUE : 'continue';
CALLOUT : 'callout';
TRUE : 'True' ;
FALSE : 'False' ;
BOOLEAN : 'boolean';
LCURLY : '{';
RCURLY : '}';
LBRACE : '(';
RBRACE : ')';
LSQUARE : '[';
RSQUARE : ']';
ADD : '+';
SUB : '-';
MUL : '*';
DIV : '/';
EQ : '=';
SEMI : ';';
COMMA : ',';
AND : '&&';
LESS : '<';
GREATER : '>';
LESSEQUAL : '<=' ;
GREATEREQUAL : '>=' ;
EQUALTO : '==' ;
NOTEQUAL : '!=' ;
EXCLAMATION : '!';
fragment CHAR : (' '..'!') | ('#'..'&') | ('('..'[') | (']'..'~') | ('\\'[']) | ('\\"') | ('\\') | ('\t') | ('\n');
CHAR_LITERAL : '\'' CHAR '\'';
//STRING_LITERAL : '"' CHAR+ '"' ;
HEXMARK : '0x';
fragment HEXA : [a-fA-F];
fragment HEXDIGIT : DIGIT | HEXA ;
HEX_LITERAL : HEXMARK HEXDIGIT+;
STRING : '"' (ESC|.)*? '"';
fragment ESC : '\\"' | '\\\\';
fragment DIGIT : [0-9];
DECIMAL_LITERAL : DIGIT(DIGIT)*;
COMMENT : '//' ~('\n')* '\n' -> skip;
WS : (' ' | '\n' | '\t' | '\r') + -> skip;
fragment ALPHA : [a-zA-Z] | '_';
fragment ALPHA_NUM : ALPHA | DIGIT;
ID : ALPHA ALPHA_NUM*;
INT_LITERAL : DECIMAL_LITERAL | HEX_LITERAL;
BOOL_LITERAL : TRUE | FALSE;
/*
PARSER RULES
------------
Parser rules are all lower case, and make use of lexer rules defined above
and other parser rules defined below. Parser rules also follow regular
expression syntax. Complete the parser rules following the Decaf Language
Specification.
*/
program : CLASS ID LCURLY field_decl* method_decl* RCURLY EOF;
field_name : ID | ID LSQUARE INT_LITERAL RSQUARE;
field_decl : datatype field_name (COMMA field_name)* SEMI;
method_decl : (datatype | VOID) ID LBRACE ((datatype ID) (COMMA datatype ID)*)? RBRACE block;
block : LCURLY var_decl* statement* RCURLY;
var_decl : datatype ID (COMMA ID)* SEMI;
datatype : INT | BOOLEAN;
statement : location assign_op expr SEMI
| method_call SEMI
| IF LBRACE expr RBRACE block (ELSE block)?
| FOR ID EQ expr COMMA expr block
| RETURN (expr)? SEMI
| BREAK SEMI
| CONTINUE SEMI
| block;
assign_op : EQ
| ADD EQ
| SUB EQ;
method_call : method_name LBRACE (expr (COMMA expr)*)? RBRACE
| CALLOUT LBRACE STRING(COMMA callout_arg (COMMA callout_arg)*) RBRACE;
method_name : ID;
location : ID | ID LSQUARE expr RSQUARE;
expr : location
| method_call
| literal
| expr bin_op expr
| SUB expr
| EXCLAMATION expr
| LBRACE expr RBRACE;
callout_arg : expr
| STRING ;
bin_op : arith_op
| rel_op
| eq_op
| cond_op;
arith_op : ADD | SUB | MUL | DIV | '%' ;
rel_op : LESS | GREATER | LESSEQUAL | GREATEREQUAL ;
eq_op : EQUALTO | NOTEQUAL ;
cond_op : AND | '||' ;
literal : INT_LITERAL | CHAR_LITERAL | BOOL_LITERAL ;
Whenever there are 2 or more lexer rules that match the same characters, the one defined first wins. In your case, these 2 rules both match 10:
DECIMAL_LITERAL : DIGIT(DIGIT)*;
INT_LITERAL : DECIMAL_LITERAL | HEX_LITERAL;
and since INT_LITERAL is defined after DECIMAL_LITERAL, the lexer will never create an INT_LITERAL token. If you then try to use it in a parser rule, you get the error message you posted.
The solution: remove INT_LITERAL from your lexer and create a parser rule instead:
int_literal : DECIMAL_LITERAL | HEX_LITERAL;
and use int_literal in your parser rules instead.
This is my grammar:
grammar FOOL;
// Code copied to the top of the generated classes; brings ArrayList into
// scope for the lexer members declared in this grammar.
@header {
    import java.util.ArrayList;
}
// Lexer state: one message is appended by ERR_UNKNOWN_CHAR for every
// character that no other lexer rule matches.
@lexer::members {
    public ArrayList<String> lexicalErrors = new ArrayList<>();
}
/*------------------------------------------------------------------
* PARSER RULES
*------------------------------------------------------------------*/
prog : exp SEMIC #singleExp
| let exp SEMIC #letInExp
| (classdec)+ SEMIC (let)? exp SEMIC #classExp
;
classdec : CLASS ID ( EXTENDS ID )? (LPAR (vardec ( COMMA vardec)*)? RPAR)? (CLPAR ((fun SEMIC)+)? CRPAR)?;
let : LET (dec SEMIC)+ IN ;
vardec : type ID ;
varasm : vardec ASM exp ;
fun : type ID LPAR ( vardec ( COMMA vardec)* )? RPAR (let)? exp ;
dec : varasm #varAssignment
| fun #funDeclaration
;
type : INT
| BOOL
| ID
;
exp : left=term (operator=(PLUS | MINUS) right=term)*
;
term : left=factor (operator=(TIMES | DIV) right=factor)*
;
factor : left=value (operator=(EQ | LESSEQ | GREATEREQ | GREATER | LESS | AND | OR ) right=value)*
;
value : MINUS?INTEGER #intVal
| (NOT)? ( TRUE | FALSE ) #boolVal
| LPAR exp RPAR #baseExp
| IF cond=exp THEN CLPAR thenBranch=exp CRPAR (ELSE CLPAR elseBranch=exp CRPAR)? #ifExp
| MINUS?ID #varExp
| THIS #thisExp
| funcall #funExp
| (ID | THIS) DOT funcall #methodExp
| NEW ID ( LPAR (exp (COMMA exp)* )? RPAR)? #newExp
| PRINT ( exp ) #print
;
/* PRINT LPAR exp RPAR */
funcall
: ID ( LPAR (exp (COMMA exp)* )? RPAR )
;
/*------------------------------------------------------------------
* LEXER RULES
*------------------------------------------------------------------*/
SEMIC : ';' ;
COLON : ':' ;
COMMA : ',' ;
EQ : '==' ;
ASM : '=' ;
PLUS : '+' ;
MINUS : '-' ;
TIMES : '*' ;
DIV : '/' ;
TRUE : 'true' ;
FALSE : 'false' ;
LPAR : '(' ;
RPAR : ')' ;
CLPAR : '{' ;
CRPAR : '}' ;
IF : 'if' ;
THEN : 'then' ;
ELSE : 'else' ;
PRINT : 'print' ;
LET : 'let' ;
IN : 'in' ;
VAR : 'var' ;
FUN : 'fun' ;
INT : 'int' ;
BOOL : 'bool' ;
CLASS : 'class' ;
EXTENDS : 'extends' ;
THIS : 'this' ;
NEW : 'new' ;
DOT : '.' ;
LESSEQ : ('<=' | '=<') ;
GREATEREQ : ('>=' | '=>') ;
GREATER: '>' ;
LESS : '<' ;
AND : '&&' ;
OR : '||' ;
NOT : '!' ;
//Numbers
fragment DIGIT : '0'..'9';
INTEGER : DIGIT+;
//IDs
fragment CHAR : 'a'..'z' |'A'..'Z' ;
ID : CHAR (CHAR | DIGIT)* ;
//ESCAPED SEQUENCES
WS : (' '|'\t'|'\n'|'\r')-> skip;
LINECOMENTS : '//' (~('\n'|'\r'))* -> skip;
BLOCKCOMENTS : '/*'( ~('/'|'*')|'/'~'*'|'*'~'/'|BLOCKCOMENTS)* '*/' -> skip;
ERR_UNKNOWN_CHAR
: . { lexicalErrors.add("UNKNOWN_CHAR " + getText()); }
;
I think that there is a problem in the grammar concerning operator precedence.
In particular, this one
let
int x = (5-2)+4;
in
print x;
prints 7, while this one:
let
int x = 5-2+4;
in
print x;
prints 9.
Why does the first one work? How can I make the second one work by changing only the grammar?
I think there is something to change in exp, term or factor.
This is the first parse tree http://it.tinypic.com/r/2nj8tqw/9 .
This is the second parse tree http://it.tinypic.com/r/2iv02z6/9 .
exp : left=term (operator=(PLUS | MINUS) right=exp)?
This rule produces the parse tree that causes the problem. Because the right operand is exp again, 5 - 2 + 4 is parsed right-associatively, i.e. as 5 - (2 + 4):
term MINUS exp
5          term PLUS exp
           2         term
                     4
This should help, although you'll have to change the evaluation logic:
exp : left=term (operator=(PLUS | MINUS) right=term)*
Same for factor and any other possible binary operations.
I'm trying to extend the grammar of the Tiny Language to treat assignment as expression. Thus it would be valid to write
a = b = 1; // -> a = (b = 1)
a = 2 * (b = 1); // contrived but valid
a = 1 = 2; // invalid
Assignment differs from other operators in two aspects. It's right associative (not a big deal), and its left-hand side has to be a variable. So I changed the grammar like this
statement: assignmentExpr | functionCall ...;
assignmentExpr: Identifier indexes? '=' expression;
expression: assignmentExpr | condExpr;
It doesn't work, because it contains a non-LL(*) decision. I also tried this variant:
assignmentExpr: Identifier indexes? '=' (expression | condExpr);
but I got the same error. I am interested in:
- This specific question
- Given a grammar with a non-LL(*) decision, how to find the two paths that cause the problem
- How to fix it
I think you can change your grammar like this to achieve the same, without using syntactic predicates:
statement: Expr ';' | functionCall ';'...;
Expr: Identifier indexes? '=' Expr | condExpr ;
condExpr: .... and so on;
I altered Bart's example with this idea in mind:
grammar TL;
options {
output=AST;
}
tokens {
ROOT;
}
parse
: stat+ EOF -> ^(ROOT stat+)
;
stat
: expr ';'
;
expr
: Id Assign expr -> ^(Assign Id expr)
| add
;
add
: mult (('+' | '-')^ mult)*
;
mult
: atom (('*' | '/')^ atom)*
;
atom
: Id
| Num
| '('! expr ')' !
;
Assign : '=' ;
Comment : '//' ~('\r' | '\n')* {skip();};
Id : 'a'..'z'+;
Num : '0'..'9'+;
Space : (' ' | '\t' | '\r' | '\n')+ {skip();};
And for the input:
a=b=4;
a = 2 * (b = 1);
you get following parse tree:
The key here is that you need to "assure" the parser that inside an expression, there is something ahead that satisfies the expression. This can be done using a syntactic predicate (the ( ... )=> parts in the add and mult rules).
A quick demo:
grammar TL;
options {
output=AST;
}
tokens {
ROOT;
ASSIGN;
}
// Entry point: zero or more statements up to EOF, collected under a ROOT node.
// The rewrite uses stat* to match the cardinality of the matched stat*; a
// stat+ rewrite would fail on input that contains no statements.
parse
    : stat* EOF -> ^(ROOT stat*)
    ;
stat
: expr ';' -> expr
;
expr
: add
;
add
: mult ((('+' | '-') mult)=> ('+' | '-')^ mult)*
;
mult
: atom ((('*' | '/') atom)=> ('*' | '/')^ atom)*
;
atom
: (Id -> Id) ('=' expr -> ^(ASSIGN Id expr))?
| Num
| '(' expr ')' -> expr
;
Comment : '//' ~('\r' | '\n')* {skip();};
Id : 'a'..'z'+;
Num : '0'..'9'+;
Space : (' ' | '\t' | '\r' | '\n')+ {skip();};
which will parse the input:
a = b = 1; // -> a = (b = 1)
a = 2 * (b = 1); // contrived but valid
into the following AST:
I'm parsing CoCo/R grammars in a utility to automate CoCo -> ANTLR translation. The core ANTLR grammar is:
rule '=' expression '.' ;
expression
: term ('|' term)*
-> ^( OR_EXPR term term* )
;
term
: (factor (factor)*)? ;
factor
: symbol
| '(' expression ')'
-> ^( GROUPED_EXPR expression )
| '[' expression']'
-> ^( OPTIONAL_EXPR expression)
| '{' expression '}'
-> ^( SEQUENCE_EXPR expression)
;
symbol
: IF_ACTION
| ID (ATTRIBUTES)?
| STRINGLITERAL
;
My problem is with constructions such as these:
CS = { ExternAliasDirective }
{ UsingDirective }
EOF .
CS results in an AST with an OR_EXPR node although no '|' character
actually appears. I'm sure this is due to the definition of
expression, but I cannot see any other way to write the rules.
I did experiment with this to resolve the ambiguity.
// explicitly test for the presence of an '|' character
// 'ored' records whether the token following the first term is OR; it then
// selects between the OR_EXPR and LIST rewrites.
expression
@init { bool ored = false; }
    : term {ored = (input.LT(1).Type == OR); } (OR term)*
        -> {ored}? ^(OR_EXPR term term*)
        -> ^(LIST term term*)
    ;
It works but the hack reinforces my conviction that something fundamental is wrong.
Any tips much appreciated.
Your rule:
expression
: term ('|' term)*
-> ^( OR_EXPR term term* )
;
always causes the rewrite rule to create a tree with a root of type OR_EXPR. You can create "sub rewrite rules" like this:
expression
: (term -> REWRITE_RULE_X) ('|' term -> ^(REWRITE_RULE_Y))*
;
And to resolve the ambiguity in your grammar, it's easiest to enable global backtracking which can be done in the options { ... } section of your grammar.
A quick demo:
grammar CocoR;
options {
output=AST;
backtrack=true;
}
tokens {
RULE;
GROUP;
SEQUENCE;
OPTIONAL;
OR;
ATOMS;
}
parse
: rule EOF -> rule
;
rule
: ID '=' expr* '.' -> ^(RULE ID expr*)
;
expr
: (a=atoms -> $a) ('|' b=atoms -> ^(OR $expr $b))*
;
atoms
: atom+ -> ^(ATOMS atom+)
;
atom
: ID
| '(' expr ')' -> ^(GROUP expr)
| '{' expr '}' -> ^(SEQUENCE expr)
| '[' expr ']' -> ^(OPTIONAL expr)
;
ID
: ('a'..'z' | 'A'..'Z') ('a'..'z' | 'A'..'Z' | '0'..'9')*
;
Space
: (' ' | '\t' | '\r' | '\n') {skip();}
;
with input:
CS = { ExternAliasDirective }
{ UsingDirective }
EOF .
produces the AST:
and the input:
foo = a | b ({c} | d [e f]) .
produces:
The class to test this:
import org.antlr.runtime.*;
import org.antlr.runtime.tree.*;
import org.antlr.stringtemplate.*;
/**
 * Driver for the CocoR demo grammar: parses one hard-coded rule and prints
 * the resulting AST in DOT form on standard output.
 */
public class Main {

    /**
     * Parses the sample rule and prints its DOT rendering.
     *
     * <p>Alternative input (the three-line sample from the question):
     * {@code "CS = { ExternAliasDirective } \n" + "{ UsingDirective } \n" + "EOF . "}
     */
    public static void main(String[] args) throws Exception {
        String input = "foo = a | b ({c} | d [e f]) .";
        System.out.println(renderDot(input));
    }

    /** Lexes and parses {@code input}, then converts the AST to a DOT template. */
    private static StringTemplate renderDot(String input) throws Exception {
        CocoRLexer lexer = new CocoRLexer(new ANTLRStringStream(input));
        CocoRParser parser = new CocoRParser(new CommonTokenStream(lexer));
        CommonTree ast = (CommonTree) parser.parse().getTree();
        return new DOTTreeGenerator().toDOT(ast);
    }
}
and with the output this class produces, I used the following website to create the AST-images: http://graph.gafol.net/
HTH
EDIT
To account for epsilon (empty string) in your OR expressions, you might try something (quickly tested!) like this:
expr
: (a=atoms -> $a) ( ( '|' b=atoms -> ^(OR $expr $b)
| '|' -> ^(OR $expr NOTHING)
)
)*
;
which parses the source:
foo = a | b | .
into the following AST:
The production for expression explicitly says that it can only return an OR_EXPR node. You can try something like:
expression
:
term
|
term ('|' term)+
-> ^( OR_EXPR term term* )
;
Further down, you could use:
term
: factor*;
I'm working on a simple string manipulation DSL for internal purposes, and I would like the language to support string interpolation as it is used in Ruby.
For example:
name = "Bob"
msg = "Hello ${name}!"
print(msg) # prints "Hello Bob!"
I'm attempting to implement my parser in ANTLRv3, but I'm pretty inexperienced with using ANTLR so I'm unsure how to implement this feature. So far, I've specified my string literals in the lexer, but in this case I'll obviously need to handle the interpolation content in the parser.
My current string literal grammar looks like this:
STRINGLITERAL : '"' ( StringEscapeSeq | ~( '\\' | '"' | '\r' | '\n' ) )* '"' ;
fragment StringEscapeSeq : '\\' ( 't' | 'n' | 'r' | '"' | '\\' | '$' | ('0'..'9')) ;
Moving the string literal handling into the parser seems to make everything else stop working as it should. Cursory web searches didn't yield any information. Any suggestions as to how to get started on this?
I'm no ANTLR expert, but here's a possible grammar:
grammar Str;
parse
: ((Space)* statement (Space)* ';')+ (Space)* EOF
;
statement
: print | assignment
;
print
: 'print' '(' (Identifier | stringLiteral) ')'
;
assignment
: Identifier (Space)* '=' (Space)* stringLiteral
;
stringLiteral
: '"' (Identifier | EscapeSequence | NormalChar | Space | Interpolation)* '"'
;
Interpolation
: '${' Identifier '}'
;
Identifier
: ('a'..'z' | 'A'..'Z' | '_') ('a'..'z' | 'A'..'Z' | '_' | '0'..'9')*
;
EscapeSequence
: '\\' SpecialChar
;
SpecialChar
: '"' | '\\' | '$'
;
Space
: (' ' | '\t' | '\r' | '\n')
;
NormalChar
: ~SpecialChar
;
As you can see, there are a couple of (Space)* occurrences inside the example grammar. This is because stringLiteral is a parser rule instead of a lexer rule. Therefore, when tokenizing the source file, the lexer cannot know whether a white space is part of a string literal or is just a space inside the source file that can be ignored.
I tested the example with a little Java class and all worked as expected:
/* the same grammar, but now with a bit of Java code in it */
grammar Str;
// Emitted at the top of the generated parser: package declaration plus the
// import needed by the parser members.
@parser::header {
    package antlrdemo;
    import java.util.HashMap;
}
// Emitted at the top of the generated lexer so it lives in the same package
// as the parser.
@lexer::header {
    package antlrdemo;
}
// Variable table: the assignment rule stores values here, and the print and
// stringLiteral rules look them up by identifier.
@parser::members {
    HashMap<String, String> vars = new HashMap<String, String>();
}
parse
: ((Space)* statement (Space)* ';')+ (Space)* EOF
;
statement
: print | assignment
;
print
: 'print' '('
( id=Identifier {System.out.println("> "+vars.get($id.text));}
| st=stringLiteral {System.out.println("> "+$st.value);}
)
')'
;
assignment
: id=Identifier (Space)* '=' (Space)* st=stringLiteral {vars.put($id.text, $st.value);}
;
stringLiteral returns [String value]
: '"'
{StringBuilder b = new StringBuilder();}
( id=Identifier {b.append($id.text);}
| es=EscapeSequence {b.append($es.text);}
| ch=(NormalChar | Space) {b.append($ch.text);}
| in=Interpolation {b.append(vars.get($in.text.substring(2, $in.text.length()-1)));}
)*
'"'
{$value = b.toString();}
;
Interpolation
: '${' i=Identifier '}'
;
Identifier
: ('a'..'z' | 'A'..'Z' | '_') ('a'..'z' | 'A'..'Z' | '_' | '0'..'9')*
;
EscapeSequence
: '\\' SpecialChar
;
SpecialChar
: '"' | '\\' | '$'
;
Space
: (' ' | '\t' | '\r' | '\n')
;
NormalChar
: ~SpecialChar
;
And a class with a main method to test it all:
package antlrdemo;
import org.antlr.runtime.*;
/** Runs the generated Str lexer and parser over a short hard-coded script. */
public class ANTLRDemo {

    /**
     * Builds the four-statement sample script and feeds it to the parser's
     * {@code parse} entry rule.
     *
     * @throws RecognitionException propagated from the generated parser
     */
    public static void main(String[] args) throws RecognitionException {
        StringBuilder script = new StringBuilder();
        script.append("name = \"Bob\"; \n");
        script.append("msg = \"Hello ${name}\"; \n");
        script.append("print(msg); \n");
        script.append("print(\"Bye \\${for} now!\"); ");

        StrLexer lexer = new StrLexer(new ANTLRStringStream(script.toString()));
        StrParser parser = new StrParser(new CommonTokenStream(lexer));
        parser.parse();
    }
}
which produces the following output:
> Hello Bob
> Bye \${for} now!
Again, I am no expert, but this (at least) gives you a way to solve it.
HTH.