我想解析规则手册"demo.rb"文件,如下所示:
rulebook Titanic-Normalization {
version 1
meta {
description "Test"
source "my-rules.xslx"
user "joltie"
}
rule remove-first-line {
description "Removes first line when offset is zero"
when(present(offset) && offset == 0) then {
filter-row-if-true true;
}
}
}
我编写了ANTLR4语法文件Rulebook.g4,如下所示。目前,它通常可以很好地解析 *.rb 文件,但在遇到"表达式"/"语句"规则时会抛出意外错误。
grammar Rulebook;
// ---------------------------------------------------------------------------
// Structural parser rules: the fixed outer skeleton of a rulebook file.
// ---------------------------------------------------------------------------

// Entry rule: `rulebook <name> { version <n> meta {...} rule {...}+ }`.
// The name may be a plain Identifier or a hyphenated GeneralIdentifier.
// At least one ruleStatement is required.
rulebookStatement
: KWRulebook
(GeneralIdentifier | Identifier)
'{'
KWVersion
VersionConstant
metaStatement
(ruleStatement)+
'}'
;
// Metadata block: description, source and user must all be present,
// in exactly this order, each followed by one string literal.
metaStatement
: KWMeta
'{'
KWDescription
StringLiteral
KWSource
StringLiteral
KWUser
StringLiteral
'}'
;
// A single named rule: a description string followed by one when/then block.
ruleStatement
: KWRule
(GeneralIdentifier | Identifier)
'{'
KWDescription
StringLiteral
whenThenStatement
'}'
;
// `when ( <expression> ) then { <statement> }`.
// Exactly one statement is accepted between the braces.
whenThenStatement
: KWWhen '(' expression ')'
KWThen '{' statement '}'
;
// ---------------------------------------------------------------------------
// Expression rules, tightest-binding first (layout follows the C grammar).
// ---------------------------------------------------------------------------

// Atoms: identifiers, one or more adjacent string literals, or a
// parenthesised expression. There is no alternative for a numeric literal
// or for the KWTrue/KWFalse tokens, so those inputs cannot be matched here.
primaryExpression
: GeneralIdentifier
| Identifier
| StringLiteral+
| '(' expression ')'
;
// Postfix forms: indexing, call, member access (`.` / `->`), `++`, `--`.
// Left-recursive so that postfix operators chain left to right.
postfixExpression
: primaryExpression
| postfixExpression '[' expression ']'
| postfixExpression '(' argumentExpressionList? ')'
| postfixExpression '.' Identifier
| postfixExpression '->' Identifier
| postfixExpression '++'
| postfixExpression '--'
;
// Comma-separated call arguments; each argument is an assignmentExpression.
argumentExpressionList
: assignmentExpression
| argumentExpressionList ',' assignmentExpression
;
// Prefix forms: `++x`, `--x`, or a unaryOperator applied to a castExpression.
unaryExpression
: postfixExpression
| '++' unaryExpression
| '--' unaryExpression
| unaryOperator castExpression
;
// Single-character prefix operators.
unaryOperator
: '&' | '*' | '+' | '-' | '~' | '!'
;
// No real cast syntax here: either a unaryExpression or a bare digit run.
castExpression
: unaryExpression
| DigitSequence // intended to accept a bare number; the lexer declares VersionConstant before DigitSequence and it matches the same digit runs, so digit-only text arrives as VersionConstant and this alternative is not reached
;
// ---------------------------------------------------------------------------
// Binary operator ladder. Each rule handles one precedence level and falls
// through to the next tighter level; every level is left-recursive, so
// operators of equal precedence group left to right.
// ---------------------------------------------------------------------------

// `*`, `/`, `%`
multiplicativeExpression
: castExpression
| multiplicativeExpression '*' castExpression
| multiplicativeExpression '/' castExpression
| multiplicativeExpression '%' castExpression
;
// `+`, `-`
additiveExpression
: multiplicativeExpression
| additiveExpression '+' multiplicativeExpression
| additiveExpression '-' multiplicativeExpression
;
// `<<`, `>>`
shiftExpression
: additiveExpression
| shiftExpression '<<' additiveExpression
| shiftExpression '>>' additiveExpression
;
// `<`, `>`, `<=`, `>=`
relationalExpression
: shiftExpression
| relationalExpression '<' shiftExpression
| relationalExpression '>' shiftExpression
| relationalExpression '<=' shiftExpression
| relationalExpression '>=' shiftExpression
;
// `==`, `!=`
equalityExpression
: relationalExpression
| equalityExpression '==' relationalExpression
| equalityExpression '!=' relationalExpression
;
// Bitwise `&`
andExpression
: equalityExpression
| andExpression '&' equalityExpression
;
// Bitwise `^`
exclusiveOrExpression
: andExpression
| exclusiveOrExpression '^' andExpression
;
// Bitwise `|`
inclusiveOrExpression
: exclusiveOrExpression
| inclusiveOrExpression '|' exclusiveOrExpression
;
// Logical `&&`
logicalAndExpression
: inclusiveOrExpression
| logicalAndExpression '&&' inclusiveOrExpression
;
// Logical `||` (loosest binary level)
logicalOrExpression
: logicalAndExpression
| logicalOrExpression '||' logicalAndExpression
;
// Optional ternary `cond ? a : b`; the else-branch recurses so that nested
// ternaries group to the right.
conditionalExpression
: logicalOrExpression ('?' expression ':' conditionalExpression)?
;
// Either a conditionalExpression or `target op= value` (right-recursive).
assignmentExpression
: conditionalExpression
| unaryExpression assignmentOperator assignmentExpression
| DigitSequence // intended to accept a bare number; digit-only text is tokenized as VersionConstant (declared earlier in the lexer), so this alternative is not reached
;
// Plain and compound assignment operators.
assignmentOperator
: '=' | '*=' | '/=' | '%=' | '+=' | '-=' | '<<=' | '>>=' | '&=' | '^=' | '|='
;
// Comma-separated sequence of assignmentExpressions.
expression
: assignmentExpression
| expression ',' assignmentExpression
;
// The only statement form is an expression statement.
statement
: expressionStatement
;
// One or more expressions written side by side and closed with `;`.
// The juxtaposition is what lets `filter-row-if-true true;` be read as two
// consecutive expressions.
expressionStatement
: expression+ ';'
;
// ---------------------------------------------------------------------------
// Keyword tokens. They are declared before Identifier, so when a keyword and
// Identifier match the same text the keyword token is the one produced.
// ---------------------------------------------------------------------------
KWRulebook: 'rulebook';
KWVersion: 'version';
KWMeta: 'meta';
KWDescription: 'description';
KWSource: 'source';
KWUser: 'user';
KWRule: 'rule';
KWWhen: 'when';
KWThen: 'then';
// KWTrue / KWFalse are not referenced by any parser rule in this grammar.
// Because they precede Identifier, the words `true` and `false` become these
// tokens rather than Identifier, and primaryExpression has no alternative
// that accepts them.
KWTrue: 'true';
KWFalse: 'false';
// Punctuation fragments. Fragment rules never produce tokens by themselves,
// and no lexer rule in this grammar refers to them; the parser rules use the
// quoted literals ('(' , ')' , ...) directly.
fragment
LeftParen : '(';
fragment
RightParen : ')';
fragment
LeftBracket : '[';
fragment
RightBracket : ']';
fragment
LeftBrace : '{';
fragment
RightBrace : '}';
// Plain identifier: a letter or underscore followed by letters, digits or
// underscores.
Identifier
: IdentifierNondigit
( IdentifierNondigit
| Digit
)*
;
// Hyphenated identifier such as `remove-first-line`: two or more Identifier
// parts joined by single '-' characters (at least one hyphen is required).
GeneralIdentifier
: Identifier
('-' Identifier)+
;
fragment
IdentifierNondigit
: Nondigit
//| // other implementation-defined characters...
;
// Dotted number such as `1` or `1.2.3`. A lone digit run also matches, so
// every digit-only lexeme is tokenized as VersionConstant: it is declared
// before DigitSequence and wins when both match the same text.
VersionConstant
: DigitSequence ('.' DigitSequence)*
;
// One or more digits. Declared as a token (not a fragment), but shadowed by
// VersionConstant above for any input it could match.
DigitSequence
: Digit+
;
// ASCII letter or underscore.
fragment
Nondigit
: [a-zA-Z_]
;
// ASCII decimal digit.
fragment
Digit
: [0-9]
;
// String literal in double or single quotes. The body is optional, so ""
// and '' are valid empty strings.
// NOTE(review): both quote styles share SChar, which excludes '"' but not
// '\''; a single-quoted literal therefore cannot contain '"' and will extend
// to the last '\'' on the line. Confirm whether that is intended.
StringLiteral
    : '"' SCharSequence? '"'
    | '\'' SCharSequence? '\''
    ;

fragment
SCharSequence
    : SChar+
    ;

// One string character: anything except a double quote, a backslash or a
// line break, or a backslash-newline pair (line continuation).
fragment
SChar
    : ~["\\\r\n]
    | '\\\n'   // Added line: backslash followed by LF
    | '\\\r\n' // Added line: backslash followed by CRLF
    ;

// Spaces and tabs are discarded.
Whitespace
    : [ \t]+
      -> skip
    ;

// Line breaks (CR, CRLF or LF) are discarded.
Newline
    : ( '\r' '\n'?
      | '\n'
      )
      -> skip
    ;

// Non-greedy so the comment ends at the first `*/`.
BlockComment
    : '/*' .*? '*/'
      -> skip
    ;

// Runs to the end of the current line; the line break itself is left for
// the Newline rule.
LineComment
    : '//' ~[\r\n]*
      -> skip
    ;
我用如下所示的单元测试测试了规则手册解析器:
public void testScanRulebookFile() throws IOException {
String fileName = "C:\rulebooks\demo.rb";
FileInputStream fis = new FileInputStream(fileName);
// create a CharStream that reads from standard input
CharStream input = CharStreams.fromStream(fis);
// create a lexer that feeds off of input CharStream
RulebookLexer lexer = new RulebookLexer(input);
// create a buffer of tokens pulled from the lexer
CommonTokenStream tokens = new CommonTokenStream(lexer);
// create a parser that feeds off the tokens buffer
RulebookParser parser = new RulebookParser(tokens);
RulebookStatementContext context = parser.rulebookStatement();
// WhenThenStatementContext context = parser.whenThenStatement();
System.out.println(context.toStringTree(parser));
// ParseTree tree = parser.getContext(); // begin parsing at init rule
// System.out.println(tree.toStringTree(parser)); // print LISP-style tree
}
对于上面的"demo.rb",解析器得到的错误如下。我还打印 RulebookStatementContext 作为 toStringTree。
line 12:25 mismatched input '&&' expecting ')'
(rulebookStatement rulebook Titanic-Normalization { version 1 (metaStatement meta { description "Test" source "my-rules.xslx" user "joltie" }) (ruleStatement rule remove-first-line { description "Removes first line when offset is zero" (whenThenStatement when ( (expression (assignmentExpression (conditionalExpression (logicalOrExpression (logicalAndExpression (inclusiveOrExpression (exclusiveOrExpression (andExpression (equalityExpression (relationalExpression (shiftExpression (additiveExpression (multiplicativeExpression (castExpression (unaryExpression (postfixExpression (postfixExpression (primaryExpression present)) ( (argumentExpressionList (assignmentExpression (conditionalExpression (logicalOrExpression (logicalAndExpression (inclusiveOrExpression (exclusiveOrExpression (andExpression (equalityExpression (relationalExpression (shiftExpression (additiveExpression (multiplicativeExpression (castExpression (unaryExpression (postfixExpression (primaryExpression offset))))))))))))))))) ))))))))))))))))) && offset == 0 ) then { filter-row-if-true true ;) }) })
为了调试这个问题,我还编写了单元测试来测试更短的输入,例如 "when (offset == 0) then {\n" + "filter-row-if-true true;\n" + "}\n"。
但它仍然得到这样的错误:
line 1:16 mismatched input '0' expecting {'(', '++', '--', '&&', '&', '*', '+', '-', '~', '!', Identifier, GeneralIdentifier, DigitSequence, StringLiteral}
line 2:19 extraneous input 'true' expecting {'(', '++', '--', '&&', '&', '*', '+', '-', '~', '!', ';', Identifier, GeneralIdentifier, DigitSequence, StringLiteral}
经过两天的尝试,我没有任何进展。问题如上,希望有人能就如何调试 ANTLR4 语法中的"多余输入"(extraneous input)/"不匹配输入"(mismatched input)错误给我一些建议。
我不知道是否有更复杂的方法来调试语法/解析器,但这是我通常的做法:
- 将导致问题的输入缩减到尽可能少的字符。
- 尽可能精简语法,使其在相应的输入上仍然产生相同的错误(大多数情况下,这意味着复用原始语法中的规则并尽量简化,为缩减后的输入编写一个最小语法)。
- 确保词法分析器正确地切分了输入(ANTLRWorks 中显示词法分析器输出的功能对此非常有用)。
- 查看解析树。ANTLR 的 TestRig 有一个以图形方式显示解析树的功能(可以通过 ANTLRWorks 或 ANTLR 的 TreeViewer 使用它),这样就能看出解析器对输入的理解与你的预期在哪里不同。
- "手动"进行解析。也就是说,拿着语法自己逐步走一遍输入,在此过程中尽量不要带入任何逻辑、假设或已有知识,只像计算机一样严格按照语法执行。对你走的每一步都提出质疑(是否还有另一种方式可以匹配这段输入),并始终尝试用不同于你期望的方式去匹配输入。
- 先在最小语法中修复错误,然后把解决方案迁移到真正的语法中。
除了 Raven 的回答之外,我还使用了 IntelliJ 12+ 的 ANTLR 4 插件,它为我节省了大量调试语法的精力。我曾遇到一个非常简单却一直找不到的错误(浮点数规则中写了未转义的点 . 而不是带引号的 '.')。该工具允许选择语法中的任意解析器规则,用输入对其进行测试,并以图形方式显示解析树。直到我开始寻找调试语法的方法,我才注意到它有这个非常有用的功能。强烈推荐。
更新 g4 文件以修复解析错误
grammar Rulebook;

// Emitted verbatim at the top of every generated source file.
@header {
package com.someone.commons.rulebook.parser;
}

// Top-level rule: a named rulebook with a version, one meta block and at
// least one rule.
rulebookStatement
    : KWRulebook (GeneralIdentifier | Identifier) '{'
          KWVersion VersionConstant
          metaStatement
          ruleStatement+
      '}'
    ;

// Fixed-order metadata: description, source, user -- one string each.
metaStatement
    : KWMeta '{'
          KWDescription StringLiteral
          KWSource      StringLiteral
          KWUser        StringLiteral
      '}'
    ;

// A named rule: description string plus a single when/then block.
ruleStatement
    : KWRule (GeneralIdentifier | Identifier) '{'
          KWDescription StringLiteral
          whenThenStatement
      '}'
    ;

// Guard expression in parentheses, then a brace-delimited body holding zero
// or more statements.
whenThenStatement
    : KWWhen '(' expression ')' KWThen '{' statement* '}'
    ;
// Atoms of the expression grammar: identifiers, one or more adjacent string
// literals, numeric literals, a parenthesised expression, or a bracketed
// expression.
primaryExpression
    : GeneralIdentifier
    | Identifier
    | StringLiteral+
    | Constant
    // The lexer declares VersionConstant before Constant, and it matches every
    // digit run (`0`, `42`, `1.5`); Constant additionally cannot match `0`.
    // Such literals therefore arrive as VersionConstant tokens and must be
    // accepted here, otherwise `offset == 0` fails with a mismatched-input
    // error. Constant still covers forms like `.5` and `1.`.
    | VersionConstant
    | '(' expression ')'
    | '[' expression ']'
    ;
// Postfix operators applied to a primary expression. Left recursion makes
// chains such as `a[0](x).b` group left to right.
postfixExpression
    : primaryExpression
    | postfixExpression '[' expression ']'                // indexing
    | postfixExpression '(' argumentExpressionList? ')'   // call
    | postfixExpression '.' Identifier                    // member access
    | postfixExpression '->' Identifier                   // member access via arrow
    | postfixExpression '++'                              // post-increment
    | postfixExpression '--'                              // post-decrement
    ;

// One or more call arguments separated by commas.
argumentExpressionList
    : assignmentExpression
    | argumentExpressionList ',' assignmentExpression
    ;

// Prefix increment/decrement, or a unary operator applied to its operand.
unaryExpression
    : postfixExpression
    | '++' unaryExpression
    | '--' unaryExpression
    | unaryOperator castExpression
    ;

// Single-character prefix operators.
unaryOperator
    : '&'
    | '*'
    | '+'
    | '-'
    | '~'
    | '!'
    ;

// Pass-through level: this grammar defines no cast syntax.
castExpression
    : unaryExpression
    ;
// ---------------------------------------------------------------------------
// Binary operators, one rule per precedence level (tightest first). Each
// level is left-recursive and its right operand is the next tighter level,
// so operators of the same level group left to right.
// ---------------------------------------------------------------------------

multiplicativeExpression
    : castExpression
    | multiplicativeExpression ('*' | '/' | '%') castExpression
    ;

additiveExpression
    : multiplicativeExpression
    | additiveExpression ('+' | '-') multiplicativeExpression
    ;

shiftExpression
    : additiveExpression
    | shiftExpression ('<<' | '>>') additiveExpression
    ;

relationalExpression
    : shiftExpression
    | relationalExpression ('<' | '>' | '<=' | '>=') shiftExpression
    ;

equalityExpression
    : relationalExpression
    | equalityExpression ('==' | '!=') relationalExpression
    ;

// Bitwise AND
andExpression
    : equalityExpression
    | andExpression '&' equalityExpression
    ;

// Bitwise XOR
exclusiveOrExpression
    : andExpression
    | exclusiveOrExpression '^' andExpression
    ;

// Bitwise OR
inclusiveOrExpression
    : exclusiveOrExpression
    | inclusiveOrExpression '|' exclusiveOrExpression
    ;

// Logical AND
logicalAndExpression
    : inclusiveOrExpression
    | logicalAndExpression '&&' inclusiveOrExpression
    ;

// Logical OR (loosest binary level)
logicalOrExpression
    : logicalAndExpression
    | logicalOrExpression '||' logicalAndExpression
    ;
// Optional ternary. The middle operand may be omitted (`a ? : b`), and the
// else-branch recurses so nested ternaries group to the right.
conditionalExpression
    : logicalOrExpression
      ( '?' expression? ':' conditionalExpression )?
    ;

// A conditional expression, or `target <op>= value` (right-recursive).
assignmentExpression
    : conditionalExpression
    | unaryExpression assignmentOperator assignmentExpression
    ;

// Plain and compound assignment operators.
assignmentOperator
    : '='
    | '*=' | '/=' | '%='
    | '+=' | '-='
    | '<<=' | '>>='
    | '&=' | '^=' | '|='
    ;

// Comma-separated sequence of assignment expressions.
expression
    : assignmentExpression
    | expression ',' assignmentExpression
    ;

// Expression statements are the only statement form.
statement
    : expressionStatement
    ;

// One or more expressions written side by side, closed by `;`. This is what
// allows `filter-row-if-true true;` to be read as two consecutive expressions.
expressionStatement
    : expression+ ';'
    ;
// ---------------------------------------------------------------------------
// Keywords. Declared before Identifier so that, when both match the same
// text, the keyword token is produced.
// ---------------------------------------------------------------------------
KWRulebook    : 'rulebook';
KWVersion     : 'version';
KWMeta        : 'meta';
KWDescription : 'description';
KWSource      : 'source';
KWUser        : 'user';
KWRule        : 'rule';
KWWhen        : 'when';
KWThen        : 'then';

// Plain identifier: letter or underscore, then letters, digits, underscores.
Identifier
    : IdentifierNondigit
      ( IdentifierNondigit
      | Digit
      )*
    ;

// Identifier extended with '-' or '.' separated segments, e.g.
// `remove-first-line` or `my-rules.v2`.
//
// Every separator must be followed by at least one letter, digit or
// underscore. Allowing a bare trailing or repeated separator made the lexer
// swallow operator characters: `i--` became a single token and `a->b` was
// split as `a-`, `>`, `b`, so the postfix `--` and `->` alternatives could
// not be reached after an identifier.
//
// A lexeme without any separator also matches Identifier, which is declared
// first and therefore wins. `a-b` without spaces is still one token; write
// `a - b` for subtraction.
GeneralIdentifier
    : Identifier
      ( ('-' | '.')
        ( IdentifierNondigit
        | Digit
        )+
      )*
    ;

fragment
IdentifierNondigit
    : Nondigit
    //| // other implementation-defined characters...
    ;
// Dotted number such as `1` or `1.2.3`; a lone digit run matches as well.
// Declared before Constant, so whenever both match the same text (`42`,
// `1.5`) the token produced is VersionConstant.
VersionConstant : DigitSequence ( '.' DigitSequence )* ;

// Numeric literal. Given the rule above, this token is only produced for
// text VersionConstant cannot fully match, e.g. `.5` or `1.`.
Constant
    : IntegerConstant
    | FloatingConstant
    ;

fragment IntegerConstant : DecimalConstant ;

// No leading zero allowed, so a lone `0` does not match here.
fragment DecimalConstant : NonzeroDigit Digit* ;

fragment FloatingConstant : DecimalFloatingConstant ;

fragment DecimalFloatingConstant : FractionalConstant ;

// `1.5`, `.5`, or `1.` -- a dot with digits on at least one side.
fragment FractionalConstant
    : DigitSequence? '.' DigitSequence
    | DigitSequence '.'
    ;

fragment DigitSequence : Digit+ ;

// ASCII letter or underscore.
fragment Nondigit : [a-zA-Z_] ;

fragment Digit : [0-9] ;

fragment NonzeroDigit : [1-9] ;
// String literal in double or single quotes; the body may be empty.
// Each quote style has its own character class so that a literal ends at the
// first matching closing quote and may contain the other quote character.
StringLiteral
    : '"' SCharSequence? '"'
    | '\'' SQChar* '\''
    ;

fragment
SCharSequence
    : SChar+
    ;

// Character inside a double-quoted string: anything except '"', a backslash
// or a line break, or a backslash-newline pair (line continuation).
fragment
SChar
    : ~["\\\r\n]
    | '\\\n'   // Added line: backslash followed by LF
    | '\\\r\n' // Added line: backslash followed by CRLF
    ;

// Character inside a single-quoted string: same as SChar, but the excluded
// quote is '\'' instead of '"'.
fragment
SQChar
    : ~['\\\r\n]
    | '\\\n'
    | '\\\r\n'
    ;

// Spaces and tabs are discarded.
Whitespace
    : [ \t]+
      -> skip
    ;

// Line breaks (CR, CRLF or LF) are discarded.
Newline
    : ( '\r' '\n'?
      | '\n'
      )
      -> skip
    ;

// Non-greedy so the comment ends at the first `*/`.
BlockComment
    : '/*' .*? '*/'
      -> skip
    ;

// Runs to the end of the current line; the line break itself is left for
// the Newline rule.
LineComment
    : '//' ~[\r\n]*
      -> skip
    ;