| """ANTLR3 runtime package"""
 | |
| 
 | |
| # begin[licence]
 | |
| #
 | |
| # [The "BSD licence"]
 | |
| # Copyright (c) 2005-2012 Terence Parr
 | |
| # All rights reserved.
 | |
| #
 | |
| # Redistribution and use in source and binary forms, with or without
 | |
| # modification, are permitted provided that the following conditions
 | |
| # are met:
 | |
| # 1. Redistributions of source code must retain the above copyright
 | |
| #    notice, this list of conditions and the following disclaimer.
 | |
| # 2. Redistributions in binary form must reproduce the above copyright
 | |
| #    notice, this list of conditions and the following disclaimer in the
 | |
| #    documentation and/or other materials provided with the distribution.
 | |
| # 3. The name of the author may not be used to endorse or promote products
 | |
| #    derived from this software without specific prior written permission.
 | |
| #
 | |
| # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 | |
| # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 | |
| # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 | |
| # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 | |
| # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 | |
| # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 | |
| # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 | |
| # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 | |
| # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 | |
| # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | |
| #
 | |
| # end[licence]
 | |
| 
 | |
| import sys
 | |
| import inspect
 | |
| 
 | |
| from .constants import compatible_api_versions, DEFAULT_CHANNEL, \
 | |
|      HIDDEN_CHANNEL, EOF, EOR_TOKEN_TYPE, INVALID_TOKEN_TYPE
 | |
| from .exceptions import RecognitionException, MismatchedTokenException, \
 | |
|      MismatchedRangeException, MismatchedTreeNodeException, \
 | |
|      NoViableAltException, EarlyExitException, MismatchedSetException, \
 | |
|      MismatchedNotSetException, FailedPredicateException, \
 | |
|      BacktrackingFailed, UnwantedTokenException, MissingTokenException
 | |
| from .tokens import CommonToken, SKIP_TOKEN
 | |
| 
 | |
| 
 | |
| class RecognizerSharedState(object):
 | |
|     """
 | |
|     The set of fields needed by an abstract recognizer to recognize input
 | |
|     and recover from errors etc...  As a separate state object, it can be
 | |
|     shared among multiple grammars; e.g., when one grammar imports another.
 | |
| 
 | |
|     These fields are publicly visible, but the actual state pointer per
 | |
|     parser is protected.
 | |
|     """
 | |
| 
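    # A minimal sketch of sharing one state object between two recognizers
    # (OuterParser and InnerParser are hypothetical generated classes; a
    # composite grammar normally wires this up for you):
    #
    #     shared = RecognizerSharedState()
    #     outer = OuterParser(token_stream, state=shared)
    #     inner = InnerParser(token_stream, state=shared)
    #     # both now see the same error-recovery and backtracking bookkeeping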
 | |
|     def __init__(self):
 | |
|         # Track the set of token types that can follow any rule invocation.
 | |
|         # Stack grows upwards.
 | |
|         self.following = []
 | |
| 
 | |
|         # This is true when we see an error and before having successfully
 | |
|         # matched a token.  Prevents generation of more than one error message
 | |
|         # per error.
 | |
|         self.errorRecovery = False
 | |
| 
 | |
|         # The index into the input stream where the last error occurred.
 | |
|         # This is used to prevent infinite loops where an error is found
 | |
|         # but no token is consumed during recovery...another error is found,
 | |
|         # ad nauseam.  This is a failsafe mechanism to guarantee that at least
 | |
|         # one token/tree node is consumed for two errors.
 | |
|         self.lastErrorIndex = -1
 | |
| 
 | |
|         # If 0, no backtracking is going on.  Safe to exec actions etc...
 | |
|         # If >0 then it's the level of backtracking.
 | |
|         self.backtracking = 0
 | |
| 
 | |
|         # An array[size num rules] of (int -> int) dicts that tracks
 | |
|         # the stop token index for each rule.  ruleMemo[ruleIndex] is
 | |
|         # the memoization table for ruleIndex.  For key ruleStartIndex, you
 | |
|         # get back the stop token for associated rule or MEMO_RULE_FAILED.
 | |
|         #
 | |
|         # This is only used if rule memoization is on (which it is by default).
 | |
|         self.ruleMemo = None
 | |
| 
 | |
|         ## Did the recognizer encounter a syntax error?  Track how many.
 | |
|         self.syntaxErrors = 0
 | |
| 
 | |
| 
 | |
|         # LEXER FIELDS (must be in same state object to avoid casting
 | |
|         # constantly in generated code and Lexer object) :(
 | |
| 
 | |
| 
 | |
|         ## The goal of all lexer rules/methods is to create a token object.
 | |
|         # This is an instance variable as multiple rules may collaborate to
 | |
|         # create a single token.  nextToken will return this object after
 | |
|         # matching lexer rule(s).  If you subclass to allow multiple token
 | |
|         # emissions, then set this to the last token to be matched or
 | |
|         # something non-None so that the auto token emit mechanism will not
 | |
|         # emit another token.
 | |
|         self.token = None
 | |
| 
 | |
|         ## What character index in the stream did the current token start at?
 | |
|         # Needed, for example, to get the text for current token.  Set at
 | |
|         # the start of nextToken.
 | |
|         self.tokenStartCharIndex = -1
 | |
| 
 | |
|         ## The line on which the first character of the token resides
 | |
|         self.tokenStartLine = None
 | |
| 
 | |
|         ## The character position of first character within the line
 | |
|         self.tokenStartCharPositionInLine = None
 | |
| 
 | |
|         ## The channel number for the current token
 | |
|         self.channel = None
 | |
| 
 | |
|         ## The token type for the current token
 | |
|         self.type = None
 | |
| 
 | |
|         ## You can set the text for the current token to override what is in
 | |
|         # the input char buffer.  Use setText() or set this instance var directly.
 | |
|         self.text = None
 | |
| 
 | |
| 
 | |
| class BaseRecognizer(object):
 | |
|     """
 | |
|     @brief Common recognizer functionality.
 | |
| 
 | |
|     A generic recognizer that can handle recognizers generated from
 | |
|     lexer, parser, and tree grammars.  This is all the parsing
 | |
|     support code essentially; most of it is error recovery stuff and
 | |
|     backtracking.
 | |
|     """
 | |
| 
 | |
|     MEMO_RULE_FAILED = -2
 | |
|     MEMO_RULE_UNKNOWN = -1
 | |
| 
 | |
|     # copies from Token object for convenience in actions
 | |
|     DEFAULT_TOKEN_CHANNEL = DEFAULT_CHANNEL
 | |
| 
 | |
|     # for convenience in actions
 | |
|     HIDDEN = HIDDEN_CHANNEL
 | |
| 
 | |
|     # overridden by generated subclasses
 | |
|     grammarFileName = None
 | |
|     tokenNames = None
 | |
| 
 | |
|     # The api_version attribute has been introduced in 3.3. If it is not
 | |
|     # overridden in the generated recognizer, we assume a default of v0.
 | |
|     api_version = 0
 | |
| 
 | |
|     def __init__(self, state=None):
 | |
|         # Input stream of the recognizer. Must be initialized by a subclass.
 | |
|         self.input = None
 | |
| 
 | |
|         ## State of a lexer, parser, or tree parser are collected into a state
 | |
|         # object so the state can be shared.  This sharing is needed to
 | |
|         # have one grammar import others and share same error variables
 | |
|         # and other state variables.  It's a kind of explicit multiple
 | |
|         # inheritance via delegation of methods and shared state.
 | |
|         if state is None:
 | |
|             state = RecognizerSharedState()
 | |
|         self._state = state
 | |
| 
 | |
|         if self.api_version not in compatible_api_versions:
 | |
|             raise RuntimeError(
 | |
|                 "ANTLR version mismatch: "
 | |
|                 "The recognizer has been generated with API V{}, "
 | |
|                 "but this runtime does not support this."
 | |
|                 .format(self.api_version))
 | |
| 
 | |
|     # this one only exists to shut up pylint :(
 | |
|     def setInput(self, input):
 | |
|         self.input = input
 | |
| 
 | |
| 
 | |
|     def reset(self):
 | |
|         """
 | |
|         reset the parser's state; subclasses must rewind the input stream
 | |
|         """
 | |
| 
 | |
|         # whack everything related to error recovery
 | |
|         if self._state is None:
 | |
|             # no shared state work to do
 | |
|             return
 | |
| 
 | |
|         self._state.following = []
 | |
|         self._state.errorRecovery = False
 | |
|         self._state.lastErrorIndex = -1
 | |
|         self._state.syntaxErrors = 0
 | |
|         # whack everything related to backtracking and memoization
 | |
|         self._state.backtracking = 0
 | |
|         if self._state.ruleMemo is not None:
 | |
|             self._state.ruleMemo = {}
 | |
| 
 | |
| 
 | |
|     def match(self, input, ttype, follow):
 | |
|         """
 | |
|         Match current input symbol against ttype.  Attempt
 | |
|         single token insertion or deletion error recovery.  If
 | |
|         that fails, throw MismatchedTokenException.
 | |
| 
 | |
|         To turn off single token insertion or deletion error
 | |
|         recovery, override recoverFromMismatchedToken() and have it
 | |
|         throw an exception. See TreeParser.recoverFromMismatchedToken().
 | |
|         This way any error in a rule will cause an exception and
 | |
|         an immediate exit from the rule.  The rule would recover by
 | |
|         resynchronizing to the set of symbols that can follow the rule reference.
 | |
|         """
 | |
| 
 | |
|         matchedSymbol = self.getCurrentInputSymbol(input)
 | |
|         if self.input.LA(1) == ttype:
 | |
|             self.input.consume()
 | |
|             self._state.errorRecovery = False
 | |
|             return matchedSymbol
 | |
| 
 | |
|         if self._state.backtracking > 0:
 | |
|             # FIXME: need to return matchedSymbol here as well. damn!!
 | |
|             raise BacktrackingFailed
 | |
| 
 | |
|         matchedSymbol = self.recoverFromMismatchedToken(input, ttype, follow)
 | |
|         return matchedSymbol
 | |
| 
 | |
| 
 | |
|     def matchAny(self):
 | |
|         """Match the wildcard: in a symbol"""
 | |
| 
 | |
|         self._state.errorRecovery = False
 | |
|         self.input.consume()
 | |
| 
 | |
| 
 | |
|     def mismatchIsUnwantedToken(self, input, ttype):
 | |
|         return input.LA(2) == ttype
 | |
| 
 | |
| 
 | |
|     def mismatchIsMissingToken(self, input, follow):
 | |
|         if follow is None:
 | |
|             # we have no information about the follow; we can only consume
 | |
|             # a single token and hope for the best
 | |
|             return False
 | |
| 
 | |
|         # compute what can follow this grammar element reference
 | |
|         if EOR_TOKEN_TYPE in follow:
 | |
|             viableTokensFollowingThisRule = self.computeContextSensitiveRuleFOLLOW()
 | |
|             follow |= viableTokensFollowingThisRule
 | |
| 
 | |
|             if len(self._state.following) > 0:
 | |
|                 # remove EOR if we're not the start symbol
 | |
|                 follow -= {EOR_TOKEN_TYPE}
 | |
| 
 | |
|         # if current token is consistent with what could come after set
 | |
|         # then we know we're missing a token; error recovery is free to
 | |
|         # "insert" the missing token
 | |
|         if input.LA(1) in follow or EOR_TOKEN_TYPE in follow:
 | |
|             return True
 | |
| 
 | |
|         return False
 | |
| 
 | |
| 
 | |
|     def reportError(self, e):
 | |
|         """Report a recognition problem.
 | |
| 
 | |
|         This method sets errorRecovery to indicate the parser is recovering
 | |
|         not parsing.  Once in recovery mode, no errors are generated.
 | |
|         To get out of recovery mode, the parser must successfully match
 | |
|         a token (after a resync).  So it will go:
 | |
| 
 | |
|         1. error occurs
 | |
|         2. enter recovery mode, report error
 | |
|         3. consume until token found in resync set
 | |
|         4. try to resume parsing
 | |
|         5. next match() will reset errorRecovery mode
 | |
| 
 | |
|         If you override, make sure to update syntaxErrors if you care about
 | |
|         that.
 | |
| 
 | |
|         """
 | |
| 
 | |
|         # if we've already reported an error and have not matched a token
 | |
|         # yet successfully, don't report any errors.
 | |
|         if self._state.errorRecovery:
 | |
|             return
 | |
| 
 | |
|         self._state.syntaxErrors += 1 # don't count spurious
 | |
|         self._state.errorRecovery = True
 | |
| 
 | |
|         self.displayRecognitionError(e)
 | |
| 
 | |
| 
 | |
|     def displayRecognitionError(self, e):
 | |
|         hdr = self.getErrorHeader(e)
 | |
|         msg = self.getErrorMessage(e)
 | |
|         self.emitErrorMessage(hdr + " " + msg)
 | |
| 
 | |
| 
 | |
|     def getErrorMessage(self, e):
 | |
|         """
 | |
|         What error message should be generated for the various
 | |
|         exception types?
 | |
| 
 | |
|         Not very object-oriented code, but I like having all error message
 | |
|         generation within one method rather than spread among all of the
 | |
|         exception classes. This also makes it much easier for the exception
 | |
|         handling because the exception classes do not have to have pointers back
 | |
|         to this object to access utility routines and so on. Also, changing
 | |
|         the message for an exception type would be difficult because you
 | |
|         would have to subclass the exception, but then somehow get ANTLR
 | |
|         to make those kinds of exception objects instead of the default.
 | |
|         This looks weird, but trust me--it makes the most sense in terms
 | |
|         of flexibility.
 | |
| 
 | |
|         For grammar debugging, you will want to override this to add
 | |
|         more information such as the stack frame with
 | |
|         self.getRuleInvocationStack() and,
 | |
|         for no viable alts, the decision description and state etc...
 | |
| 
 | |
|         Override this to change the message generated for one or more
 | |
|         exception types.
 | |
|         """
 | |
| 
 | |
|         if isinstance(e, UnwantedTokenException):
 | |
|             if e.expecting == EOF:
 | |
|                 tokenName = "EOF"
 | |
|             else:
 | |
|                 tokenName = self.tokenNames[e.expecting]
 | |
| 
 | |
|             msg = "extraneous input {} expecting {}".format(
 | |
|                 self.getTokenErrorDisplay(e.getUnexpectedToken()),
 | |
|                 tokenName
 | |
|                 )
 | |
| 
 | |
|         elif isinstance(e, MissingTokenException):
 | |
|             if e.expecting == EOF:
 | |
|                 tokenName = "EOF"
 | |
|             else:
 | |
|                 tokenName = self.tokenNames[e.expecting]
 | |
| 
 | |
|             msg = "missing {} at {}".format(
 | |
|                 tokenName, self.getTokenErrorDisplay(e.token)
 | |
|                 )
 | |
| 
 | |
|         elif isinstance(e, MismatchedTokenException):
 | |
|             if e.expecting == EOF:
 | |
|                 tokenName = "EOF"
 | |
|             else:
 | |
|                 tokenName = self.tokenNames[e.expecting]
 | |
| 
 | |
|             msg = "mismatched input {} expecting {}".format(
 | |
|                 self.getTokenErrorDisplay(e.token),
 | |
|                 tokenName
 | |
|                 )
 | |
| 
 | |
|         elif isinstance(e, MismatchedTreeNodeException):
 | |
|             if e.expecting == EOF:
 | |
|                 tokenName = "EOF"
 | |
|             else:
 | |
|                 tokenName = self.tokenNames[e.expecting]
 | |
| 
 | |
|             msg = "mismatched tree node: {} expecting {}".format(
 | |
|                 e.node, tokenName)
 | |
| 
 | |
|         elif isinstance(e, NoViableAltException):
 | |
|             msg = "no viable alternative at input {}".format(
 | |
|                 self.getTokenErrorDisplay(e.token))
 | |
| 
 | |
|         elif isinstance(e, EarlyExitException):
 | |
|             msg = "required (...)+ loop did not match anything at input {}".format(
 | |
|                 self.getTokenErrorDisplay(e.token))
 | |
| 
 | |
|         elif isinstance(e, MismatchedSetException):
 | |
|             msg = "mismatched input {} expecting set {!r}".format(
 | |
|                 self.getTokenErrorDisplay(e.token),
 | |
|                 e.expecting
 | |
|                 )
 | |
| 
 | |
|         elif isinstance(e, MismatchedNotSetException):
 | |
|             msg = "mismatched input {} expecting set {!r}".format(
 | |
|                 self.getTokenErrorDisplay(e.token),
 | |
|                 e.expecting
 | |
|                 )
 | |
| 
 | |
|         elif isinstance(e, FailedPredicateException):
 | |
|             msg = "rule {} failed predicate: {{{}}}?".format(
 | |
|                 e.ruleName,
 | |
|                 e.predicateText
 | |
|                 )
 | |
| 
 | |
|         else:
 | |
|             msg = str(e)
 | |
| 
 | |
|         return msg
 | |
| 
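    # A hedged sketch of the kind of override suggested above, adding the
    # rule invocation stack to each message (MyParser is a hypothetical
    # generated parser class):
    #
    #     class VerboseParser(MyParser):
    #         def getErrorMessage(self, e):
    #             msg = super().getErrorMessage(e)
    #             return "{} (rule stack: {})".format(
    #                 msg, self.getRuleInvocationStack())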
 | |
| 
 | |
|     def getNumberOfSyntaxErrors(self):
 | |
|         """
 | |
|         Get number of recognition errors (lexer, parser, tree parser).  Each
 | |
|         recognizer tracks its own number, so the parser and lexer each have
 | |
|         a separate count.  This does not count the spurious errors found between
 | |
|         an error and the next valid token match.
 | |
| 
 | |
|         See also reportError().
 | |
|         """
 | |
|         return self._state.syntaxErrors
 | |
| 
 | |
| 
 | |
|     def getErrorHeader(self, e):
 | |
|         """
 | |
|         What is the error header, normally line/character position information?
 | |
|         """
 | |
| 
 | |
|         source_name = self.getSourceName()
 | |
|         if source_name is not None:
 | |
|             return "{} line {}:{}".format(source_name, e.line, e.charPositionInLine)
 | |
|         return "line {}:{}".format(e.line, e.charPositionInLine)
 | |
| 
 | |
| 
 | |
|     def getTokenErrorDisplay(self, t):
 | |
|         """
 | |
|         How should a token be displayed in an error message? The default
 | |
|         is to display just the text, but during development you might
 | |
|         want to have a lot of information spit out.  Override in that case
 | |
|         to use t.toString() (which, for CommonToken, dumps everything about
 | |
|         the token). This is better than forcing you to override a method in
 | |
|         your token objects because you don't have to go modify your lexer
 | |
|         so that it creates a new token class.
 | |
|         """
 | |
| 
 | |
|         s = t.text
 | |
|         if s is None:
 | |
|             if t.type == EOF:
 | |
|                 s = "<EOF>"
 | |
|             else:
 | |
|                 s = "<{}>".format(t.typeName)
 | |
| 
 | |
|         return repr(s)
 | |
| 
 | |
| 
 | |
|     def emitErrorMessage(self, msg):
 | |
|         """Override this method to change where error messages go"""
 | |
|         sys.stderr.write(msg + '\n')
 | |
| 
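    # For example, errors can be collected instead of written to stderr by
    # overriding this hook (a sketch only; MyParser is hypothetical):
    #
    #     class CollectingParser(MyParser):
    #         def __init__(self, *args, **kwargs):
    #             super().__init__(*args, **kwargs)
    #             self.error_messages = []
    #
    #         def emitErrorMessage(self, msg):
    #             self.error_messages.append(msg)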
 | |
| 
 | |
|     def recover(self, input, re):
 | |
|         """
 | |
|         Recover from an error found on the input stream.  This is
 | |
|         for NoViableAlt and mismatched symbol exceptions.  If you enable
 | |
|         single token insertion and deletion, this will usually not
 | |
|         handle mismatched symbol exceptions but there could be a mismatched
 | |
|         token that the match() routine could not recover from.
 | |
|         """
 | |
| 
 | |
|         # PROBLEM? what if input stream is not the same as last time
 | |
|         # perhaps make lastErrorIndex a member of input
 | |
|         if self._state.lastErrorIndex == input.index():
 | |
|             # uh oh, another error at same token index; must be a case
 | |
|             # where LT(1) is in the recovery token set so nothing is
 | |
|             # consumed; consume a single token so at least to prevent
 | |
|             # an infinite loop; this is a failsafe.
 | |
|             input.consume()
 | |
| 
 | |
|         self._state.lastErrorIndex = input.index()
 | |
|         followSet = self.computeErrorRecoverySet()
 | |
| 
 | |
|         self.beginResync()
 | |
|         self.consumeUntil(input, followSet)
 | |
|         self.endResync()
 | |
| 
 | |
| 
 | |
|     def beginResync(self):
 | |
|         """
 | |
|         A hook to listen in on the token consumption during error recovery.
 | |
|         The DebugParser subclasses this to fire events to the listener.
 | |
|         """
 | |
| 
 | |
|         pass
 | |
| 
 | |
| 
 | |
|     def endResync(self):
 | |
|         """
 | |
|         A hook to listen in on the token consumption during error recovery.
 | |
|         The DebugParser subclasses this to fire events to the listener.
 | |
|         """
 | |
| 
 | |
|         pass
 | |
| 
 | |
| 
 | |
|     def computeErrorRecoverySet(self):
 | |
|         """
 | |
|         Compute the error recovery set for the current rule.  During
 | |
|         rule invocation, the parser pushes the set of tokens that can
 | |
|         follow that rule reference on the stack; this amounts to
 | |
|         computing FIRST of what follows the rule reference in the
 | |
|         enclosing rule. This local follow set only includes tokens
 | |
|         from within the rule; i.e., the FIRST computation done by
 | |
|         ANTLR stops at the end of a rule.
 | |
| 
 | |
|         EXAMPLE
 | |
| 
 | |
|         When you find a "no viable alt exception", the input is not
 | |
|         consistent with any of the alternatives for rule r.  The best
 | |
|         thing to do is to consume tokens until you see something that
 | |
|         can legally follow a call to r *or* any rule that called r.
 | |
|         You don't want the exact set of viable next tokens because the
 | |
|         input might just be missing a token--you might consume the
 | |
|         rest of the input looking for one of the missing tokens.
 | |
| 
 | |
|         Consider grammar:
 | |
| 
 | |
|         a : '[' b ']'
 | |
|           | '(' b ')'
 | |
|           ;
 | |
|         b : c '^' INT ;
 | |
|         c : ID
 | |
|           | INT
 | |
|           ;
 | |
| 
 | |
|         At each rule invocation, the set of tokens that could follow
 | |
|         that rule is pushed on a stack.  Here are the various "local"
 | |
|         follow sets:
 | |
| 
 | |
|         FOLLOW(b1_in_a) = FIRST(']') = ']'
 | |
|         FOLLOW(b2_in_a) = FIRST(')') = ')'
 | |
|         FOLLOW(c_in_b) = FIRST('^') = '^'
 | |
| 
 | |
|         Upon erroneous input "[]", the call chain is
 | |
| 
 | |
|         a -> b -> c
 | |
| 
 | |
|         and, hence, the follow context stack is:
 | |
| 
 | |
|         depth  local follow set     after call to rule
 | |
|           0         \<EOF>                    a (from main())
 | |
|           1          ']'                     b
 | |
|           2          '^'                     c
 | |
| 
 | |
|         Notice that ')' is not included, because b would have to have
 | |
|         been called from a different context in rule a for ')' to be
 | |
|         included.
 | |
| 
 | |
|         For error recovery, we cannot consider FOLLOW(c)
 | |
|         (context-sensitive or otherwise).  We need the combined set of
 | |
|         all context-sensitive FOLLOW sets--the set of all tokens that
 | |
|         could follow any reference in the call chain.  We need to
 | |
|         resync to one of those tokens.  Note that FOLLOW(c)='^' and if
 | |
|         we resync'd to that token, we'd consume until EOF.  We need to
 | |
|         sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
 | |
|         In this case, for input "[]", LA(1) is in this set so we would
 | |
|         not consume anything and after printing an error rule c would
 | |
|         return normally.  It would not find the required '^' though.
 | |
|         At this point, it gets a mismatched token error and throws an
 | |
|         exception (since LA(1) is not in the viable following token
 | |
|         set).  The rule exception handler tries to recover, but finds
 | |
|         the same recovery set and doesn't consume anything.  Rule b
 | |
|         exits normally returning to rule a.  Now it finds the ']' (and
 | |
|         with the successful match exits errorRecovery mode).
 | |
| 
 | |
|         So, you can see that the parser walks up the call chain looking
 | |
|         for the token that was a member of the recovery set.
 | |
| 
 | |
|         Errors are not generated in errorRecovery mode.
 | |
| 
 | |
|         ANTLR's error recovery mechanism is based upon original ideas:
 | |
| 
 | |
|         "Algorithms + Data Structures = Programs" by Niklaus Wirth
 | |
| 
 | |
|         and
 | |
| 
 | |
|         "A note on error recovery in recursive descent parsers":
 | |
|         http://portal.acm.org/citation.cfm?id=947902.947905
 | |
| 
 | |
|         Later, Josef Grosch had some good ideas:
 | |
| 
 | |
|         "Efficient and Comfortable Error Recovery in Recursive Descent
 | |
|         Parsers":
 | |
|         ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
 | |
| 
 | |
|         Like Grosch I implemented local FOLLOW sets that are combined
 | |
|         at run-time upon error to avoid overhead during parsing.
 | |
|         """
 | |
| 
 | |
|         return self.combineFollows(False)
 | |
| 
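    # Roughly how the 'following' stack mentioned above gets populated in
    # generated code around a rule reference (the FOLLOW_* name is invented
    # for illustration; real generated parsers define such frozensets):
    #
    #     self._state.following.append(FOLLOW_expr_in_stat)
    #     self.expr()
    #     self._state.following.pop()
    #
    # computeErrorRecoverySet() then unions those per-invocation sets when a
    # resync is needed.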
 | |
| 
 | |
|     def computeContextSensitiveRuleFOLLOW(self):
 | |
|         """
 | |
|         Compute the context-sensitive FOLLOW set for current rule.
 | |
|         This is set of token types that can follow a specific rule
 | |
|         reference given a specific call chain.  You get the set of
 | |
|         viable tokens that can possibly come next (lookahead depth 1)
 | |
|         given the current call chain.  Contrast this with the
 | |
|         definition of plain FOLLOW for rule r:
 | |
| 
 | |
|          FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
 | |
| 
 | |
|         where x in T* and alpha, beta in V*; T is set of terminals and
 | |
|         V is the set of terminals and nonterminals.  In other words,
 | |
|         FOLLOW(r) is the set of all tokens that can possibly follow
 | |
|         references to r in *any* sentential form (context).  At
 | |
|         runtime, however, we know precisely which context applies as
 | |
|         we have the call chain.  We may compute the exact (rather
 | |
|         than covering superset) set of following tokens.
 | |
| 
 | |
|         For example, consider grammar:
 | |
| 
 | |
|         stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
 | |
|              | "return" expr '.'
 | |
|              ;
 | |
|         expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
 | |
|         atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
 | |
|              | '(' expr ')'
 | |
|              ;
 | |
| 
 | |
|         The FOLLOW sets are all inclusive whereas context-sensitive
 | |
|         FOLLOW sets are precisely what could follow a rule reference.
 | |
|         For input "i=(3);", here is the derivation:
 | |
| 
 | |
|         stat => ID '=' expr ';'
 | |
|              => ID '=' atom ('+' atom)* ';'
 | |
|              => ID '=' '(' expr ')' ('+' atom)* ';'
 | |
|              => ID '=' '(' atom ')' ('+' atom)* ';'
 | |
|              => ID '=' '(' INT ')' ('+' atom)* ';'
 | |
|              => ID '=' '(' INT ')' ';'
 | |
| 
 | |
|         At the "3" token, you'd have a call chain of
 | |
| 
 | |
|           stat -> expr -> atom -> expr -> atom
 | |
| 
 | |
|         What can follow that specific nested ref to atom?  Exactly ')'
 | |
|         as you can see by looking at the derivation of this specific
 | |
|         input.  Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
 | |
| 
 | |
|         You want the exact viable token set when recovering from a
 | |
|         token mismatch.  Upon token mismatch, if LA(1) is member of
 | |
|         the viable next token set, then you know there is most likely
 | |
|         a missing token in the input stream.  "Insert" one by just not
 | |
|         throwing an exception.
 | |
|         """
 | |
| 
 | |
|         return self.combineFollows(True)
 | |
| 
 | |
| 
 | |
|     def combineFollows(self, exact):
 | |
|         followSet = set()
 | |
|         for idx, localFollowSet in reversed(list(enumerate(self._state.following))):
 | |
|             followSet |= localFollowSet
 | |
|             if exact:
 | |
|                 # can we see end of rule?
 | |
|                 if EOR_TOKEN_TYPE in localFollowSet:
 | |
|                     # Only leave EOR in set if at top (start rule); this lets
 | |
|                     # us know if we have to include FOLLOW(start rule); i.e., EOF
 | |
|                     if idx > 0:
 | |
|                         followSet.remove(EOR_TOKEN_TYPE)
 | |
| 
 | |
|                 else:
 | |
|                     # can't see end of rule, quit
 | |
|                     break
 | |
| 
 | |
|         return followSet
 | |
| 
 | |
| 
 | |
|     def recoverFromMismatchedToken(self, input, ttype, follow):
 | |
|         """Attempt to recover from a single missing or extra token.
 | |
| 
 | |
|         EXTRA TOKEN
 | |
| 
 | |
|         LA(1) is not what we are looking for.  If LA(2) has the right token,
 | |
|         however, then assume LA(1) is some extra spurious token.  Delete it
 | |
|         and LA(2) as if we were doing a normal match(), which advances the
 | |
|         input.
 | |
| 
 | |
|         MISSING TOKEN
 | |
| 
 | |
|         If current token is consistent with what could come after
 | |
|         ttype then it is ok to 'insert' the missing token, else throw
 | |
|         exception For example, Input 'i=(3;' is clearly missing the
 | |
|         ')'.  When the parser returns from the nested call to expr, it
 | |
|         will have call chain:
 | |
| 
 | |
|           stat -> expr -> atom
 | |
| 
 | |
|         and it will be trying to match the ')' at this point in the
 | |
|         derivation:
 | |
| 
 | |
|              => ID '=' '(' INT ')' ('+' atom)* ';'
 | |
|                                 ^
 | |
|         match() will see that ';' doesn't match ')' and report a
 | |
|         mismatched token error.  To recover, it sees that LA(1)==';'
 | |
|         is in the set of tokens that can follow the ')' token
 | |
|         reference in rule atom.  It can assume that you forgot the ')'.
 | |
|         """
 | |
| 
 | |
|         e = None
 | |
| 
 | |
|         # if next token is what we are looking for then "delete" this token
 | |
|         if self.mismatchIsUnwantedToken(input, ttype):
 | |
|             e = UnwantedTokenException(ttype, input)
 | |
| 
 | |
|             self.beginResync()
 | |
|             input.consume() # simply delete extra token
 | |
|             self.endResync()
 | |
| 
 | |
|             # report after consuming so AW sees the token in the exception
 | |
|             self.reportError(e)
 | |
| 
 | |
|             # we want to return the token we're actually matching
 | |
|             matchedSymbol = self.getCurrentInputSymbol(input)
 | |
| 
 | |
|             # move past ttype token as if all were ok
 | |
|             input.consume()
 | |
|             return matchedSymbol
 | |
| 
 | |
|         # can't recover with single token deletion, try insertion
 | |
|         if self.mismatchIsMissingToken(input, follow):
 | |
|             inserted = self.getMissingSymbol(input, e, ttype, follow)
 | |
|             e = MissingTokenException(ttype, input, inserted)
 | |
| 
 | |
|             # report after inserting so AW sees the token in the exception
 | |
|             self.reportError(e)
 | |
|             return inserted
 | |
| 
 | |
|         # even that didn't work; must throw the exception
 | |
|         e = MismatchedTokenException(ttype, input)
 | |
|         raise e
 | |
| 
 | |
| 
 | |
|     def recoverFromMismatchedSet(self, input, e, follow):
 | |
|         """Not currently used"""
 | |
| 
 | |
|         if self.mismatchIsMissingToken(input, follow):
 | |
|             self.reportError(e)
 | |
|             # we don't know how to conjure up a token for sets yet
 | |
|             return self.getMissingSymbol(input, e, INVALID_TOKEN_TYPE, follow)
 | |
| 
 | |
|         # TODO do single token deletion like above for Token mismatch
 | |
|         raise e
 | |
| 
 | |
| 
 | |
|     def getCurrentInputSymbol(self, input):
 | |
|         """
 | |
|         Match needs to return the current input symbol, which gets put
 | |
|         into the label for the associated token ref; e.g., x=ID.  Token
 | |
|         and tree parsers need to return different objects. Rather than test
 | |
|         for input stream type or change the IntStream interface, I use
 | |
|         a simple method to ask the recognizer to tell me what the current
 | |
|         input symbol is.
 | |
| 
 | |
|         This is ignored for lexers.
 | |
|         """
 | |
| 
 | |
|         return None
 | |
| 
 | |
| 
 | |
|     def getMissingSymbol(self, input, e, expectedTokenType, follow):
 | |
|         """Conjure up a missing token during error recovery.
 | |
| 
 | |
|         The recognizer attempts to recover from single missing
 | |
|         symbols. But, actions might refer to that missing symbol.
 | |
|         For example, x=ID {f($x);}. The action clearly assumes
 | |
|         that there has been an identifier matched previously and that
 | |
|         $x points at that token. If that token is missing, but
 | |
|         the next token in the stream is what we want, we assume that
 | |
|         this token is missing and we keep going. Because we
 | |
|         have to return some token to replace the missing token,
 | |
|         we have to conjure one up. This method gives the user control
 | |
|         over the tokens returned for missing tokens. Mostly,
 | |
|         you will want to create something special for identifier
 | |
|         tokens. For literals such as '{' and ',', the default
 | |
|         action in the parser or tree parser works. It simply creates
 | |
|         a CommonToken of the appropriate type. The text will be the token.
 | |
|         If you change what tokens must be created by the lexer,
 | |
|         override this method to create the appropriate tokens.
 | |
|         """
 | |
| 
 | |
|         return None
 | |
| 
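    # A sketch of the kind of override described above, conjuring a nicer
    # placeholder for missing identifiers (GeneratedParser and the ID token
    # type constant are assumptions):
    #
    #     class MyParser(GeneratedParser):
    #         def getMissingSymbol(self, input, e, expectedTokenType, follow):
    #             if expectedTokenType == ID:
    #                 return CommonToken(type=ID, text="<missing identifier>")
    #             return super().getMissingSymbol(
    #                 input, e, expectedTokenType, follow)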
 | |
| 
 | |
|     def consumeUntil(self, input, tokenTypes):
 | |
|         """
 | |
|         Consume tokens until one matches the given token or token set
 | |
| 
 | |
|         tokenTypes can be a single token type or a set of token types
 | |
| 
 | |
|         """
 | |
| 
 | |
|         if not isinstance(tokenTypes, (set, frozenset)):
 | |
|             tokenTypes = frozenset([tokenTypes])
 | |
| 
 | |
|         ttype = input.LA(1)
 | |
|         while ttype != EOF and ttype not in tokenTypes:
 | |
|             input.consume()
 | |
|             ttype = input.LA(1)
 | |
| 
 | |
| 
 | |
|     def getRuleInvocationStack(self):
 | |
|         """
 | |
|         Return List<String> of the rules in your parser instance
 | |
|         leading up to a call to this method.  You could override if
 | |
|         you want more details such as the file/line info of where
 | |
|         in the parser java code a rule is invoked.
 | |
| 
 | |
|         This is very useful for error messages and for context-sensitive
 | |
|         error recovery.
 | |
| 
 | |
|         You must be careful if you subclass a generated recognizer.
 | |
|         The default implementation will only search the module of self
 | |
|         for rules, but the subclass will not contain any rules.
 | |
|         You probably want to override this method to look like
 | |
| 
 | |
|         def getRuleInvocationStack(self):
 | |
|             return self._getRuleInvocationStack(<class>.__module__)
 | |
| 
 | |
|         where <class> is the class of the generated recognizer, e.g.
 | |
|         the superclass of self.
 | |
|         """
 | |
| 
 | |
|         return self._getRuleInvocationStack(self.__module__)
 | |
| 
 | |
| 
 | |
|     @classmethod
 | |
|     def _getRuleInvocationStack(cls, module):
 | |
|         """
 | |
|         A more general version of getRuleInvocationStack where you can
 | |
|         pass in, for example, a RecognitionException to get its rule
 | |
|         stack trace.  This routine is shared by all recognizers, hence it
 | |
|         is implemented as a classmethod.
 | |
| 
 | |
|         TODO: move to a utility class or something; weird having lexer call
 | |
|         this
 | |
|         """
 | |
| 
 | |
|         # mmmhhh,... perhaps look at the first argument
 | |
|         # (f_locals[co_varnames[0]]?) and test if it's a (sub)class of
 | |
|         # requested recognizer...
 | |
| 
 | |
|         rules = []
 | |
|         for frame in reversed(inspect.stack()):
 | |
|             code = frame[0].f_code
 | |
|             codeMod = inspect.getmodule(code)
 | |
|             if codeMod is None:
 | |
|                 continue
 | |
| 
 | |
|             # skip frames not in requested module
 | |
|             if codeMod.__name__ != module:
 | |
|                 continue
 | |
| 
 | |
|             # skip some unwanted names
 | |
|             if code.co_name in ('nextToken', '<module>'):
 | |
|                 continue
 | |
| 
 | |
|             rules.append(code.co_name)
 | |
| 
 | |
|         return rules
 | |
| 
 | |
| 
 | |
|     def getBacktrackingLevel(self):
 | |
|         return self._state.backtracking
 | |
| 
 | |
|     def setBacktrackingLevel(self, n):
 | |
|         self._state.backtracking = n
 | |
| 
 | |
| 
 | |
|     def getGrammarFileName(self):
 | |
|         """For debugging and other purposes, might want the grammar name.
 | |
| 
 | |
|         Have ANTLR generate an implementation for this method.
 | |
|         """
 | |
| 
 | |
|         return self.grammarFileName
 | |
| 
 | |
| 
 | |
|     def getSourceName(self):
 | |
|         raise NotImplementedError
 | |
| 
 | |
| 
 | |
|     def toStrings(self, tokens):
 | |
|         """A convenience method for use most often with template rewrites.
 | |
| 
 | |
|         Convert a Token list to a str list.
 | |
|         """
 | |
| 
 | |
|         if tokens is None:
 | |
|             return None
 | |
| 
 | |
|         return [token.text for token in tokens]
 | |
| 
 | |
| 
 | |
|     def getRuleMemoization(self, ruleIndex, ruleStartIndex):
 | |
|         """
 | |
|         Given a rule number and a start token index number, return
 | |
|         MEMO_RULE_UNKNOWN if the rule has not parsed input starting from
 | |
|         start index.  If this rule has parsed input starting from the
 | |
|         start index before, then return where the rule stopped parsing.
 | |
|         It returns the index of the last token matched by the rule.
 | |
|         """
 | |
| 
 | |
|         if ruleIndex not in self._state.ruleMemo:
 | |
|             self._state.ruleMemo[ruleIndex] = {}
 | |
| 
 | |
|         return self._state.ruleMemo[ruleIndex].get(
 | |
|             ruleStartIndex, self.MEMO_RULE_UNKNOWN
 | |
|             )
 | |
| 
 | |
| 
 | |
|     def alreadyParsedRule(self, input, ruleIndex):
 | |
|         """
 | |
|         Has this rule already parsed input at the current index in the
 | |
|         input stream?  Return the stop token index or MEMO_RULE_UNKNOWN.
 | |
|         If we attempted but failed to parse properly before, return
 | |
|         MEMO_RULE_FAILED.
 | |
| 
 | |
|         This method has a side-effect: if we have seen this input for
 | |
|         this rule and successfully parsed before, then seek ahead to
 | |
|         1 past the stop token matched for this rule last time.
 | |
|         """
 | |
| 
 | |
|         stopIndex = self.getRuleMemoization(ruleIndex, input.index())
 | |
|         if stopIndex == self.MEMO_RULE_UNKNOWN:
 | |
|             return False
 | |
| 
 | |
|         if stopIndex == self.MEMO_RULE_FAILED:
 | |
|             raise BacktrackingFailed
 | |
| 
 | |
|         else:
 | |
|             input.seek(stopIndex + 1)
 | |
| 
 | |
|         return True
 | |
| 
 | |
| 
 | |
|     def memoize(self, input, ruleIndex, ruleStartIndex, success):
 | |
|         """
 | |
|         Record whether or not this rule parsed the input at this position
 | |
|         successfully.
 | |
|         """
 | |
| 
 | |
|         if success:
 | |
|             stopTokenIndex = input.index() - 1
 | |
|         else:
 | |
|             stopTokenIndex = self.MEMO_RULE_FAILED
 | |
| 
 | |
|         if ruleIndex in self._state.ruleMemo:
 | |
|             self._state.ruleMemo[ruleIndex][ruleStartIndex] = stopTokenIndex
 | |
| 
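    # Roughly how generated rule methods use the memoization hooks while
    # backtracking (the rule index 42 is an invented example):
    #
    #     index_before = self.input.index()
    #     if self._state.backtracking > 0 and self.alreadyParsedRule(self.input, 42):
    #         return                              # reuse the earlier result
    #     try:
    #         pass                                # ... match the rule body here
    #     except BacktrackingFailed:
    #         self.memoize(self.input, 42, index_before, False)
    #         raise
    #     else:
    #         if self._state.backtracking > 0:
    #             self.memoize(self.input, 42, index_before, True)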
 | |
| 
 | |
|     def traceIn(self, ruleName, ruleIndex, inputSymbol):
 | |
|         sys.stdout.write("enter {} {}".format(ruleName, inputSymbol))
 | |
| 
 | |
|         if self._state.backtracking > 0:
 | |
|             sys.stdout.write(" backtracking={}".format(self._state.backtracking))
 | |
| 
 | |
|         sys.stdout.write('\n')
 | |
| 
 | |
| 
 | |
|     def traceOut(self, ruleName, ruleIndex, inputSymbol):
 | |
|         sys.stdout.write("exit {} {}".format(ruleName, inputSymbol))
 | |
| 
 | |
|         if self._state.backtracking > 0:
 | |
|             sys.stdout.write(" backtracking={}".format(self._state.backtracking))
 | |
| 
 | |
|         # mmmm... we use BacktrackingFailed exceptions now. So how could we
 | |
|         # get that information here?
 | |
|         #if self._state.failed:
 | |
|         #    sys.stdout.write(" failed")
 | |
|         #else:
 | |
|         #    sys.stdout.write(" succeeded")
 | |
| 
 | |
|         sys.stdout.write('\n')
 | |
| 
 | |
| 
 | |
| class TokenSource(object):
 | |
|     """
 | |
|     @brief Abstract baseclass for token producers.
 | |
| 
 | |
|     A source of tokens must provide a sequence of tokens via nextToken()
 | |
|     and also must reveal its source of characters; CommonToken's text is
 | |
|     computed from a CharStream; it only stores indices into the char stream.
 | |
| 
 | |
|     Errors from the lexer are never passed to the parser.  Either you want
 | |
|     to keep going or you do not upon token recognition error.  If you do not
 | |
|     want to continue lexing then you do not want to continue parsing.  Just
 | |
|     raise an exception that is not a RecognitionException and it will naturally
 | |
|     toss you all the way out of the recognizers.  If you want to continue
 | |
|     lexing then you should not throw an exception to the parser--it has already
 | |
|     requested a token.  Keep lexing until you get a valid one.  Just report
 | |
|     errors and keep going, looking for a valid token.
 | |
|     """
 | |
| 
 | |
|     def nextToken(self):
 | |
|         """Return a Token object from your input stream (usually a CharStream).
 | |
| 
 | |
|         Do not fail/return upon lexing error; keep chewing on the characters
 | |
|         until you get a good one; errors are not passed through to the parser.
 | |
|         """
 | |
| 
 | |
|         raise NotImplementedError
 | |
| 
 | |
| 
 | |
|     def __iter__(self):
 | |
|         """The TokenSource is an interator.
 | |
| 
 | |
|         The iteration will not include the final EOF token, see also the note
 | |
|         for the __next__() method.
 | |
| 
 | |
|         """
 | |
| 
 | |
|         return self
 | |
| 
 | |
| 
 | |
|     def __next__(self):
 | |
|         """Return next token or raise StopIteration.
 | |
| 
 | |
|         Note that this will raise StopIteration when hitting the EOF token,
 | |
|         so EOF will not be part of the iteration.
 | |
| 
 | |
|         """
 | |
| 
 | |
|         token = self.nextToken()
 | |
|         if token is None or token.type == EOF:
 | |
|             raise StopIteration
 | |
|         return token
 | |
| 
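    # Because of __iter__/__next__ above, any TokenSource (for example a
    # generated lexer) can be drained with ordinary iteration; the EOF token
    # is not yielded.  A sketch, assuming MyLexer and a char stream exist:
    #
    #     tokens = list(MyLexer(char_stream))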
 | |
| 
 | |
| class Lexer(BaseRecognizer, TokenSource):
 | |
|     """
 | |
|     @brief Baseclass for generated lexer classes.
 | |
| 
 | |
|     A lexer is a recognizer that draws input symbols from a character stream.
 | |
|     Lexer grammars result in a subclass of this object. A Lexer object
 | |
|     uses simplified match() and error recovery mechanisms in the interest
 | |
|     of speed.
 | |
|     """
 | |
| 
 | |
|     def __init__(self, input, state=None):
 | |
|         BaseRecognizer.__init__(self, state)
 | |
|         TokenSource.__init__(self)
 | |
| 
 | |
|         # Where is the lexer drawing characters from?
 | |
|         self.input = input
 | |
| 
 | |
| 
 | |
|     def reset(self):
 | |
|         super().reset() # reset all recognizer state variables
 | |
| 
 | |
|         if self.input is not None:
 | |
|             # rewind the input
 | |
|             self.input.seek(0)
 | |
| 
 | |
|         if self._state is None:
 | |
|             # no shared state work to do
 | |
|             return
 | |
| 
 | |
|         # whack Lexer state variables
 | |
|         self._state.token = None
 | |
|         self._state.type = INVALID_TOKEN_TYPE
 | |
|         self._state.channel = DEFAULT_CHANNEL
 | |
|         self._state.tokenStartCharIndex = -1
 | |
|         self._state.tokenStartLine = -1
 | |
|         self._state.tokenStartCharPositionInLine = -1
 | |
|         self._state.text = None
 | |
| 
 | |
| 
 | |
|     def makeEOFToken(self):
 | |
|         eof = CommonToken(
 | |
|             type=EOF, channel=DEFAULT_CHANNEL,
 | |
|             input=self.input,
 | |
|             start=self.input.index(), stop=self.input.index())
 | |
|         eof.line = self.input.line
 | |
|         eof.charPositionInLine = self.input.charPositionInLine
 | |
|         return eof
 | |
| 
 | |
|     def nextToken(self):
 | |
|         """
 | |
|         Return a token from this source; i.e., match a token on the char
 | |
|         stream.
 | |
|         """
 | |
| 
 | |
|         while 1:
 | |
|             self._state.token = None
 | |
|             self._state.channel = DEFAULT_CHANNEL
 | |
|             self._state.tokenStartCharIndex = self.input.index()
 | |
|             self._state.tokenStartCharPositionInLine = self.input.charPositionInLine
 | |
|             self._state.tokenStartLine = self.input.line
 | |
|             self._state.text = None
 | |
|             if self.input.LA(1) == EOF:
 | |
|                 return self.makeEOFToken()
 | |
| 
 | |
|             try:
 | |
|                 self.mTokens()
 | |
| 
 | |
|                 if self._state.token is None:
 | |
|                     self.emit()
 | |
| 
 | |
|                 elif self._state.token == SKIP_TOKEN:
 | |
|                     continue
 | |
| 
 | |
|                 return self._state.token
 | |
| 
 | |
|             except NoViableAltException as re:
 | |
|                 self.reportError(re)
 | |
|                 self.recover(re) # throw out current char and try again
 | |
| 
 | |
|             except RecognitionException as re:
 | |
|                 self.reportError(re)
 | |
|                 # match() routine has already called recover()
 | |
| 
 | |
| 
 | |
|     def skip(self):
 | |
|         """
 | |
|         Instruct the lexer to skip creating a token for current lexer rule
 | |
|         and look for another token.  nextToken() knows to keep looking when
 | |
|         a lexer rule finishes with token set to SKIP_TOKEN.  Recall that
 | |
|         if token is None at the end of any token rule, it creates one for you
 | |
|         and emits it.
 | |
|         """
 | |
| 
 | |
|         self._state.token = SKIP_TOKEN
 | |
| 
 | |
| 
 | |
|     def mTokens(self):
 | |
|         """This is the lexer entry point that sets instance var 'token'"""
 | |
| 
 | |
|         # abstract method
 | |
|         raise NotImplementedError
 | |
| 
 | |
| 
 | |
|     def setCharStream(self, input):
 | |
|         """Set the char stream and reset the lexer"""
 | |
|         self.input = None
 | |
|         self.reset()
 | |
|         self.input = input
 | |
| 
 | |
| 
 | |
|     def getSourceName(self):
 | |
|         return self.input.getSourceName()
 | |
| 
 | |
| 
 | |
|     def emit(self, token=None):
 | |
|         """
 | |
|         The standard method called to automatically emit a token at the
 | |
|         outermost lexical rule.  The token object should point into the
 | |
|         char buffer start..stop.  If there is a text override in 'text',
 | |
|         use that to set the token's text.  Override this method to emit
 | |
|         custom Token objects.
 | |
| 
 | |
|         If you are building trees, then you should also override
 | |
|         Parser or TreeParser.getMissingSymbol().
 | |
|         """
 | |
| 
 | |
|         if token is None:
 | |
|             token = CommonToken(
 | |
|                 input=self.input,
 | |
|                 type=self._state.type,
 | |
|                 channel=self._state.channel,
 | |
|                 start=self._state.tokenStartCharIndex,
 | |
|                 stop=self.getCharIndex()-1
 | |
|                 )
 | |
|             token.line = self._state.tokenStartLine
 | |
|             token.text = self._state.text
 | |
|             token.charPositionInLine = self._state.tokenStartCharPositionInLine
 | |
| 
 | |
|         self._state.token = token
 | |
| 
 | |
|         return token
 | |
| 
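    # A sketch of overriding emit() to create a custom token class (MyToken
    # is hypothetical and would have to provide the CommonToken interface):
    #
    #     class MyLexer(GeneratedLexer):
    #         def emit(self, token=None):
    #             if token is None:
    #                 token = MyToken(
    #                     input=self.input,
    #                     type=self._state.type,
    #                     channel=self._state.channel,
    #                     start=self._state.tokenStartCharIndex,
    #                     stop=self.getCharIndex() - 1)
    #                 token.line = self._state.tokenStartLine
    #             self._state.token = token
    #             return token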
 | |
| 
 | |
|     def match(self, s):
 | |
|         if isinstance(s, str):
 | |
|             for c in s:
 | |
|                 if self.input.LA(1) != ord(c):
 | |
|                     if self._state.backtracking > 0:
 | |
|                         raise BacktrackingFailed
 | |
| 
 | |
|                     mte = MismatchedTokenException(c, self.input)
 | |
|                     self.recover(mte)
 | |
|                     raise mte
 | |
| 
 | |
|                 self.input.consume()
 | |
| 
 | |
|         else:
 | |
|             if self.input.LA(1) != s:
 | |
|                 if self._state.backtracking > 0:
 | |
|                     raise BacktrackingFailed
 | |
| 
 | |
|                 mte = MismatchedTokenException(chr(s), self.input)
 | |
|                 self.recover(mte) # don't really recover; just consume in lexer
 | |
|                 raise mte
 | |
| 
 | |
|             self.input.consume()
 | |
| 
 | |
| 
 | |
|     def matchAny(self):
 | |
|         self.input.consume()
 | |
| 
 | |
| 
 | |
|     def matchRange(self, a, b):
 | |
|         if self.input.LA(1) < a or self.input.LA(1) > b:
 | |
|             if self._state.backtracking > 0:
 | |
|                 raise BacktrackingFailed
 | |
| 
 | |
|             mre = MismatchedRangeException(chr(a), chr(b), self.input)
 | |
|             self.recover(mre)
 | |
|             raise mre
 | |
| 
 | |
|         self.input.consume()
 | |
| 
 | |
| 
 | |
|     def getLine(self):
 | |
|         return self.input.line
 | |
| 
 | |
| 
 | |
|     def getCharPositionInLine(self):
 | |
|         return self.input.charPositionInLine
 | |
| 
 | |
| 
 | |
|     def getCharIndex(self):
 | |
|         """What is the index of the current character of lookahead?"""
 | |
| 
 | |
|         return self.input.index()
 | |
| 
 | |
| 
 | |
|     def getText(self):
 | |
|         """
 | |
|         Return the text matched so far for the current token or any
 | |
|         text override.
 | |
|         """
 | |
|         if self._state.text is not None:
 | |
|             return self._state.text
 | |
| 
 | |
|         return self.input.substring(
 | |
|             self._state.tokenStartCharIndex,
 | |
|             self.getCharIndex()-1
 | |
|             )
 | |
| 
 | |
| 
 | |
|     def setText(self, text):
 | |
|         """
 | |
|         Set the complete text of this token; it wipes any previous
 | |
|         changes to the text.
 | |
|         """
 | |
|         self._state.text = text
 | |
| 
 | |
| 
 | |
|     text = property(getText, setText)
 | |
| 
 | |
| 
 | |
|     def reportError(self, e):
 | |
|         ## TODO: error recovery in the lexer has not been worked out yet.
 | |
| 
 | |
|         ## # if we've already reported an error and have not matched a token
 | |
|         ## # yet successfully, don't report any errors.
 | |
|         ## if self.errorRecovery:
 | |
|         ##     return
 | |
|         ##
 | |
|         ## self.errorRecovery = True
 | |
| 
 | |
|         self.displayRecognitionError(e)
 | |
| 
 | |
| 
 | |
|     def getErrorMessage(self, e):
 | |
|         msg = None
 | |
| 
 | |
|         if isinstance(e, MismatchedTokenException):
 | |
|             msg = "mismatched character {} expecting {}".format(
 | |
|                 self.getCharErrorDisplay(e.c),
 | |
|                 self.getCharErrorDisplay(e.expecting))
 | |
| 
 | |
|         elif isinstance(e, NoViableAltException):
 | |
|             msg = "no viable alternative at character {}".format(
 | |
|                 self.getCharErrorDisplay(e.c))
 | |
| 
 | |
|         elif isinstance(e, EarlyExitException):
 | |
|             msg = "required (...)+ loop did not match anything at character {}".format(
 | |
|                 self.getCharErrorDisplay(e.c))
 | |
| 
 | |
|         elif isinstance(e, MismatchedNotSetException):
 | |
|             msg = "mismatched character {} expecting set {!r}".format(
 | |
|                 self.getCharErrorDisplay(e.c),
 | |
|                 e.expecting)
 | |
| 
 | |
|         elif isinstance(e, MismatchedSetException):
 | |
|             msg = "mismatched character {} expecting set {!r}".format(
 | |
|                 self.getCharErrorDisplay(e.c),
 | |
|                 e.expecting)
 | |
| 
 | |
|         elif isinstance(e, MismatchedRangeException):
 | |
|             msg = "mismatched character {} expecting set {}..{}".format(
 | |
|                 self.getCharErrorDisplay(e.c),
 | |
|                 self.getCharErrorDisplay(e.a),
 | |
|                 self.getCharErrorDisplay(e.b))
 | |
| 
 | |
|         else:
 | |
|             msg = super().getErrorMessage(e)
 | |
| 
 | |
|         return msg
 | |
| 
 | |
| 
 | |
|     def getCharErrorDisplay(self, c):
 | |
|         if c == EOF:
 | |
|             c = '<EOF>'
 | |
|         return repr(c)
 | |
| 
 | |
| 
 | |
|     def recover(self, re):
 | |
|         """
 | |
|         Lexers can normally match any char in its vocabulary after matching
 | |
|         a token, so do the easy thing and just kill a character and hope
 | |
|         it all works out.  You can instead use the rule invocation stack
 | |
|         to do sophisticated error recovery if you are in a fragment rule.
 | |
|         """
 | |
| 
 | |
|         self.input.consume()
 | |
| 
 | |
| 
 | |
|     def traceIn(self, ruleName, ruleIndex):
 | |
|         inputSymbol = "{} line={}:{}".format(self.input.LT(1),
 | |
|                                              self.getLine(),
 | |
|                                              self.getCharPositionInLine()
 | |
|                                              )
 | |
| 
 | |
|         super().traceIn(ruleName, ruleIndex, inputSymbol)
 | |
| 
 | |
| 
 | |
|     def traceOut(self, ruleName, ruleIndex):
 | |
|         inputSymbol = "{} line={}:{}".format(self.input.LT(1),
 | |
|                                              self.getLine(),
 | |
|                                              self.getCharPositionInLine()
 | |
|                                              )
 | |
| 
 | |
|         super().traceOut(ruleName, ruleIndex, inputSymbol)
 | |
| 
 | |
| 
 | |
| 
 | |
| class Parser(BaseRecognizer):
 | |
|     """
 | |
|     @brief Baseclass for generated parser classes.
 | |
|     """
 | |
| 
 | |
|     def __init__(self, lexer, state=None):
 | |
|         super().__init__(state)
 | |
| 
 | |
|         self.input = lexer
 | |
| 
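    # Typical wiring of a generated lexer/parser pair (a sketch; MyLexer,
    # MyParser and the entry_rule name are assumptions, and the stream
    # classes are the ones provided by this runtime's streams module,
    # assuming the package is importable as `antlr3`):
    #
    #     from antlr3 import ANTLRStringStream, CommonTokenStream
    #
    #     chars = ANTLRStringStream("x = 1;")
    #     tokens = CommonTokenStream(MyLexer(chars))
    #     parser = MyParser(tokens)
    #     result = parser.entry_rule()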
 | |
| 
 | |
|     def reset(self):
 | |
|         super().reset() # reset all recognizer state variables
 | |
|         if self.input is not None:
 | |
|             self.input.seek(0) # rewind the input
 | |
| 
 | |
| 
 | |
|     def getCurrentInputSymbol(self, input):
 | |
|         return input.LT(1)
 | |
| 
 | |
| 
 | |
|     def getMissingSymbol(self, input, e, expectedTokenType, follow):
 | |
|         if expectedTokenType == EOF:
 | |
|             tokenText = "<missing EOF>"
 | |
|         else:
 | |
|             tokenText = "<missing {}>".format(self.tokenNames[expectedTokenType])
 | |
|         t = CommonToken(type=expectedTokenType, text=tokenText)
 | |
|         current = input.LT(1)
 | |
|         if current.type == EOF:
 | |
|             current = input.LT(-1)
 | |
| 
 | |
|         if current is not None:
 | |
|             t.line = current.line
 | |
|             t.charPositionInLine = current.charPositionInLine
 | |
|         t.channel = DEFAULT_CHANNEL
 | |
|         return t
 | |
| 
 | |
| 
 | |
|     def setTokenStream(self, input):
 | |
|         """Set the token stream and reset the parser"""
 | |
| 
 | |
|         self.input = None
 | |
|         self.reset()
 | |
|         self.input = input
 | |
| 
 | |
| 
 | |
|     def getTokenStream(self):
 | |
|         return self.input
 | |
| 
 | |
| 
 | |
|     def getSourceName(self):
 | |
|         return self.input.getSourceName()
 | |
| 
 | |
| 
 | |
|     def traceIn(self, ruleName, ruleIndex):
 | |
|         super().traceIn(ruleName, ruleIndex, self.input.LT(1))
 | |
| 
 | |
| 
 | |
|     def traceOut(self, ruleName, ruleIndex):
 | |
|         super().traceOut(ruleName, ruleIndex, self.input.LT(1))
 | |
| 
 | |
| 
 | |
| class RuleReturnScope(object):
 | |
|     """
 | |
|     Rules can return start/stop info as well as possible trees and templates.
 | |
|     """
 | |
| 
 | |
|     def getStart(self):
 | |
|         """Return the start token or tree."""
 | |
|         return None
 | |
| 
 | |
| 
 | |
|     def getStop(self):
 | |
|         """Return the stop token or tree."""
 | |
|         return None
 | |
| 
 | |
| 
 | |
|     def getTree(self):
 | |
|         """Has a value potentially if output=AST."""
 | |
|         return None
 | |
| 
 | |
| 
 | |
|     def getTemplate(self):
 | |
|         """Has a value potentially if output=template."""
 | |
|         return None
 | |
| 
 | |
| 
 | |
| class ParserRuleReturnScope(RuleReturnScope):
 | |
|     """
 | |
|     Rules that return more than a single value must return an object
 | |
|     containing all the values.  Besides the properties defined in
 | |
|     RuleLabelScope.predefinedRulePropertiesScope there may be user-defined
 | |
|     return values.  This class simply defines the minimum properties that
 | |
|     are always defined and methods to access the others that might be
 | |
|     available depending on output option such as template and tree.
 | |
| 
 | |
|     Note that text is not an actual property of the return value; it is computed
 | |
|     from start and stop using the input stream's toString() method.  I
 | |
|     could add a ctor to this so that we can pass in and store the input
 | |
|     stream, but I'm not sure we want to do that.  It would seem to be undefined
 | |
|     to get the .text property anyway if the rule matches tokens from multiple
 | |
|     input streams.
 | |
| 
 | |
|     I do not use getters for fields of objects that are used simply to
 | |
|     group values such as this aggregate.  The getters/setters are there to
 | |
|     satisfy the superclass interface.
 | |
|     """
 | |
| 
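    # How callers typically use a rule's return scope (parser.stat() is a
    # hypothetical generated rule method):
    #
    #     retval = parser.stat()
    #     first = retval.start        # first token matched by the rule
    #     last = retval.stop          # last token matched by the rule
    #     ast = retval.tree           # populated only when output=AST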
 | |
|     def __init__(self):
 | |
|         super().__init__()
 | |
|         self.start = None
 | |
|         self.stop = None
 | |
|         self.tree = None  # only used when output=AST
 | |
| 
 | |
| 
 | |
|     def getStart(self):
 | |
|         return self.start
 | |
| 
 | |
| 
 | |
|     def getStop(self):
 | |
|         return self.stop
 | |
| 
 | |
| 
 | |
|     def getTree(self):
 | |
|         return self.tree
 |