"""ANTLR3 runtime package"""

# begin[licence]
#
# [The "BSD licence"]
# Copyright (c) 2005-2008 Terence Parr
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# end[licence]

import codecs
from StringIO import StringIO

from antlr3.constants import DEFAULT_CHANNEL, EOF
from antlr3.tokens import Token, CommonToken


############################################################################
#
# basic interfaces
#   IntStream
#    +- CharStream
#    \- TokenStream
#
# subclasses must implement all methods
#
############################################################################

class IntStream(object):
    """
    @brief Base interface for streams of integer values.

    A simple stream of integers used when all I care about is the char
    or token type sequence (such as interpretation).
    """

    def consume(self):
        raise NotImplementedError


    def LA(self, i):
        """Get int at current input pointer + i ahead where i=1 is next int.

        Negative indexes are allowed.  LA(-1) is previous token (token
        just matched).  LA(-i) where i is before first token should
        yield -1, invalid char / EOF.
        """

        raise NotImplementedError


    def mark(self):
        """
        Tell the stream to start buffering if it hasn't already.  Return
        current input position, index(), or some other marker so that
        when passed to rewind() you get back to the same spot.
        rewind(mark()) should not affect the input cursor.  The Lexer
        tracks line/col info as well as input index so its markers are
        not pure input indexes.  Same for tree node streams.
        """

        raise NotImplementedError


    def index(self):
        """
        Return the current input symbol index 0..n where n indicates the
        last symbol has been read.  The index is the symbol about to be
        read not the most recently read symbol.
        """

        raise NotImplementedError


    def rewind(self, marker=None):
        """
        Reset the stream so that next call to index would return marker.
        The marker will usually be index() but it doesn't have to be.  It's
        just a marker to indicate what state the stream was in.  This is
        essentially calling release() and seek().  If there are markers
        created after this marker argument, this routine must unroll them
        like a stack.  Assume the state the stream was in when this marker
        was created.

        If marker is None:
        Rewind to the input position of the last marker.
        Used currently only after a cyclic DFA and just
        before starting a sem/syn predicate to get the
        input position back to the start of the decision.
        Do not "pop" the marker off the state.  mark(i)
        and rewind(i) should balance still.  It is
        like invoking rewind(last marker) but it should not "pop"
        the marker off.  It's like seek(last marker's input position).
        """

        raise NotImplementedError


    def release(self, marker=None):
        """
        You may want to commit to a backtrack but don't want to force the
        stream to keep bookkeeping objects around for a marker that is
        no longer necessary.  This will have the same behavior as
        rewind() except it releases resources without the backward seek.
        This must throw away resources for all markers back to the marker
        argument.  So if you're nested 5 levels of mark(), and then release(2)
        you have to release resources for depths 2..5.
        """

        raise NotImplementedError


    def seek(self, index):
        """
        Set the input cursor to the position indicated by index.  This is
        normally used to seek ahead in the input stream.  No buffering is
        required to do this unless you know your stream will use seek to
        move backwards such as when backtracking.

        This is different from rewind in its multi-directional
        requirement and in that its argument is strictly an input cursor
        (index).

        For char streams, seeking forward must update the stream state such
        as line number.  For seeking backwards, you will be presumably
        backtracking using the mark/rewind mechanism that restores state and
        so this method does not need to update state when seeking backwards.

        Currently, this method is only used for efficient backtracking using
        memoization, but in the future it may be used for incremental parsing.

        The index is 0..n-1.  A seek to position i means that LA(1) will
        return the ith symbol.  So, seeking to 0 means LA(1) will return the
        first element in the stream.
        """

        raise NotImplementedError


    def size(self):
        """
        Only makes sense for streams that buffer everything up probably, but
        might be useful to display the entire stream or for testing.  This
        value includes a single EOF.
        """

        raise NotImplementedError


    def getSourceName(self):
        """
        Where are you getting symbols from?  Normally, implementations will
        pass the buck all the way to the lexer who can ask its input stream
        for the file name or whatever.
        """

        raise NotImplementedError
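
# Illustrative sketch (not part of the original module): a minimal concrete
# IntStream over a plain list of ints, showing the contract the abstract
# methods are expected to honour.  The class and attribute names are made up.
#
#   class ListIntStream(IntStream):
#       def __init__(self, ints):
#           self.ints = ints
#           self.p = 0
#           self.markers = []
#
#       def consume(self):
#           if self.p < len(self.ints):
#               self.p += 1
#
#       def LA(self, i):
#           if i == 0:
#               return 0
#           if i < 0:
#               i += 1          # LA(-1) is the symbol just consumed
#           try:
#               return self.ints[self.p + i - 1]
#           except IndexError:
#               return EOF
#
#       def index(self):
#           return self.p
#
#       def mark(self):
#           self.markers.append(self.p)
#           return len(self.markers)
#
#       def rewind(self, marker=None):
#           if marker is None:
#               marker = len(self.markers)
#           self.seek(self.markers[marker - 1])
#           self.release(marker)
#
#       def release(self, marker=None):
#           if marker is None:
#               marker = len(self.markers)
#           del self.markers[marker - 1:]
#
#       def seek(self, index):
#           self.p = index
#
#       def size(self):
#           return len(self.ints)
#
#       def getSourceName(self):
#           return None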


class CharStream(IntStream):
    """
    @brief A source of characters for an ANTLR lexer.

    This is an abstract class that must be implemented by a subclass.

    """

    # pylint does not realize that this is an interface, too
    #pylint: disable-msg=W0223

    EOF = -1


    def substring(self, start, stop):
        """
        For infinite streams, you don't need this; primarily I'm providing
        a useful interface for action code.  Just make sure actions don't
        use this on streams that don't support it.
        """

        raise NotImplementedError


    def LT(self, i):
        """
        Get the ith character of lookahead.  This is usually the same as
        LA(i).  This will be used for labels in the generated
        lexer code.  I'd prefer to return a char here type-wise, but it's
        probably better to be 32-bit clean and be consistent with LA.
        """

        raise NotImplementedError


    def getLine(self):
        """ANTLR tracks the line information automatically"""

        raise NotImplementedError


    def setLine(self, line):
        """
        Because this stream can rewind, we need to be able to reset the line
        """

        raise NotImplementedError


    def getCharPositionInLine(self):
        """
        The index of the character relative to the beginning of the line 0..n-1
        """

        raise NotImplementedError


    def setCharPositionInLine(self, pos):
        raise NotImplementedError


class TokenStream(IntStream):
    """
    @brief A stream of tokens accessing tokens from a TokenSource

    This is an abstract class that must be implemented by a subclass.

    """

    # pylint does not realize that this is an interface, too
    #pylint: disable-msg=W0223

    def LT(self, k):
        """
        Get Token at current input pointer + i ahead where i=1 is next Token.
        i<0 indicates tokens in the past.  So -1 is previous token and -2 is
        two tokens ago.  LT(0) is undefined.  For i>=n, return Token.EOFToken.
        Return null for LT(0) and any index that results in an absolute address
        that is negative.
        """

        raise NotImplementedError


    def range(self):
        """
        How far ahead has the stream been asked to look?  The return
        value is a valid index from 0..n-1.
        """

        raise NotImplementedError


    def get(self, i):
        """
        Get a token at an absolute index i; 0..n-1.  This is really only
        needed for profiling and debugging and token stream rewriting.
        If you don't want to buffer up tokens, then this method makes no
        sense for you.  Naturally you can't use the rewrite stream feature.
        I believe DebugTokenStream can easily be altered to not use
        this method, removing the dependency.
        """

        raise NotImplementedError


    def getTokenSource(self):
        """
        Where is this stream pulling tokens from?  This is not the name, but
        the object that provides Token objects.
        """

        raise NotImplementedError


    def toString(self, start=None, stop=None):
        """
        Return the text of all tokens from start to stop, inclusive.
        If the stream does not buffer all the tokens then it can just
        return "" or null; Users should not access $ruleLabel.text in
        an action of course in that case.

        Because the user is not required to use a token with an index stored
        in it, we must provide a means for two token objects themselves to
        indicate the start/end location.  Most often this will just delegate
        to the other toString(int,int).  This is also parallel with
        the TreeNodeStream.toString(Object,Object).
        """

        raise NotImplementedError


############################################################################
#
# character streams for use in lexers
#   CharStream
#    \- ANTLRStringStream
#
############################################################################


class ANTLRStringStream(CharStream):
    """
    @brief CharStream that pulls data from a unicode string.

    A pretty quick CharStream that pulls all data from an array
    directly.  Every method call counts in the lexer.

    """


    def __init__(self, data):
        """
        @param data This should be a unicode string holding the data you want
           to parse.  If you pass in a byte string, the Lexer will choke on
           non-ascii data.

        """

        CharStream.__init__(self)

        # The data being scanned
        self.strdata = unicode(data)
        self.data = [ord(c) for c in self.strdata]

        # How many characters are actually in the buffer
        self.n = len(data)

        # 0..n-1 index into string of next char
        self.p = 0

        # line number 1..n within the input
        self.line = 1

        # The index of the character relative to the beginning of the
        # line 0..n-1
        self.charPositionInLine = 0

        # A list of CharStreamState objects that tracks the stream state
        # values line, charPositionInLine, and p that can change as you
        # move through the input stream.  Indexed from 0..markDepth-1.
        self._markers = [ ]
        self.lastMarker = None
        self.markDepth = 0

        # What is name or source of this char stream?
        self.name = None


    def reset(self):
        """
        Reset the stream so that it's in the same state it was
        when the object was created *except* the data array is not
        touched.
        """

        self.p = 0
        self.line = 1
        self.charPositionInLine = 0
        self._markers = [ ]


    def consume(self):
        try:
            if self.data[self.p] == 10: # \n
                self.line += 1
                self.charPositionInLine = 0
            else:
                self.charPositionInLine += 1

            self.p += 1

        except IndexError:
            # happens when we have reached EOF and self.data[self.p] fails
            # just do nothing
            pass


    def LA(self, i):
        if i == 0:
            return 0 # undefined

        if i < 0:
            i += 1 # e.g., translate LA(-1) to use offset i=0; then data[p+0-1]

        try:
            return self.data[self.p+i-1]
        except IndexError:
            return EOF


    def LT(self, i):
        if i == 0:
            return 0 # undefined

        if i < 0:
            i += 1 # e.g., translate LA(-1) to use offset i=0; then data[p+0-1]

        try:
            return self.strdata[self.p+i-1]
        except IndexError:
            return EOF


    def index(self):
        """
        Return the current input symbol index 0..n where n indicates the
        last symbol has been read.  The index is the index of char to
        be returned from LA(1).
        """

        return self.p


    def size(self):
        return self.n


    def mark(self):
        state = (self.p, self.line, self.charPositionInLine)
        try:
            self._markers[self.markDepth] = state
        except IndexError:
            self._markers.append(state)
        self.markDepth += 1

        self.lastMarker = self.markDepth

        return self.lastMarker


    def rewind(self, marker=None):
        if marker is None:
            marker = self.lastMarker

        p, line, charPositionInLine = self._markers[marker-1]

        self.seek(p)
        self.line = line
        self.charPositionInLine = charPositionInLine
        self.release(marker)


    def release(self, marker=None):
        if marker is None:
            marker = self.lastMarker

        self.markDepth = marker-1


    def seek(self, index):
        """
        consume() ahead until p==index; can't just set p=index as we must
        update line and charPositionInLine.
        """

        if index <= self.p:
            self.p = index # just jump; don't update stream state (line, ...)
            return

        # seek forward, consume until p hits index
        while self.p < index:
            self.consume()


    def substring(self, start, stop):
        return self.strdata[start:stop+1]


    def getLine(self):
        """Using setter/getter methods is deprecated. Use o.line instead."""
        return self.line


    def getCharPositionInLine(self):
        """
        Using setter/getter methods is deprecated. Use o.charPositionInLine
        instead.
        """
        return self.charPositionInLine


    def setLine(self, line):
        """Using setter/getter methods is deprecated. Use o.line instead."""
        self.line = line


    def setCharPositionInLine(self, pos):
        """
        Using setter/getter methods is deprecated. Use o.charPositionInLine
        instead.
        """
        self.charPositionInLine = pos


    def getSourceName(self):
        return self.name
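
# Illustrative usage sketch (not part of the original module): how lexer code
# typically drives an ANTLRStringStream.  The literal input below is made up.
#
#   stream = ANTLRStringStream(u'int x;\nint y;\n')
#   stream.LA(1)              # -> ord('i'), the next character as an int
#   stream.LT(1)              # -> u'i', the same character as a string
#   stream.consume()          # advance; line/charPositionInLine are updated
#
#   marker = stream.mark()    # remember p, line and charPositionInLine
#   while stream.LA(1) not in (ord(';'), EOF):
#       stream.consume()
#   stream.rewind(marker)     # back to the marked position, marker released
#
#   stream.substring(0, 2)    # -> u'int' (stop index is inclusive)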


class ANTLRFileStream(ANTLRStringStream):
    """
    @brief CharStream that opens a file to read the data.

    This is a char buffer stream that is loaded from a file
    all at once when you construct the object.
    """

    def __init__(self, fileName, encoding=None):
        """
        @param fileName The path to the file to be opened.  The file will be
           opened with mode 'rb'.

        @param encoding If you set the optional encoding argument, then the
           data will be decoded on the fly.

        """

        self.fileName = fileName

        fp = codecs.open(fileName, 'rb', encoding)
        try:
            data = fp.read()
        finally:
            fp.close()

        ANTLRStringStream.__init__(self, data)


    def getSourceName(self):
        """Deprecated, access o.fileName directly."""

        return self.fileName
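
# Illustrative usage sketch (not part of the original module); the file name
# and encoding below are made up.
#
#   stream = ANTLRFileStream('grammar_input.txt', encoding='utf-8')
#   stream.fileName           # -> 'grammar_input.txt'
#   stream.size()             # number of characters read from the file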


class ANTLRInputStream(ANTLRStringStream):
    """
    @brief CharStream that reads data from a file-like object.

    This is a char buffer stream that is loaded from a file like object
    all at once when you construct the object.

    All input is consumed from the file, but it is not closed.
    """

    def __init__(self, file, encoding=None):
        """
        @param file A file-like object holding your input.  Only the read()
           method must be implemented.

        @param encoding If you set the optional encoding argument, then the
           data will be decoded on the fly.

        """

        if encoding is not None:
            # wrap input in a decoding reader
            reader = codecs.lookup(encoding)[2]
            file = reader(file)

        data = file.read()

        ANTLRStringStream.__init__(self, data)


# I guess the ANTLR prefix exists only to avoid a name clash with some Java
# mumbojumbo.  A plain "StringStream" looks better to me, which should be
# the preferred name in Python.
StringStream = ANTLRStringStream
FileStream = ANTLRFileStream
InputStream = ANTLRInputStream
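
# Illustrative usage sketch (not part of the original module): any object with
# a read() method will do, e.g. a StringIO buffer or sys.stdin.
#
#   import sys
#   stream = ANTLRInputStream(sys.stdin, encoding='utf-8')
#
#   buf = StringIO(u'a + b')
#   stream = InputStream(buf)      # InputStream is the short alias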


############################################################################
#
# Token streams
#   TokenStream
#    +- CommonTokenStream
#    \- TokenRewriteStream
#
############################################################################


class CommonTokenStream(TokenStream):
    """
    @brief The most common stream of tokens

    The most common stream of tokens is one where every token is buffered up
    and tokens are prefiltered for a certain channel (the parser will only
    see these tokens and cannot change the filter channel number during the
    parse).
    """

    def __init__(self, tokenSource=None, channel=DEFAULT_CHANNEL):
        """
        @param tokenSource A TokenSource instance (usually a Lexer) to pull
            the tokens from.

        @param channel Skip tokens on any channel but this one; this is how we
            skip whitespace...

        """

        TokenStream.__init__(self)

        self.tokenSource = tokenSource

        # Record every single token pulled from the source so we can reproduce
        # chunks of it later.
        self.tokens = []

        # Map<tokentype, channel> to override some Tokens' channel numbers
        self.channelOverrideMap = {}

        # Set<tokentype>; discard any tokens with this type
        self.discardSet = set()

        # Skip tokens on any channel but this one; this is how we skip
        # whitespace...
        self.channel = channel

        # By default, track all incoming tokens
        self.discardOffChannelTokens = False

        # The index into the tokens list of the current token (next token
        # to consume).  p==-1 indicates that the tokens list is empty
        self.p = -1

        # Remember last marked position
        self.lastMarker = None

        # how deep have we gone?
        self._range = -1


    def makeEOFToken(self):
        return self.tokenSource.makeEOFToken()


    def setTokenSource(self, tokenSource):
        """Reset this token stream by setting its token source."""

        self.tokenSource = tokenSource
        self.tokens = []
        self.p = -1
        self.channel = DEFAULT_CHANNEL


    def reset(self):
        self.p = 0
        self.lastMarker = None


    def fillBuffer(self):
        """
        Load all tokens from the token source and put in tokens.
        This is done upon first LT request because you might want to
        set some token type / channel overrides before filling buffer.
        """

        index = 0
        t = self.tokenSource.nextToken()
        while t is not None and t.type != EOF:
            discard = False

            if self.discardSet is not None and t.type in self.discardSet:
                discard = True

            elif self.discardOffChannelTokens and t.channel != self.channel:
                discard = True

            # is there a channel override for token type?
            try:
                overrideChannel = self.channelOverrideMap[t.type]

            except KeyError:
                # no override for this type
                pass

            else:
                if overrideChannel == self.channel:
                    t.channel = overrideChannel
                else:
                    discard = True

            if not discard:
                t.index = index
                self.tokens.append(t)
                index += 1

            t = self.tokenSource.nextToken()

        # leave p pointing at first token on channel
        self.p = 0
        self.p = self.skipOffTokenChannels(self.p)


    def consume(self):
        """
        Move the input pointer to the next incoming token.  The stream
        must become active with LT(1) available.  consume() simply
        moves the input pointer so that LT(1) points at the next
        input symbol.  Consume at least one token.

        Walk past any token not on the channel the parser is listening to.
        """

        if self.p < len(self.tokens):
            self.p += 1

            self.p = self.skipOffTokenChannels(self.p) # leave p on valid token


    def skipOffTokenChannels(self, i):
        """
        Given a starting index, return the index of the first on-channel
        token.
        """

        try:
            while self.tokens[i].channel != self.channel:
                i += 1
        except IndexError:
            # hit the end of token stream
            pass

        return i


    def skipOffTokenChannelsReverse(self, i):
        while i >= 0 and self.tokens[i].channel != self.channel:
            i -= 1

        return i


    def setTokenTypeChannel(self, ttype, channel):
        """
        A simple filter mechanism whereby you can tell this token stream
        to force all tokens of type ttype to be on channel.  For example,
        when interpreting, we cannot exec actions so we need to tell
        the stream to force all WS and NEWLINE to be a different, ignored
        channel.
        """

        self.channelOverrideMap[ttype] = channel


    def discardTokenType(self, ttype):
        self.discardSet.add(ttype)
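
    # Illustrative sketch (not part of the original module): overrides must be
    # registered before the first LT()/LA() call triggers fillBuffer().  The
    # lexer instance and the WS/SEMI token types below are assumed to exist.
    #
    #   tokens = CommonTokenStream(lexer)
    #   tokens.setTokenTypeChannel(WS, 99)   # WS now maps to channel 99; since
    #                                        # 99 != tokens.channel, WS tokens
    #                                        # are dropped while filling
    #   tokens.discardTokenType(SEMI)        # drop SEMI tokens entirely
    #   tokens.LT(1)                         # first lookahead fills the buffer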


    def getTokens(self, start=None, stop=None, types=None):
        """
        Given a start and stop index, return a list of all tokens in
        the token type set.  Return None if no tokens were found.  This
        method looks at both on and off channel tokens.
        """

        if self.p == -1:
            self.fillBuffer()

        if stop is None or stop > len(self.tokens):
            stop = len(self.tokens)

        if start is None or start < 0:
            start = 0

        if start > stop:
            return None

        if isinstance(types, (int, long)):
            # called with a single type, wrap into set
            types = set([types])

        filteredTokens = [
            token for token in self.tokens[start:stop]
            if types is None or token.type in types
            ]

        if len(filteredTokens) == 0:
            return None

        return filteredTokens


    def LT(self, k):
        """
        Get the kth token from the current position 1..n where k=1 is the
        first symbol of lookahead.
        """

        if self.p == -1:
            self.fillBuffer()

        if k == 0:
            return None

        if k < 0:
            return self.LB(-k)

        i = self.p
        n = 1
        # find k good tokens
        while n < k:
            # skip off-channel tokens
            i = self.skipOffTokenChannels(i+1) # leave p on valid token
            n += 1

        if i > self._range:
            self._range = i

        try:
            return self.tokens[i]
        except IndexError:
            return self.makeEOFToken()


    def LB(self, k):
        """Look backwards k on-channel tokens"""

        if self.p == -1:
            self.fillBuffer()

        if k == 0:
            return None

        if self.p - k < 0:
            return None

        i = self.p
        n = 1
        # find k good tokens looking backwards
        while n <= k:
            # skip off-channel tokens
            i = self.skipOffTokenChannelsReverse(i-1) # leave p on valid token
            n += 1

        if i < 0:
            return None

        return self.tokens[i]


    def get(self, i):
        """
        Return absolute token i; ignore which channel the tokens are on;
        that is, count all tokens not just on-channel tokens.
        """

        return self.tokens[i]


    def slice(self, start, stop):
        if self.p == -1:
            self.fillBuffer()

        if start < 0 or stop < 0:
            return None

        return self.tokens[start:stop+1]


    def LA(self, i):
        return self.LT(i).type


    def mark(self):
        self.lastMarker = self.index()
        return self.lastMarker


    def release(self, marker=None):
        # no resources to release
        pass


    def size(self):
        return len(self.tokens)


    def range(self):
        return self._range


    def index(self):
        return self.p


    def rewind(self, marker=None):
        if marker is None:
            marker = self.lastMarker

        self.seek(marker)


    def seek(self, index):
        self.p = index


    def getTokenSource(self):
        return self.tokenSource


    def getSourceName(self):
        return self.tokenSource.getSourceName()


    def toString(self, start=None, stop=None):
        if self.p == -1:
            self.fillBuffer()

        if start is None:
            start = 0
        elif not isinstance(start, int):
            start = start.index

        if stop is None:
            stop = len(self.tokens) - 1
        elif not isinstance(stop, int):
            stop = stop.index

        if stop >= len(self.tokens):
            stop = len(self.tokens) - 1

        return ''.join([t.text for t in self.tokens[start:stop+1]])
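
# Illustrative usage sketch (not part of the original module): wiring a
# generated lexer and parser through a CommonTokenStream.  'TLexer', 'TParser'
# and the start rule are hypothetical names produced by the ANTLR tool.
#
#   char_stream = ANTLRStringStream(u'x = 1\n')
#   lexer = TLexer(char_stream)
#   tokens = CommonTokenStream(lexer)
#   parser = TParser(tokens)
#   parser.startRule()
#
#   tokens.toString()     # text of all buffered tokens
#   tokens.get(0)         # first token, regardless of channel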


class RewriteOperation(object):
    """@brief Internal helper class."""

    def __init__(self, stream, index, text):
        self.stream = stream

        # What index into rewrites List are we?
        self.instructionIndex = None

        # Token buffer index.
        self.index = index
        self.text = text

    def execute(self, buf):
        """Execute the rewrite operation by possibly adding to the buffer.
        Return the index of the next token to operate on.
        """

        return self.index

    def toString(self):
        opName = self.__class__.__name__
        return '<%s@%d:"%s">' % (
            opName, self.index, self.text)

    __str__ = toString
    __repr__ = toString


class InsertBeforeOp(RewriteOperation):
    """@brief Internal helper class."""

    def execute(self, buf):
        buf.write(self.text)
        if self.stream.tokens[self.index].type != EOF:
            buf.write(self.stream.tokens[self.index].text)
        return self.index + 1


class ReplaceOp(RewriteOperation):
    """
    @brief Internal helper class.

    I'm going to try replacing range from x..y with (y-x)+1 ReplaceOp
    instructions.
    """

    def __init__(self, stream, first, last, text):
        RewriteOperation.__init__(self, stream, first, text)
        self.lastIndex = last


    def execute(self, buf):
        if self.text is not None:
            buf.write(self.text)

        return self.lastIndex + 1


    def toString(self):
        if self.text is None:
            return '<DeleteOp@%d..%d>' % (self.index, self.lastIndex)

        return '<ReplaceOp@%d..%d:"%s">' % (
            self.index, self.lastIndex, self.text)

    __str__ = toString
    __repr__ = toString


class TokenRewriteStream(CommonTokenStream):
    """@brief CommonTokenStream that can be modified.

    Useful for dumping out the input stream after doing some
    augmentation or other manipulations.

    You can insert stuff, replace, and delete chunks.  Note that the
    operations are done lazily--only if you convert the buffer to a
    String.  This is very efficient because you are not moving data around
    all the time.  As the buffer of tokens is converted to strings, the
    toString() method(s) check to see if there is an operation at the
    current index.  If so, the operation is done and then normal String
    rendering continues on the buffer.  This is like having multiple Turing
    machine instruction streams (programs) operating on a single input tape. :)

    Since the operations are done lazily at toString-time, operations do not
    screw up the token index values.  That is, an insert operation at token
    index i does not change the index values for tokens i+1..n-1.

    Because operations never actually alter the buffer, you may always get
    the original token stream back without undoing anything.  Since
    the instructions are queued up, you can easily simulate transactions and
    roll back any changes if there is an error just by removing instructions.
    For example,

      CharStream input = new ANTLRFileStream("input");
      TLexer lex = new TLexer(input);
      TokenRewriteStream tokens = new TokenRewriteStream(lex);
      T parser = new T(tokens);
      parser.startRule();

    Then in the rules, you can execute
      Token t,u;
      ...
      input.insertAfter(t, "text to put after t");
      input.insertAfter(u, "text after u");
      System.out.println(tokens.toString());

    Actually, you have to cast the 'input' to a TokenRewriteStream. :(

    You can also have multiple "instruction streams" and get multiple
    rewrites from a single pass over the input.  Just name the instruction
    streams and use that name again when printing the buffer.  This could be
    useful for generating a C file and also its header file--all from the
    same buffer:

      tokens.insertAfter("pass1", t, "text to put after t");
      tokens.insertAfter("pass2", u, "text after u");
      System.out.println(tokens.toString("pass1"));
      System.out.println(tokens.toString("pass2"));

    If you don't use named rewrite streams, a "default" stream is used as
    the first example shows.
    """

    DEFAULT_PROGRAM_NAME = "default"
    MIN_TOKEN_INDEX = 0

    def __init__(self, tokenSource=None, channel=DEFAULT_CHANNEL):
        CommonTokenStream.__init__(self, tokenSource, channel)

        # You may have multiple, named streams of rewrite operations.
        # I'm calling these things "programs."
        # Maps String (name) -> rewrite (List)
        self.programs = {}
        self.programs[self.DEFAULT_PROGRAM_NAME] = []

        # Map String (program name) -> Integer index
        self.lastRewriteTokenIndexes = {}
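
    # Illustrative sketch (not part of the original module): the Java example
    # from the class docstring translated to this runtime.  'TLexer', 'T' and
    # the start rule are hypothetical classes generated by the ANTLR tool.
    #
    #   input = ANTLRFileStream("input")
    #   lex = TLexer(input)
    #   tokens = TokenRewriteStream(lex)
    #   parser = T(tokens)
    #   parser.startRule()
    #
    #   # inside rule actions, t and u are Token objects:
    #   tokens.insertAfter(t, "text to put after t")
    #   tokens.insertAfter(u, "text after u")
    #   print tokens.toString()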


    def rollback(self, *args):
        """
        Rollback the instruction stream for a program so that
        the indicated instruction (via instructionIndex) is no
        longer in the stream.  UNTESTED!
        """

        if len(args) == 2:
            programName = args[0]
            instructionIndex = args[1]
        elif len(args) == 1:
            programName = self.DEFAULT_PROGRAM_NAME
            instructionIndex = args[0]
        else:
            raise TypeError("Invalid arguments")

        p = self.programs.get(programName, None)
        if p is not None:
            self.programs[programName] = (
                p[self.MIN_TOKEN_INDEX:instructionIndex])


    def deleteProgram(self, programName=DEFAULT_PROGRAM_NAME):
        """Reset the program so that no instructions exist"""

        self.rollback(programName, self.MIN_TOKEN_INDEX)


    def insertAfter(self, *args):
        if len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            index = args[0]
            text = args[1]

        elif len(args) == 3:
            programName = args[0]
            index = args[1]
            text = args[2]

        else:
            raise TypeError("Invalid arguments")

        if isinstance(index, Token):
            # index is a Token, grab the stream index from it
            index = index.index

        # to insert after, just insert before next index (even if past end)
        self.insertBefore(programName, index+1, text)


    def insertBefore(self, *args):
        if len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            index = args[0]
            text = args[1]

        elif len(args) == 3:
            programName = args[0]
            index = args[1]
            text = args[2]

        else:
            raise TypeError("Invalid arguments")

        if isinstance(index, Token):
            # index is a Token, grab the stream index from it
            index = index.index

        op = InsertBeforeOp(self, index, text)
        rewrites = self.getProgram(programName)
        op.instructionIndex = len(rewrites)
        rewrites.append(op)


    def replace(self, *args):
        if len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            first = args[0]
            last = args[0]
            text = args[1]

        elif len(args) == 3:
            programName = self.DEFAULT_PROGRAM_NAME
            first = args[0]
            last = args[1]
            text = args[2]

        elif len(args) == 4:
            programName = args[0]
            first = args[1]
            last = args[2]
            text = args[3]

        else:
            raise TypeError("Invalid arguments")

        if isinstance(first, Token):
            # first is a Token, grab the stream index from it
            first = first.index

        if isinstance(last, Token):
            # last is a Token, grab the stream index from it
            last = last.index

        if first > last or first < 0 or last < 0 or last >= len(self.tokens):
            raise ValueError(
                "replace: range invalid: %d..%d (size=%d)"
                % (first, last, len(self.tokens)))

        op = ReplaceOp(self, first, last, text)
        rewrites = self.getProgram(programName)
        op.instructionIndex = len(rewrites)
        rewrites.append(op)


    def delete(self, *args):
        self.replace(*(list(args) + [None]))
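
    # Illustrative usage sketch (not part of the original module): queueing a
    # few rewrite operations and rendering the result; the token indexes refer
    # to a buffer assumed to already hold the tokens for "x = 1 ;".
    #
    #   tokens.insertBefore(0, "int ")        # insert before token 0
    #   tokens.replace(2, "42")               # replace the single token 2
    #   tokens.delete(3)                      # delete == replace with None
    #   tokens.toOriginalString()             # original text, no rewrites
    #   tokens.toString()                     # text with all rewrites applied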


    def getLastRewriteTokenIndex(self, programName=DEFAULT_PROGRAM_NAME):
        return self.lastRewriteTokenIndexes.get(programName, -1)


    def setLastRewriteTokenIndex(self, programName, i):
        self.lastRewriteTokenIndexes[programName] = i


    def getProgram(self, name):
        p = self.programs.get(name, None)
        if p is None:
            p = self.initializeProgram(name)

        return p


    def initializeProgram(self, name):
        p = []
        self.programs[name] = p
        return p


    def toOriginalString(self, start=None, end=None):
        if self.p == -1:
            self.fillBuffer()

        if start is None:
            start = self.MIN_TOKEN_INDEX
        if end is None:
            end = self.size() - 1

        buf = StringIO()
        i = start
        while i >= self.MIN_TOKEN_INDEX and i <= end and i < len(self.tokens):
            if self.get(i).type != EOF:
                buf.write(self.get(i).text)
            i += 1

        return buf.getvalue()


    def toString(self, *args):
        if self.p == -1:
            self.fillBuffer()

        if len(args) == 0:
            programName = self.DEFAULT_PROGRAM_NAME
            start = self.MIN_TOKEN_INDEX
            end = self.size() - 1

        elif len(args) == 1:
            programName = args[0]
            start = self.MIN_TOKEN_INDEX
            end = self.size() - 1

        elif len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            start = args[0]
            end = args[1]

        if start is None:
            start = self.MIN_TOKEN_INDEX
        elif not isinstance(start, int):
            start = start.index

        if end is None:
            end = len(self.tokens) - 1
        elif not isinstance(end, int):
            end = end.index

        # ensure start/end are in range
        if end >= len(self.tokens):
            end = len(self.tokens) - 1

        if start < 0:
            start = 0

        rewrites = self.programs.get(programName)
        if rewrites is None or len(rewrites) == 0:
            # no instructions to execute
            return self.toOriginalString(start, end)

        buf = StringIO()

        # First, optimize instruction stream
        indexToOp = self.reduceToSingleOperationPerIndex(rewrites)

        # Walk buffer, executing instructions and emitting tokens
        i = start
        while i <= end and i < len(self.tokens):
            op = indexToOp.get(i)
            # remove so any left have index size-1
            try:
                del indexToOp[i]
            except KeyError:
                pass

            t = self.tokens[i]
            if op is None:
                # no operation at that index, just dump token
                if t.type != EOF:
                    buf.write(t.text)
                i += 1 # move to next token

            else:
                i = op.execute(buf) # execute operation and skip

        # include stuff after end if it's last index in buffer
        # So, if they did an insertAfter(lastValidIndex, "foo"), include
        # foo if end==lastValidIndex.
        if end == len(self.tokens) - 1:
            # Scan any remaining operations after last token
            # should be included (they will be inserts).
            for i in sorted(indexToOp.keys()):
                op = indexToOp[i]
                if op.index >= len(self.tokens)-1:
                    buf.write(op.text)

        return buf.getvalue()

    __str__ = toString


    def reduceToSingleOperationPerIndex(self, rewrites):
        """
        We need to combine operations and report invalid operations (like
        overlapping replaces that are not completely nested).  Inserts to
        same index need to be combined etc...  Here are the cases:

        I.i.u I.j.v                            leave alone, nonoverlapping
        I.i.u I.i.v                            combine: Iivu

        R.i-j.u R.x-y.v | i-j in x-y           delete first R
        R.i-j.u R.i-j.v                        delete first R
        R.i-j.u R.x-y.v | x-y in i-j           ERROR
        R.i-j.u R.x-y.v | boundaries overlap   ERROR

        Delete special case of replace (text==null):
        D.i-j.u D.x-y.v | boundaries overlap   combine to max(min)..max(right)

        I.i.u R.x-y.v   | i in (x+1)-y         delete I (since insert before,
                                               we're not deleting i)
        I.i.u R.x-y.v   | i not in (x+1)-y     leave alone, nonoverlapping

        R.x-y.v I.i.u   | i in x-y             ERROR
        R.x-y.v I.x.u                          R.x-y.uv (combine, delete I)
        R.x-y.v I.i.u   | i not in x-y         leave alone, nonoverlapping

        I.i.u = insert u before op @ index i
        R.x-y.u = replace x-y indexed tokens with u

        First we need to examine replaces.  For any replace op:

          1. wipe out any insertions before op within that range.
          2. Drop any replace op before that is contained completely within
             that range.
          3. Throw exception upon boundary overlap with any previous replace.

        Then we can deal with inserts:

          1. for any inserts to same index, combine even if not adjacent.
          2. for any prior replace with same left boundary, combine this
             insert with replace and delete this replace.
          3. throw exception if index in same range as previous replace

        Don't actually delete; make op null in list.  Easier to walk list.
        Later we can throw as we add to index -> op map.

        Note that I.2 R.2-2 will wipe out I.2 even though, technically, the
        inserted stuff would be before the replace range.  But, if you
        add tokens in front of a method body '{' and then delete the method
        body, I think the stuff before the '{' you added should disappear too.

        Return a map from token index to operation.
        """

        # WALK REPLACES
        for i, rop in enumerate(rewrites):
            if rop is None:
                continue

            if not isinstance(rop, ReplaceOp):
                continue

            # Wipe prior inserts within range
            for j, iop in self.getKindOfOps(rewrites, InsertBeforeOp, i):
                if iop.index == rop.index:
                    # E.g., insert before 2, delete 2..2; update replace
                    # text to include insert before, kill insert
                    rewrites[iop.instructionIndex] = None
                    rop.text = self.catOpText(iop.text, rop.text)

                elif iop.index > rop.index and iop.index <= rop.lastIndex:
                    # delete insert as it's a no-op.
                    rewrites[j] = None

            # Drop any prior replaces contained within
            for j, prevRop in self.getKindOfOps(rewrites, ReplaceOp, i):
                if (prevRop.index >= rop.index
                    and prevRop.lastIndex <= rop.lastIndex):
                    # delete replace as it's a no-op.
                    rewrites[j] = None
                    continue

                # throw exception unless disjoint or identical
                disjoint = (prevRop.lastIndex < rop.index
                            or prevRop.index > rop.lastIndex)
                same = (prevRop.index == rop.index
                        and prevRop.lastIndex == rop.lastIndex)

                # Delete special case of replace (text==null):
                # D.i-j.u D.x-y.v | boundaries overlap   combine to
                #                                        max(min)..max(right)
                if prevRop.text is None and rop.text is None and not disjoint:
                    # kill first delete
                    rewrites[prevRop.instructionIndex] = None

                    rop.index = min(prevRop.index, rop.index)
                    rop.lastIndex = max(prevRop.lastIndex, rop.lastIndex)

                elif not disjoint and not same:
                    raise ValueError(
                        "replace op boundaries of %s overlap with previous %s"
                        % (rop, prevRop))

        # WALK INSERTS
        for i, iop in enumerate(rewrites):
            if iop is None:
                continue

            if not isinstance(iop, InsertBeforeOp):
                continue

            # combine current insert with prior if any at same index
            for j, prevIop in self.getKindOfOps(rewrites, InsertBeforeOp, i):
                if prevIop.index == iop.index: # combine objects
                    # convert to strings...we're in process of toString'ing
                    # whole token buffer so no lazy eval issue with any
                    # templates
                    iop.text = self.catOpText(iop.text, prevIop.text)
                    # delete redundant prior insert
                    rewrites[j] = None

            # look for replaces where iop.index is in range; error
            for j, rop in self.getKindOfOps(rewrites, ReplaceOp, i):
                if iop.index == rop.index:
                    rop.text = self.catOpText(iop.text, rop.text)
                    # delete current insert
                    rewrites[i] = None
                    continue

                if iop.index >= rop.index and iop.index <= rop.lastIndex:
                    raise ValueError(
                        "insert op %s within boundaries of previous %s"
                        % (iop, rop))

        m = {}
        for i, op in enumerate(rewrites):
            if op is None:
                # ignore deleted ops
                continue

            assert op.index not in m, "should only be one op per index"
            m[op.index] = op

        return m


    def catOpText(self, a, b):
        x = ""
        y = ""
        if a is not None:
            x = a
        if b is not None:
            y = b
        return x + y


    def getKindOfOps(self, rewrites, kind, before=None):
        """Get all operations before an index of a particular kind."""

        if before is None:
            before = len(rewrites)
        elif before > len(rewrites):
            before = len(rewrites)

        for i, op in enumerate(rewrites[:before]):
            if op is None:
                # ignore deleted
                continue
            if op.__class__ == kind:
                yield i, op


    def toDebugString(self, start=None, end=None):
        if start is None:
            start = self.MIN_TOKEN_INDEX
        if end is None:
            end = self.size() - 1

        buf = StringIO()
        i = start
        while i >= self.MIN_TOKEN_INDEX and i <= end and i < len(self.tokens):
            buf.write(self.get(i))
            i += 1

        return buf.getvalue()