278 lines
		
	
	
		
			8.7 KiB
		
	
	
	
		
			Plaintext
		
	
	
		
			Executable File
		
	
	
			
		
		
	
	
			278 lines
		
	
	
		
			8.7 KiB
		
	
	
	
		
			Plaintext
		
	
	
		
			Executable File
		
	
	
| /*
 | |
|  [The "BSD licence"]
 | |
|  Copyright (c) 2004 Terence Parr and Loring Craymer
 | |
|  All rights reserved.
 | |
| 
 | |
|  Redistribution and use in source and binary forms, with or without
 | |
|  modification, are permitted provided that the following conditions
 | |
|  are met:
 | |
|  1. Redistributions of source code must retain the above copyright
 | |
|     notice, this list of conditions and the following disclaimer.
 | |
|  2. Redistributions in binary form must reproduce the above copyright
 | |
|     notice, this list of conditions and the following disclaimer in the
 | |
|     documentation and/or other materials provided with the distribution.
 | |
|  3. The name of the author may not be used to endorse or promote products
 | |
|     derived from this software without specific prior written permission.
 | |
| 
 | |
|  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 | |
|  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 | |
|  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 | |
|  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 | |
|  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 | |
|  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 | |
|  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 | |
|  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 | |
|  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 | |
|  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | |
| */
 | |
| 
 | |
| 
 | |
| /** Python does not explicitly provide begin and end nesting signals.
 | |
|  Rather, the indentation level indicates when you begin and end.
 | |
|  This is an interesting lexical problem because multiple DEDENT
 | |
|  tokens should be sent to the parser sometimes without a corresponding
 | |
|  input symbol!  Consider the following example:
 | |
| 
 | |
|  a=1
 | |
|  if a>1:
 | |
|      print a
 | |
|  b=3
 | |
| 
 | |
|  Here the "b" token on the left edge signals that a DEDENT is needed
 | |
|  after the "print a \n" and before the "b".  The sequence should be
 | |
| 
 | |
|  ... 1 COLON NEWLINE INDENT PRINT a NEWLINE DEDENT b ASSIGN 3 ...
 | |
| 
 | |
|  For more examples, see the big comment at the bottom of this file.
 | |
| 
 | |
|  This TokenStream normally just passes tokens through to the parser.
 | |
|  Upon NEWLINE token from the lexer, however, an INDENT or DEDENT token
 | |
|  may need to be sent to the parser.  The NEWLINE is the trigger for
 | |
|  this class to do it's job.  NEWLINE is saved and then the first token
 | |
|  of the next line is examined.  If non-leading-whitespace token,
 | |
|  then check against stack for indent vs dedent.  If LEADING_WS, then
 | |
|  the column of the next non-whitespace token will dictate indent vs
 | |
|  dedent.  The column of the next real token is number of spaces
 | |
|  in the LEADING_WS token + 1 (to move past the whitespace).  The
 | |
|  lexer grammar must set the text of the LEADING_WS token to be
 | |
|  the proper number of spaces (and do tab conversion etc...).
 | |
| 
 | |
|  A stack of column numbers is tracked and used to detect changes
 | |
|  in indent level from one token to the next.
 | |
| 
 | |
|  A queue of tokens is built up to hold multiple DEDENT tokens that
 | |
|  are generated.  Before asking the lexer for another token via
 | |
|  nextToken(), the queue is flushed first one token at a time.
 | |
| 
 | |
|  Terence Parr and Loring Craymer
 | |
|  February 2004
 | |
|  */
 | |
| PythonTokenSource = function(stream) {
 | |
|     this.stream = stream;
 | |
| 	/** The stack of indent levels (column numbers) */
 | |
| 	this.indentStack = new Array(PythonTokenSource.MAX_INDENTS);
 | |
| 	/** stack pointer */
 | |
| 	this.sp=-1; // grow upwards
 | |
| 
 | |
| 	/** The queue of tokens */
 | |
| 	this.tokens = [];
 | |
| 	this.lastTokenAddedIndex = -1;
 | |
| 	this.push(PythonTokenSource.FIRST_CHAR_POSITION);
 | |
| };
 | |
| 
 | |
| ANTLR.lang.augmentObject(PythonTokenSource, {
 | |
| 	MAX_INDENTS: 100,
 | |
| 	FIRST_CHAR_POSITION: 0,
 | |
| });
 | |
| 
 | |
| PythonTokenSource.prototype = {
 | |
| 	getSourceName: function() {
 | |
| 		return this.stream.getSourceName();
 | |
| 	},
 | |
| 
 | |
| 	/** From http://www.python.org/doc/2.2.3/ref/indentation.html
 | |
| 
 | |
| 	 "Before the first line of the file is read, a single zero is
 | |
| 	 pushed on the stack; this will never be popped off again. The
 | |
| 	 numbers pushed on the stack will always be strictly increasing
 | |
| 	 from bottom to top. At the beginning of each logical line, the
 | |
| 	 line's indentation level is compared to the top of the
 | |
| 	 stack. If it is equal, nothing happens. If it is larger, it is
 | |
| 	 pushed on the stack, and one INDENT token is generated. If it
 | |
| 	 is smaller, it must be one of the numbers occurring on the
 | |
| 	 stack; all numbers on the stack that are larger are popped
 | |
| 	 off, and for each number popped off a DEDENT token is
 | |
| 	 generated. At the end of the file, a DEDENT token is generated
 | |
| 	 for each number remaining on the stack that is larger than
 | |
| 	 zero."
 | |
| 
 | |
| 	 I use char position in line 0..n-1 instead.
 | |
| 
 | |
| 	 The DEDENTS possibly needed at EOF are gracefully handled by forcing
 | |
| 	 EOF to have char pos 0 even though with UNIX it's hard to get EOF
 | |
| 	 at a non left edge.
 | |
| 	 */
 | |
| 	nextToken: function() {
 | |
| 		// if something in queue, just remove and return it
 | |
| 		if (this.tokens.length>0 ) {
 | |
| 			var t = this.tokens[0];
 | |
| 			this.tokens.splice(0,1);
 | |
| 			return t;
 | |
| 		}
 | |
| 
 | |
| 		this.insertImaginaryIndentDedentTokens();
 | |
| 
 | |
| 		return this.nextToken();
 | |
| 	},
 | |
| 
 | |
| 	insertImaginaryIndentDedentTokens: function()
 | |
| 	{
 | |
| 		var t = this.stream.LT(1);
 | |
| 		this.stream.consume();
 | |
| 
 | |
| 		// if not a NEWLINE, doesn't signal indent/dedent work; just enqueue
 | |
| 		if ( t.getType()!=PythonLexer.NEWLINE ) {
 | |
| 			var hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1,t.getTokenIndex()-1);
 | |
| 			if ( hiddenTokens!=null ) {
 | |
| 				this.tokens = this.tokens.concat(hiddenTokens);
 | |
| 			}
 | |
| 			this.lastTokenAddedIndex = t.getTokenIndex();
 | |
| 			this.tokens.push(t);
 | |
| 			return;
 | |
| 		}
 | |
| 
 | |
| 		// save NEWLINE in the queue
 | |
| 		var hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1,t.getTokenIndex()-1);
 | |
| 		if ( hiddenTokens!=null ) {
 | |
| 			this.tokens = this.tokens.concat(hiddenTokens);
 | |
| 		}
 | |
| 		this.lastTokenAddedIndex = t.getTokenIndex();
 | |
| 		this.tokens.push(t);
 | |
| 
 | |
| 		// grab first token of next line
 | |
| 		t = this.stream.LT(1);
 | |
| 		this.stream.consume();
 | |
| 
 | |
| 		hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1,t.getTokenIndex()-1);
 | |
| 		if ( hiddenTokens!=null ) {
 | |
| 			this.tokens = this.tokens.concat(hiddenTokens);
 | |
| 		}
 | |
| 		this.lastTokenAddedIndex = t.getTokenIndex();
 | |
| 
 | |
| 		// compute cpos as the char pos of next non-WS token in line
 | |
| 		var cpos = t.getCharPositionInLine(); // column dictates indent/dedent
 | |
| 		if ( t.getType()==ANTLR.runtime.Token.EOF ) {
 | |
| 			cpos = -1; // pretend EOF always happens at left edge
 | |
| 		}
 | |
| 		else if ( t.getType()==PythonLexer.LEADING_WS ) {
 | |
| 			cpos = t.getText().length;
 | |
| 		}
 | |
| 
 | |
| 		// compare to last indent level
 | |
| 		var lastIndent = this.peek();
 | |
| 		if ( cpos > lastIndent ) { // they indented; track and gen INDENT
 | |
| 			this.push(cpos);
 | |
| 			var indent = new ANTLR.runtime.CommonToken(PythonParser.INDENT, "");
 | |
| 			indent.setCharPositionInLine(t.getCharPositionInLine());
 | |
| 			indent.setLine(t.getLine());
 | |
| 			this.tokens.push(indent);
 | |
| 		}
 | |
| 		else if ( cpos < lastIndent ) { // they dedented
 | |
| 			// how far back did we dedent?
 | |
| 			var prevIndex = this.findPreviousIndent(cpos);
 | |
| 			// generate DEDENTs for each indent level we backed up over
 | |
| 			for (var d=this.sp-1; d>=prevIndex; d--) {
 | |
| 				var dedent = new ANTLR.runtime.CommonToken(PythonParser.DEDENT, "");
 | |
| 				dedent.setCharPositionInLine(t.getCharPositionInLine());
 | |
| 				dedent.setLine(t.getLine());
 | |
| 				this.tokens.push(dedent);
 | |
| 			}
 | |
| 			this.sp = prevIndex; // pop those off indent level
 | |
| 		}
 | |
| 		if ( t.getType()!=PythonLexer.LEADING_WS ) { // discard WS
 | |
| 			this.tokens.push(t);
 | |
| 		}
 | |
| 	},
 | |
| 
 | |
| 	//  T O K E N  S T A C K  M E T H O D S
 | |
| 
 | |
| 	push: function(i) {
 | |
| 		if (this.sp>=PythonTokenSource.MAX_INDENTS) {
 | |
| 			throw new Error("stack overflow");
 | |
| 		}
 | |
| 		this.sp++;
 | |
| 		this.indentStack[this.sp] = i;
 | |
| 	},
 | |
| 
 | |
| 	pop: function() {
 | |
| 		if (this.sp<0) {
 | |
| 			throw new Error("stack underflow");
 | |
| 		}
 | |
| 		var top = this.indentStack[this.sp];
 | |
| 		this.sp--;
 | |
| 		return top;
 | |
| 	},
 | |
| 
 | |
| 	peek: function() {
 | |
| 		return this.indentStack[this.sp];
 | |
| 	},
 | |
| 
 | |
| 	/** Return the index on stack of previous indent level == i else -1 */
 | |
| 	findPreviousIndent: function(i) {
 | |
| 		for (var j=this.sp-1; j>=0; j--) {
 | |
| 			if (this.indentStack[j]==i ) {
 | |
| 				return j;
 | |
| 			}
 | |
| 		}
 | |
| 		return PythonTokenSource.FIRST_CHAR_POSITION;
 | |
| 	},
 | |
| 
 | |
| 	stackString: function() {
 | |
| 		var buf = [];
 | |
| 		for (var j=this.sp; j>=0; j--) {
 | |
| 			buf.push(this.indentStack[j]);
 | |
| 		}
 | |
| 		return buf.join(" ");
 | |
| 	}
 | |
| 
 | |
| }
 | |
| 
 | |
| /* More example input / output pairs with code simplified to single chars
 | |
| ------- t1 -------
 | |
| a a
 | |
|         b b
 | |
|         c
 | |
| d
 | |
| a a \n INDENT b b \n c \n DEDENT d \n EOF
 | |
| ------- t2 -------
 | |
| a  c
 | |
|  b
 | |
| c
 | |
| a c \n INDENT b \n DEDENT c \n EOF 
 | |
| ------- t3 -------
 | |
| a
 | |
|         b
 | |
|                 c
 | |
| d
 | |
| a \n INDENT b \n INDENT c \n DEDENT DEDENT d \n EOF 
 | |
| ------- t4 -------
 | |
| a
 | |
|     c
 | |
|                   d
 | |
|     e
 | |
|     f
 | |
|              g
 | |
|              h
 | |
|              i
 | |
|               j
 | |
|     k
 | |
| a \n INDENT c \n INDENT d \n DEDENT e \n f \n INDENT g \n h \n i \n INDENT j \n DEDENT DEDENT k \n DEDENT EOF 
 | |
| ------- t5 -------
 | |
| a
 | |
|         b
 | |
|         c
 | |
|                 d
 | |
|                 e
 | |
| a \n INDENT b \n c \n INDENT d \n e \n DEDENT DEDENT EOF 
 | |
| */
 |