#!/usr/bin/ruby
# encoding: utf-8

=begin LICENSE

[The "BSD licence"]
Copyright (c) 2009-2010 Kyle Yetter
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

 1. Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.
 3. The name of the author may not be used to endorse or promote products
    derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

=end

module ANTLR3

unless const_defined?( :RecognizerSharedState )

RecognizerSharedState = Struct.new(
  :following,
  :error_recovery,
  :last_error_index,
  :backtracking,
  :rule_memory,
  :syntax_errors,
  :token,
  :token_start_position,
  :token_start_line,
  :token_start_column,
  :channel,
  :type,
  :text
)

=begin rdoc ANTLR3::RecognizerSharedState

A big Struct-based class containing most of the data that makes up a
recognizer's state. These attributes are externalized from the recognizer itself
so that recognizer delegation (which occurs when you import other grammars into
your grammar) can function; multiple recognizers can share a common state.

== Structure Attributes

following::
  a stack that tracks follow sets for error recovery
error_recovery::
  a flag indicating whether or not the recognizer is in error recovery mode
last_error_index::
  the index in the input stream of the last error
backtracking::
  tracks the backtracking depth
rule_memory::
  if a grammar is compiled with the memoization option, this will be
  set to a hash mapping previously parsed rules to cached indices
syntax_errors::
  tracks the number of syntax errors seen so far
token::
  holds newly constructed tokens for lexer rules
token_start_position::
  the input stream index at which the token starts
token_start_line::
  the input stream line number at which the token starts
token_start_column::
  the input stream column at which the token starts
channel::
  the channel value of the target token
type::
  the type value of the target token
text::
  the text of the target token

=end

class RecognizerSharedState
  def initialize
    super( [], false, -1, 0, nil, 0, nil, -1 )
    # ^-- same as this --v
    # self.following = []
    # self.error_recovery = false
    # self.last_error_index = -1
    # self.backtracking = 0
    # self.syntax_errors = 0
    # self.token_start_position = -1
  end

  # restores all of the state variables to their respective
  # initial default values
  def reset!
    self.following.clear
    self.error_recovery = false
    self.last_error_index = -1
    self.backtracking = 0
    self.rule_memory and rule_memory.clear
    self.syntax_errors = 0
    self.token = nil
    self.token_start_position = -1
    self.token_start_line = nil
    self.token_start_column = nil
    self.channel = nil
    self.type = nil
    self.text = nil
  end
end
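
# A minimal usage sketch: a recognizer normally builds its own state, but a
# single RecognizerSharedState may be passed to several recognizers via the
# :state option (this is how delegated grammars share error and backtracking
# bookkeeping). The grammar classes here are hypothetical:
#
#   state = ANTLR3::RecognizerSharedState.new
#   lexer = Hypothetical::Lexer.new( "some input", :state => state )
#   state.syntax_errors   # => 0
#   state.reset!          # restore every field to its initial default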

end # unless const_defined?( :RecognizerSharedState )

=begin rdoc ANTLR3::Scope

= Scope

Scope is used to represent instances of ANTLR's various attribute scopes.
It is identical to Ruby's built-in Struct class, but it takes string
attribute declarations from the ANTLR grammar as parameters, and overrides
the #initialize method to set the default values if any are present in
the scope declaration.

  Block = Scope.new( "name", "depth = 0", "variables = {}" )
  Block.new                    # => #<struct Block name=nil, depth=0, variables={}>
  Block.new( "function" )      # => #<struct Block name="function", depth=0, variables={}>
  Block.new( 'a', 1, :x => 3 ) # => #<struct Block name="a", depth=1, variables={ :x => 3 }>

=end

class Scope < ::Struct
  def self.new( *declarations, &body )
    names = []
    defaults = {}
    for decl in declarations
      name, default = decl.to_s.split( /\s*=\s*/, 2 )
      names << ( name = name.to_sym )
      default and defaults[ name ] = default
    end
    super( *names ) do

      # If no defaults, leave the initialize method the same as
      # the struct's default initialize for speed. Otherwise,
      # overwrite the initialize to populate with default values.
      unless defaults.empty?
        parameters = names.map do | name |
          "#{ name } = " << defaults.fetch( name, 'nil' )
        end.join( ', ' )
        class_eval( <<-END )
          def initialize( #{ parameters } )
            super( #{ names.join( ', ' ) } )
          end
        END
      end

      body and class_eval( &body )
    end
  end
end

=begin rdoc ANTLR3::Recognizer

= Recognizer

As the base class of all ANTLR-generated recognizers, Recognizer provides
much of the shared functionality and structure used in the recognition process.
For all effective purposes, the class and its immediate subclasses Lexer,
Parser, and TreeParser are abstract classes. They can be instantiated, but
they're pretty useless on their own. Instead, to make useful code, you write an
ANTLR grammar and ANTLR will generate classes which inherit from one of the
recognizer base classes, providing the implementation of the grammar rules
itself. Recognizer defines methods related to:

* token and character matching
* prediction and recognition strategy
* recovering from errors
* reporting errors
* memoization
* simple rule tracing and debugging

=end

class Recognizer
  include Constants
  include Error
  include TokenFactory
  extend ClassMacros

  @rules = {}

  # inherited class methods and hooks
  class << self
    attr_reader :grammar_file_name,
                :antlr_version,
                :antlr_version_string,
                :library_version_string,
                :grammar_home

    attr_accessor :token_scheme, :default_rule

    # generated recognizer code uses this method to stamp
    # the code with the name of the grammar file and
    # the current version of ANTLR being used to generate
    # the code
    def generated_using( grammar_file, antlr_version, library_version = nil )
      @grammar_file_name = grammar_file.freeze
      @antlr_version_string = antlr_version.freeze
      @library_version = Util.parse_version( library_version )
      if @antlr_version_string =~ /^(\d+)\.(\d+)(?:\.(\d+)(?:b(\d+))?)?(.*)$/
        @antlr_version = [ $1, $2, $3, $4 ].map! { |str| str.to_i }
        timestamp = $5.strip
        #@antlr_release_time = $5.empty? ? nil : Time.parse($5)
      else
        raise "bad version string: %p" % @antlr_version_string
      end
    end
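
    # For illustration only -- a hedged sketch of the stamp call ANTLR writes
    # into generated code (the grammar name and version values are invented):
    #
    #   generated_using( "Hypothetical.g", "3.2.1 Sep 23, 2009 14:05:07", "1.8.1" )
    #
    # which parses the version string into @antlr_version as [ 3, 2, 1, 0 ].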

    # this method is used to generate return-value structures for
    # rules with multiple return values. To avoid generating
    # a special class for every rule in AST parsers and such
    # (where most rules have the same default set of return values),
    # each recognizer gets a default return value structure
    # assigned to the constant +Return+. Rules which don't
    # require additional custom members will have a rule-return
    # name constant that just points to the generic return
    # value.
    def define_return_scope( *members )
      if members.empty? then generic_return_scope
      else
        members += return_scope_members
        Struct.new( *members )
      end
    end

    # used as a hook to add additional default members
    # to default return value structures.
    # For example, all AST-building parsers override
    # this method to add an extra +:tree+ field to
    # all rule return structures.
    def return_scope_members
      [ :start, :stop ]
    end

    # sets up and returns the generic rule return
    # scope for a recognizer
    def generic_return_scope
      @generic_return_scope ||= begin
        struct = Struct.new( *return_scope_members )
        const_set( :Return, struct )
      end
    end
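
    # A short sketch of how these hooks fit together (the :value member is
    # invented for illustration). A rule with extra return values gets its
    # own struct, while plain rules share the generic +Return+ scope:
    #
    #   scope = define_return_scope( :value )
    #   scope.members          # => [ :value, :start, :stop ] (symbols on 1.9+)
    #   define_return_scope    # => Return, the memoized generic scope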

    def imported_grammars
      @imported_grammars ||= Set.new
    end

    def master_grammars
      @master_grammars ||= []
    end

    def master
      master_grammars.last
    end

    def masters( *grammar_names )
      for grammar in grammar_names
        unless master_grammars.include?( grammar )
          master_grammars << grammar
          attr_reader( Util.snake_case( grammar ) )
        end
      end
    end
    private :masters

    def imports( *grammar_names )
      for grammar in grammar_names
        imported_grammars.add?( grammar.to_sym ) and
          attr_reader( Util.snake_case( grammar ) )
      end
      return imported_grammars
    end
    private :imports

    def rules
      self::RULE_METHODS.dup rescue []
    end

    def default_rule
      @default_rule ||= rules.first
    end

    def debug?
      return false
    end

    def profile?
      return false
    end

    def Scope( *declarations, &body )
      Scope.new( *declarations, &body )
    end

    def token_class
      @token_class ||= begin
        self::Token rescue
        superclass.token_class rescue
        ANTLR3::CommonToken
      end
    end
    private :generated_using
  end

  @grammar_file_name = nil
  @antlr_version = ANTLR3::ANTLR_VERSION
  @antlr_version_string = ANTLR3::ANTLR_VERSION_STRING

  def grammar_file_name
    self.class.grammar_file_name
  end

  def antlr_version
    self.class.antlr_version
  end

  def antlr_version_string
    self.class.antlr_version_string
  end

  attr_accessor :input
  attr_reader :state

  def each_delegate
    block_given? or return enum_for( __method__ )
    for grammar in self.class.imported_grammars
      del = __send__( Util.snake_case( grammar ) ) and
        yield( del )
    end
  end

  # Create a new recognizer. The constructor simply ensures that
  # all recognizers are initialized with a shared state object.
  # See the main recognizer subclasses for more specific
  # information about creating recognizer objects like
  # lexers and parsers.
  def initialize( options = {} )
    @state = options[ :state ] || RecognizerSharedState.new
    @error_output = options.fetch( :error_output, $stderr )
    defined?( @input ) or @input = nil
    initialize_dfas
  end

  # Resets the recognizer's state data to initial values.
  # As a result, all error tracking and error recovery
  # data accumulated in the current state will be cleared.
  # It will also attempt to reset the input stream
  # via input.reset, but it ignores any errors received
  # from doing so. Thus the input stream is not guaranteed
  # to be rewound to its initial position
  def reset
    @state and @state.reset!
    @input and @input.reset rescue nil
  end

  # Attempt to match the current input symbol against the token type
  # specified by +type+. If the symbol matches the type,
  # consume the current symbol and return its value. If
  # the symbol doesn't match, attempt to use the follow-set
  # data provided by +follow+ to recover from the mismatched
  # token.
  def match( type, follow )
    matched_symbol = current_symbol
    if @input.peek == type
      @input.consume
      @state.error_recovery = false
      return matched_symbol
    end
    raise( BacktrackingFailed ) if @state.backtracking > 0
    return recover_from_mismatched_token( type, follow )
  end
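
  # A hedged sketch of how generated rule methods use #match -- the token
  # type constant and follow-set constant are hypothetical:
  #
  #   id_token = match( ID, TOKENS_FOLLOWING_ID_IN_declaration_12 )
  #
  # On success the ID token is consumed and returned; on a mismatch, the
  # follow set drives the single-token insertion/deletion recovery below.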

  # match anything -- i.e. wildcard match. Simply consume
  # the current symbol from the input stream.
  def match_any
    @state.error_recovery = false
    @input.consume
  end

  ##############################################################################################
  ###################################### Error Reporting ######################################
  ##############################################################################################

  # When a recognition error occurs, this method is the main
  # hook for carrying out the error reporting process. The
  # default implementation calls +display_recognition_error+
  # to display the error info on $stderr.
  def report_error( e = $! )
    @state.error_recovery and return
    @state.syntax_errors += 1
    @state.error_recovery = true
    display_recognition_error( e )
  end

  # error reporting hook for presenting the information.
  # The default implementation builds appropriate error
  # message text using +error_header+ and +error_message+,
  # and calls +emit_error_message+ to write the error
  # message out to some source
  def display_recognition_error( e = $! )
    header = error_header( e )
    message = error_message( e )
    emit_error_message( "#{ header } #{ message }" )
  end

  # used to construct an appropriate error message
  # based on the specific type of error and the
  # error's attributes
  def error_message( e = $! )
    case e
    when UnwantedToken
      token_name = token_name( e.expecting )
      "extraneous input #{ token_error_display( e.unexpected_token ) } expecting #{ token_name }"
    when MissingToken
      token_name = token_name( e.expecting )
      "missing #{ token_name } at #{ token_error_display( e.symbol ) }"
    when MismatchedToken
      token_name = token_name( e.expecting )
      "mismatched input #{ token_error_display( e.symbol ) } expecting #{ token_name }"
    when MismatchedTreeNode
      token_name = token_name( e.expecting )
      "mismatched tree node: #{ e.symbol } expecting #{ token_name }"
    when NoViableAlternative
      "no viable alternative at input " << token_error_display( e.symbol )
    when MismatchedSet
      "mismatched input %s expecting set %s" %
        [ token_error_display( e.symbol ), e.expecting.inspect ]
    when MismatchedNotSet
      "mismatched input %s expecting set %s" %
        [ token_error_display( e.symbol ), e.expecting.inspect ]
    when FailedPredicate
      "rule %s failed predicate: { %s }?" % [ e.rule_name, e.predicate_text ]
    else e.message
    end
  end

  #
  # used to add a tag to the error message that indicates
  # the location of the input stream when the error
  # occurred
  #
  def error_header( e = $! )
    e.location
  end

  #
  # formats a token object appropriately for inspection
  # within an error message
  #
  def token_error_display( token )
    unless text = token.text || ( token.source_text rescue nil )
      text =
        case
        when token.type == EOF then '<EOF>'
        when name = ( token_name( token.type ) rescue nil ) then "<#{ name }>"
        when token.respond_to?( :name ) then "<#{ token.name }>"
        else "<#{ token.type }>"
        end
    end
    return text.inspect
  end

  #
  # Write the error report data out to some source. By default,
  # the error message is written to $stderr
  #
  def emit_error_message( message )
    @error_output.puts( message ) if @error_output
  end
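
  # The reporting pipeline is deliberately split so subclasses can override
  # any stage. A minimal sketch (hypothetical subclass) that collects
  # messages instead of printing them:
  #
  #   class QuietParser < ANTLR3::Parser
  #     def emit_error_message( message )
  #       ( @messages ||= [] ) << message
  #     end
  #   end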

  ##############################################################################################
  ###################################### Error Recovery #######################################
  ##############################################################################################

  def recover( error = $! )
    @state.last_error_index == @input.index and @input.consume
    @state.last_error_index = @input.index

    follow_set = compute_error_recovery_set

    resync { consume_until( follow_set ) }
  end

  def resync
    begin_resync
    return( yield )
  ensure
    end_resync
  end

  # overridable hook method that is executed at the start of the
  # resyncing procedure in recover
  #
  # by default, it does nothing
  def begin_resync
    # do nothing
  end

  # overridable hook method that is executed after the resyncing
  # procedure has completed
  #
  # by default, it does nothing
  def end_resync
    # do nothing
  end

  # (The following explanation has been lifted directly from the
  # source code documentation of the ANTLR Java runtime library)
  #
  # Compute the error recovery set for the current rule. During
  # rule invocation, the parser pushes the set of tokens that can
  # follow that rule reference on the stack; this amounts to
  # computing FIRST of what follows the rule reference in the
  # enclosing rule. This local follow set only includes tokens
  # from within the rule; i.e., the FIRST computation done by
  # ANTLR stops at the end of a rule.
  #
  # EXAMPLE
  #
  # When you find a "no viable alt exception", the input is not
  # consistent with any of the alternatives for rule r. The best
  # thing to do is to consume tokens until you see something that
  # can legally follow a call to r *or* any rule that called r.
  # You don't want the exact set of viable next tokens because the
  # input might just be missing a token--you might consume the
  # rest of the input looking for one of the missing tokens.
  #
  # Consider grammar:
  #
  #   a : '[' b ']'
  #     | '(' b ')'
  #     ;
  #   b : c '^' INT ;
  #   c : ID
  #     | INT
  #     ;
  #
  # At each rule invocation, the set of tokens that could follow
  # that rule is pushed on a stack. Here are the various "local"
  # follow sets:
  #
  #   FOLLOW( b1_in_a ) = FIRST( ']' ) = ']'
  #   FOLLOW( b2_in_a ) = FIRST( ')' ) = ')'
  #   FOLLOW( c_in_b )  = FIRST( '^' ) = '^'
  #
  # Upon erroneous input "[]", the call chain is
  #
  #   a -> b -> c
  #
  # and, hence, the follow context stack is:
  #
  #   depth   local follow set   after call to rule
  #     0       <EOF>              a (from main())
  #     1       ']'                b
  #     2       '^'                c
  #
  # Notice that <tt>')'</tt> is not included, because b would have to have
  # been called from a different context in rule a for ')' to be
  # included.
  #
  # For error recovery, we cannot consider FOLLOW(c)
  # (context-sensitive or otherwise). We need the combined set of
  # all context-sensitive FOLLOW sets--the set of all tokens that
  # could follow any reference in the call chain. We need to
  # resync to one of those tokens. Note that FOLLOW(c)='^' and if
  # we resync'd to that token, we'd consume until EOF. We need to
  # sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
  # In this case, for input "[]", LA(1) is in this set so we would
  # not consume anything and after printing an error rule c would
  # return normally. It would not find the required '^' though.
  # At this point, it gets a mismatched token error and throws an
  # exception (since LA(1) is not in the viable following token
  # set). The rule exception handler tries to recover, but finds
  # the same recovery set and doesn't consume anything. Rule b
  # exits normally returning to rule a. Now it finds the ']' (and
  # with the successful match exits errorRecovery mode).
  #
  # So, you can see that the parser walks up the call chain looking
  # for the token that was a member of the recovery set.
  #
  # Errors are not generated in errorRecovery mode.
  #
  # ANTLR's error recovery mechanism is based upon original ideas:
  #
  # "Algorithms + Data Structures = Programs" by Niklaus Wirth
  #
  # and
  #
  # "A note on error recovery in recursive descent parsers":
  # http://portal.acm.org/citation.cfm?id=947902.947905
  #
  # Later, Josef Grosch had some good ideas:
  #
  # "Efficient and Comfortable Error Recovery in Recursive Descent
  # Parsers":
  # ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
  #
  # Like Grosch I implemented local FOLLOW sets that are combined
  # at run-time upon error to avoid overhead during parsing.
  def compute_error_recovery_set
    combine_follows( false )
  end

  #
  # factor out what to do upon token mismatch so
  # tree parsers can behave differently.
  #
  # * override this method in your parser to do things
  #   like bailing out after the first error
  # * just raise the exception instead of
  #   calling the recovery method.
  #
  def recover_from_mismatched_token( type, follow )
    if mismatch_is_unwanted_token?( type )
      err = UnwantedToken( type )
      resync { @input.consume }
      report_error( err )

      return @input.consume
    end

    if mismatch_is_missing_token?( follow )
      inserted = missing_symbol( nil, type, follow )
      report_error( MissingToken( type, inserted ) )
      return inserted
    end

    raise MismatchedToken( type )
  end

  def recover_from_mismatched_set( e, follow )
    if mismatch_is_missing_token?( follow )
      report_error( e )
      return missing_symbol( e, INVALID_TOKEN_TYPE, follow )
    end
    raise e
  end

  def recover_from_mismatched_element( e, follow )
    follow.nil? and return false
    if follow.include?( EOR_TOKEN_TYPE )
      viable_tokens = compute_context_sensitive_rule_follow
      follow = ( follow | viable_tokens ) - Set[ EOR_TOKEN_TYPE ]
    end
    if follow.include?( @input.peek )
      report_error( e )
      return true
    end
    return false
  end

  # Conjure up a missing token during error recovery.
  #
  # The recognizer attempts to recover from single missing
  # symbols. But, actions might refer to that missing symbol.
  # For example, x=ID {f($x);}. The action clearly assumes
  # that there has been an identifier matched previously and that
  # $x points at that token. If that token is missing, but
  # the next token in the stream is what we want we assume that
  # this token is missing and we keep going. Because we
  # have to return some token to replace the missing token,
  # we have to conjure one up. This method gives the user control
  # over the tokens returned for missing tokens. Mostly,
  # you will want to create something special for identifier
  # tokens. For literals such as '{' and ',', the default
  # action in the parser or tree parser works. It simply creates
  # a CommonToken of the appropriate type. The text will be the token.
  # If you change what tokens must be created by the lexer,
  # override this method to create the appropriate tokens.
  def missing_symbol( error, expected_token_type, follow )
    return nil
  end

  def mismatch_is_unwanted_token?( type )
    @input.peek( 2 ) == type
  end

  def mismatch_is_missing_token?( follow )
    follow.nil? and return false
    if follow.include?( EOR_TOKEN_TYPE )
      viable_tokens = compute_context_sensitive_rule_follow
      follow = follow | viable_tokens

      follow.delete( EOR_TOKEN_TYPE ) unless @state.following.empty?
    end
    if follow.include?( @input.peek ) or follow.include?( EOR_TOKEN_TYPE )
      return true
    end
    return false
  end

  def syntax_errors?
    ( error_count = @state.syntax_errors ) > 0 and return( error_count )
  end

  # returns the number of syntax errors the recognizer has encountered
  # so far, as tracked by the shared state
  def number_of_syntax_errors
    @state.syntax_errors
  end

  #
  # Compute the context-sensitive +FOLLOW+ set for the current rule.
  # This is the set of token types that can follow a specific rule
  # reference given a specific call chain. You get the set of
  # viable tokens that can possibly come next (look depth 1)
  # given the current call chain. Contrast this with the
  # definition of plain FOLLOW for rule r:
  #
  #   FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
  #
  # where x in T* and alpha, beta in V*; T is the set of terminals and
  # V is the set of terminals and nonterminals. In other words,
  # FOLLOW(r) is the set of all tokens that can possibly follow
  # references to r in *any* sentential form (context). At
  # runtime, however, we know precisely which context applies as
  # we have the call chain. We may compute the exact (rather
  # than covering superset) set of following tokens.
  #
  # For example, consider grammar:
  #
  #   stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
  #        | "return" expr '.'
  #        ;
  #   expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
  #   atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
  #        | '(' expr ')'
  #        ;
  #
  # The FOLLOW sets are all inclusive whereas context-sensitive
  # FOLLOW sets are precisely what could follow a rule reference.
  # For input "i=(3);", here is the derivation:
  #
  #   stat => ID '=' expr ';'
  #        => ID '=' atom ('+' atom)* ';'
  #        => ID '=' '(' expr ')' ('+' atom)* ';'
  #        => ID '=' '(' atom ')' ('+' atom)* ';'
  #        => ID '=' '(' INT ')' ('+' atom)* ';'
  #        => ID '=' '(' INT ')' ';'
  #
  # At the "3" token, you'd have a call chain of
  #
  #   stat -> expr -> atom -> expr -> atom
  #
  # What can follow that specific nested ref to atom? Exactly ')'
  # as you can see by looking at the derivation of this specific
  # input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
  #
  # You want the exact viable token set when recovering from a
  # token mismatch. Upon token mismatch, if LA(1) is member of
  # the viable next token set, then you know there is most likely
  # a missing token in the input stream. "Insert" one by just not
  # throwing an exception.
  #
  def compute_context_sensitive_rule_follow
    combine_follows true
  end

  def combine_follows( exact )
    follow_set = Set.new
    @state.following.each_with_index.reverse_each do |local_follow_set, index|
      follow_set |= local_follow_set
      if exact
        if local_follow_set.include?( EOR_TOKEN_TYPE )
          follow_set.delete( EOR_TOKEN_TYPE ) if index > 0
        else
          break
        end
      end
    end
    return follow_set
  end
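
  # A hedged sketch of the two modes, reusing the follow stack from the
  # comment above (RBRACK and CARET stand in for hypothetical token types):
  #
  #   @state.following = [ Set[ EOF ], Set[ RBRACK ], Set[ CARET ] ]
  #   combine_follows( false )   # union of every level: { EOF, RBRACK, CARET }
  #   combine_follows( true )    # exact mode: unions levels from the top down,
  #                              # stopping at the first one with no EOR marker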

  #
  # Match needs to return the current input symbol, which gets put
  # into the label for the associated token ref; e.g., x=ID. Token
  # and tree parsers need to return different objects. Rather than test
  # for input stream type or change the IntStream interface, I use
  # a simple method to ask the recognizer to tell me what the current
  # input symbol is.
  #
  # This is ignored for lexers.
  #
  def current_symbol
    @input.look
  end

  #
  # Consume input symbols until one matches a type within types
  #
  # types can be a single symbol type or a set of symbol types
  #
  def consume_until( types )
    types.is_a?( Set ) or types = Set[ *types ]
    type = @input.peek
    until type == EOF or types.include?( type )
      @input.consume
      type = @input.peek
    end
    return( type )
  end

  #
  # Returns true if the recognizer is currently in a decision for which
  # backtracking has been enabled
  #
  def backtracking?
    @state.backtracking > 0
  end

  def backtracking_level
    @state.backtracking
  end

  def backtracking_level=( n )
    @state.backtracking = n
  end

  def backtrack
    @state.backtracking += 1
    start = @input.mark
    success =
      begin yield
      rescue BacktrackingFailed then false
      else true
      end
    return success
  ensure
    @input.rewind( start )
    @state.backtracking -= 1
  end

  def syntactic_predicate?( name )
    backtrack { send name }
  end
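
  # A hedged sketch of how generated code drives these: a syntactic
  # predicate speculatively invokes a rule method (the name here is
  # hypothetical), rewinding the input and restoring the backtracking
  # depth either way:
  #
  #   if syntactic_predicate?( :declaration! )
  #     # the upcoming input is consistent with rule declaration; parse it
  #   end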

  alias backtracking backtracking_level
  alias backtracking= backtracking_level=

  def rule_memoization( rule, start_index )
    @state.rule_memory.fetch( rule ) do
      @state.rule_memory[ rule ] = Hash.new( MEMO_RULE_UNKNOWN )
    end[ start_index ]
  end

  def already_parsed_rule?( rule )
    stop_index = rule_memoization( rule, @input.index )
    case stop_index
    when MEMO_RULE_UNKNOWN then return false
    when MEMO_RULE_FAILED
      raise BacktrackingFailed
    else
      @input.seek( stop_index + 1 )
    end
    return true
  end

  def memoize( rule, start_index, success )
    stop_index = success ? @input.index - 1 : MEMO_RULE_FAILED
    memo = @state.rule_memory[ rule ] and memo[ start_index ] = stop_index
  end
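
  # A hedged sketch of the memoization pattern generated rule methods use
  # when the +memoize+ option is set (the rule name is hypothetical):
  #
  #   def expr!
  #     return if @state.backtracking > 0 and already_parsed_rule?( :expr! )
  #     start_index = @input.index
  #     success = false
  #     # ... match the rule body ...
  #     success = true
  #   ensure
  #     @state.rule_memory and memoize( :expr!, start_index, success )
  #   end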

  def trace_in( rule_name, rule_index, input_symbol )
    @error_output.printf( "--> enter %s on %s", rule_name, input_symbol )
    @state.backtracking > 0 and @error_output.printf(
      " (in backtracking mode: depth = %s)", @state.backtracking
    )
    @error_output.print( "\n" )
  end

  def trace_out( rule_name, rule_index, input_symbol )
    @error_output.printf( "<-- exit %s on %s", rule_name, input_symbol )
    @state.backtracking > 0 and @error_output.printf(
      " (in backtracking mode: depth = %s)", @state.backtracking
    )
    @error_output.print( "\n" )
  end

  private

  def initialize_dfas
    # do nothing
  end
end

# constant alias for compatibility with older versions of the
# runtime library
BaseRecognizer = Recognizer

=begin rdoc ANTLR3::Lexer

= Lexer

Lexer is the default superclass of all lexers generated by ANTLR. The class
tailors the core functionality provided by Recognizer to the task of
matching patterns in the text input and breaking the input into tokens.

== About Lexers

A lexer's job is to take input text and break it up into _tokens_ -- objects
that encapsulate a piece of text, a type label (such as ID or INTEGER), and the
position of the text with respect to the input. Thus, a lexer is essentially a
complicated iterator that steps through an input stream and produces a sequence
of tokens. Sometimes lexers are enough to carry out a goal on their own, such as
tasks like source code highlighting and simple code analysis. Usually, however,
the lexer converts text into tokens for use by a parser, which recognizes larger
structures within the text.

ANTLR parsers have a variety of entry points specified by parser rules, each of
which defines the structure of a specific type of sentence in a grammar. Lexers,
however, are primarily intended to have a single entry point. A lexer looks at the
characters starting at the current input position, decides if the chunk of text
matches one of a number of possible token type definitions, wraps the chunk into
a token with information on its type and location, and advances the input stream
to the next place.

== ANTLR Lexers and the Lexer API

ANTLR-generated lexers will subclass this class, unless specified otherwise
within a grammar file. The generated class will provide an implementation of
each lexer rule as a method of the same name. The subclass will also provide an
implementation for the abstract method #m_tokens, the purpose of which is to
multiplex the token type definitions and predict what rule definition to execute
to fetch a token. The primary method in the lexer API, #next_token, uses
#m_tokens to fetch the next token and drive the iteration.

If the lexer is preparing tokens for use by an ANTLR generated parser, the lexer
will generally be used to build a TokenStream object. The following code example
demonstrates the typical setup for using ANTLR parsers and lexers in Ruby.

  # in HypotheticalLexer.rb
  module Hypothetical
    class Lexer < ANTLR3::Lexer
      # ...
      # ANTLR generated code
      # ...
    end
  end

  # in HypotheticalParser.rb
  module Hypothetical
    class Parser < ANTLR3::Parser
      # ...
      # more ANTLR generated code
      # ...
    end
  end

  # to take hypothetical source code and prepare it for parsing,
  # there is generally a four-step construction process

  source = "some hypothetical source code"
  input  = ANTLR3::StringStream.new( source, :file => 'blah-de-blah.hyp' )
  lexer  = Hypothetical::Lexer.new( input )
  tokens = ANTLR3::CommonTokenStream.new( lexer )
  parser = Hypothetical::Parser.new( tokens )

  # if you're using the standard streams, ANTLR3::StringStream and
  # ANTLR3::CommonTokenStream, you can write the same process
  # shown above more succinctly:

  lexer  = Hypothetical::Lexer.new( "some hypothetical source code", :file => 'blah-de-blah.hyp' )
  parser = Hypothetical::Parser.new( lexer )

=end

class Lexer < Recognizer
  include TokenSource
  @token_class = CommonToken

  def self.default_rule
    @default_rule ||= :token!
  end

  def self.main( argv = ARGV, options = {} )
    if argv.is_a?( ::Hash ) then argv, options = ARGV, argv end
    main = ANTLR3::Main::LexerMain.new( self, options )
    block_given? ? yield( main ) : main.execute( argv )
  end

  def self.associated_parser
    @associated_parser ||= begin
      @grammar_home and @grammar_home::Parser
    rescue NameError
      grammar_name = @grammar_home.name.split( "::" ).last
      begin
        require "#{ grammar_name }Parser"
        @grammar_home::Parser
      rescue LoadError, NameError
      end
    end
  end

  def initialize( input, options = {} )
    super( options )
    @input = cast_input( input, options )
  end

  def current_symbol
    nil
  end

  def next_token
    loop do
      @state.token = nil
      @state.channel = DEFAULT_CHANNEL
      @state.token_start_position = @input.index
      @state.token_start_column = @input.column
      @state.token_start_line = @input.line
      @state.text = nil
      @input.peek == EOF and return EOF_TOKEN
      begin
        token!

        case token = @state.token
        when nil then return( emit )
        when SKIP_TOKEN then next
        else
          return token
        end
      rescue NoViableAlternative => re
        report_error( re )
        recover( re )
      rescue Error::RecognitionError => re
        report_error( re )
      end
    end
  end
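
  # Lexer includes TokenSource, which layers Enumerable-style iteration on
  # top of #next_token, so a lexer can be driven directly; the grammar
  # module here is hypothetical:
  #
  #   lexer = Hypothetical::Lexer.new( "x = 1" )
  #   lexer.each { |token| puts token.inspect }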

  def skip
    @state.token = SKIP_TOKEN
  end

  abstract :token!

  def exhaust
    self.to_a
  end

  def char_stream=( input )
    @input = nil
    reset
    @input = input
  end

  def source_name
    @input.source_name
  end

  def emit( token = @state.token )
    token ||= create_token
    @state.token = token
    return token
  end

  def match( expected )
    case expected
    when String
      expected.each_byte do |char|
        unless @input.peek == char
          @state.backtracking > 0 and raise BacktrackingFailed
          error = MismatchedToken( char )
          recover( error )
          raise error
        end
        @input.consume
      end
    else # single integer character
      unless @input.peek == expected
        @state.backtracking > 0 and raise BacktrackingFailed
        error = MismatchedToken( expected )
        recover( error )
        raise error
      end
      @input.consume
    end
    return true
  end

  def match_any
    @input.consume
  end

  def match_range( min, max )
    char = @input.peek
    if char.between?( min, max ) then @input.consume
    else
      @state.backtracking > 0 and raise BacktrackingFailed
      error = MismatchedRange( min.chr, max.chr )
      recover( error )
      raise( error )
    end
    return true
  end

  def line
    @input.line
  end

  def column
    @input.column
  end

  def character_index
    @input.index
  end

  def text
    @state.text and return @state.text
    @input.substring( @state.token_start_position, character_index - 1 )
  end

  def text=( text )
    @state.text = text
  end

  def report_error( e )
    display_recognition_error( e )
  end

  def error_message( e )
    char = character_error_display( e.symbol ) rescue nil
    case e
    when Error::MismatchedToken
      expecting = character_error_display( e.expecting )
      "mismatched character #{ char }; expecting #{ expecting }"
    when Error::NoViableAlternative
      "no viable alternative at character #{ char }"
    when Error::EarlyExit
      "required ( ... )+ loop did not match anything at character #{ char }"
    when Error::MismatchedNotSet
      "mismatched character %s; expecting set %p" % [ char, e.expecting ]
    when Error::MismatchedSet
      "mismatched character %s; expecting set %p" % [ char, e.expecting ]
    when Error::MismatchedRange
      a = character_error_display( e.min )
      b = character_error_display( e.max )
      "mismatched character %s; expecting set %s..%s" % [ char, a, b ]
    else super
    end
  end

  def character_error_display( char )
    case char
    when EOF then '<EOF>'
    when Integer then char.chr.inspect
    else char.inspect
    end
  end

  def recover( re )
    @input.consume
  end

  alias input= char_stream=

  private

  def cast_input( input, options )
    case input
    when CharacterStream then input
    when ::String then StringStream.new( input, options )
    when ::IO, ARGF then FileStream.new( input, options )
    else input
    end
  end

  def trace_in( rule_name, rule_index )
    if symbol = @input.look and symbol != EOF then symbol = symbol.inspect
    else symbol = '<EOF>' end
    input_symbol = "#{ symbol } @ line #{ line } / col #{ column }"
    super( rule_name, rule_index, input_symbol )
  end

  def trace_out( rule_name, rule_index )
    if symbol = @input.look and symbol != EOF then symbol = symbol.inspect
    else symbol = '<EOF>' end
    input_symbol = "#{ symbol } @ line #{ line } / col #{ column }"
    super( rule_name, rule_index, input_symbol )
  end

  def create_token( &b )
    if block_given? then super( &b )
    else
      super do |t|
        t.input = @input
        t.type = @state.type
        t.channel = @state.channel
        t.start = @state.token_start_position
        t.stop = @input.index - 1
        t.line = @state.token_start_line
        t.text = self.text
        t.column = @state.token_start_column
      end
    end
  end
end

=begin rdoc ANTLR3::Parser

= Parser

Parser is the default base class of ANTLR-generated parser classes. The class
tailors the functionality provided by Recognizer to the task of parsing.

== About Parsing

This is just a loose overview of parsing. For considerably more in-depth coverage
of the topic, read the ANTLR documentation or check out the ANTLR website
(http://www.antlr.org).

A grammar defines the vocabulary and the sentence structure of a language. While
a lexer concerns the basic vocabulary symbols of the language, a parser's
primary task is to implement the sentence structure.

Parsers are set up by providing a stream of tokens, which is usually created by
a corresponding lexer. Then, the user requests a specific sentence-structure
within the grammar, such as "class_definition" or "xml_node", from the parser.
It iterates through the tokens, verifying the syntax of the sentence and
performing actions specified by the grammar. It stops when it encounters an
error or when it has matched the full sentence according to its defined
structure.

== ANTLR Parsers and the Parser API

Plain ANTLR-generated parsers directly subclass this class, unless specified
otherwise within the grammar options. The generated code will provide a method
for each parser rule defined in the ANTLR grammar, as well as any other
customized member attributes and methods specified in the source grammar.

This class does not override much of the functionality in Recognizer, and
thus the API closely mirrors Recognizer.

=end

class Parser < Recognizer
  def self.main( argv = ARGV, options = {} )
    if argv.is_a?( ::Hash ) then argv, options = ARGV, argv end
    main = ANTLR3::Main::ParserMain.new( self, options )
    block_given? ? yield( main ) : main.execute( argv )
  end

  def self.associated_lexer
    @associated_lexer ||= begin
      @grammar_home and @grammar_home::Lexer
    rescue NameError
      grammar_name = @grammar_home.name.split( "::" ).last
      begin
        require "#{ grammar_name }Lexer"
        @grammar_home::Lexer
      rescue LoadError, NameError
      end
    end
  end

  def initialize( input, options = {} )
    super( options )
    @input = nil
    reset
    @input = cast_input( input, options )
  end

  def missing_symbol( error, expected_type, follow )
    current = @input.look
    current = @input.look( -1 ) if current == ANTLR3::EOF_TOKEN
    t =
      case
      when current && current != ANTLR3::EOF_TOKEN then current.clone
      when @input.token_class then @input.token_class.new
      else ( create_token rescue CommonToken.new )
      end

    t.type = expected_type
    name = t.name.gsub( /(^<)|(>$)/, '' )
    t.text = "<missing #{ name }>"
    t.channel = DEFAULT_CHANNEL
    return( t )
  end

  def token_stream=( input )
    @input = nil
    reset
    @input = input
  end
  alias token_stream input

  def source_name
    @input.source_name
  end

  private

  def trace_in( rule_name, rule_index )
    super( rule_name, rule_index, @input.look.inspect )
  end

  def trace_out( rule_name, rule_index )
    super( rule_name, rule_index, @input.look.inspect )
  end

  def cast_input( input, options )
    case input
    when TokenStream then input
    when TokenSource then CommonTokenStream.new( input, options )
    when IO, String, CharacterStream
      if lexer_class = self.class.associated_lexer
        CommonTokenStream.new( lexer_class.new( input, options ), options )
      else
        raise ArgumentError, Util.tidy( <<-END, true )
        | unable to automatically convert input #{ input.inspect }
        | to an ANTLR3::TokenStream object as #{ self.class }
        | does not appear to have an associated lexer class
        END
      end
    else
      # assume it's a stream if it at least implements peek and consume
      unless input.respond_to?( :peek ) and input.respond_to?( :consume )
        raise ArgumentError, Util.tidy( <<-END, true )
        | #{ self.class } requires a token stream as input, but
        | #{ input.inspect } was provided
        END
      end
      input
    end
  end
end

end # module ANTLR3