326 lines
9.1 KiB
Perl
326 lines
9.1 KiB
Perl
package ANTLR::Runtime::Lexer;
|
|
|
|
use English qw( -no_match_vars );
|
|
use Readonly;
|
|
use Carp;
|
|
use Switch;
|
|
|
|
use ANTLR::Runtime::Token;
|
|
use ANTLR::Runtime::CommonToken;
|
|
use ANTLR::Runtime::CharStream;
|
|
use ANTLR::Runtime::MismatchedTokenException;
|
|
|
|
use Moose;
|
|
|
|
extends 'ANTLR::Runtime::BaseRecognizer';
|
|
with 'ANTLR::Runtime::TokenSource';
|
|
|
|
has 'input' => (
|
|
is => 'rw',
|
|
does => 'ANTLR::Runtime::CharStream',
|
|
);
|
|
|
|
sub reset {
|
|
my ($self) = @_;
|
|
|
|
# reset all recognizer state variables
|
|
$self->SUPER::reset();
|
|
|
|
# wack Lexer state variables
|
|
if (defined $self->input) {
|
|
# rewind the input
|
|
$self->input->seek(0);
|
|
}
|
|
|
|
if (defined $self->state) {
|
|
$self->state->token(undef);
|
|
$self->state->type(ANTLR::Runtime::Token->INVALID_TOKEN_TYPE);
|
|
$self->state->channel(ANTLR::Runtime::Token->DEFAULT_CHANNEL);
|
|
$self->state->token_start_char_index(-1);
|
|
$self->state->token_start_char_position_in_line(-1);
|
|
$self->state->start_line(-1);
|
|
$self->state->text(undef);
|
|
}
|
|
}
|
|
|
|
# Return a token from this source; i.e., match a token on the char
|
|
# stream.
|
|
sub next_token {
|
|
my ($self) = @_;
|
|
|
|
while (1) {
|
|
$self->state->token(undef);
|
|
$self->state->channel(ANTLR::Runtime::Token->DEFAULT_CHANNEL);
|
|
$self->state->token_start_char_index($self->input->index());
|
|
$self->state->token_start_char_position_in_line($self->input->get_char_position_in_line());
|
|
$self->state->token_start_line($self->input->get_line());
|
|
$self->state->text(undef);
|
|
|
|
if ($self->input->LA(1) eq ANTLR::Runtime::CharStream->EOF) {
|
|
return ANTLR::Runtime::Token->EOF_TOKEN;
|
|
}
|
|
|
|
my $rv;
|
|
my $op = '';
|
|
eval {
|
|
$self->m_tokens();
|
|
if (!defined $self->state->token) {
|
|
$self->emit();
|
|
}
|
|
elsif ($self->state->token == ANTLR::Runtime::Token->SKIP_TOKEN) {
|
|
$op = 'next';
|
|
return;
|
|
}
|
|
$op = 'return';
|
|
$rv = $self->state->token;
|
|
};
|
|
return $rv if $op eq 'return';
|
|
next if $op eq 'next';
|
|
|
|
if ($EVAL_ERROR) {
|
|
my $exception = $EVAL_ERROR;
|
|
if ($exception->isa('ANTLR::Runtime::RecognitionException')) {
|
|
$self->report_error($exception);
|
|
$self->recover($exception);
|
|
} else {
|
|
croak $exception;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
# Instruct the lexer to skip creating a token for current lexer rule
|
|
# and look for another token. nextToken() knows to keep looking when
|
|
# a lexer rule finishes with token set to SKIP_TOKEN. Recall that
|
|
# if token==null at end of any token rule, it creates one for you
|
|
# and emits it.
|
|
sub skip {
|
|
my ($self) = @_;
|
|
|
|
$self->state->token(ANTLR::Runtime::Token->SKIP_TOKEN);
|
|
return;
|
|
}
|
|
|
|
# This is the lexer entry point that sets instance var 'token'
|
|
sub m_tokens {
|
|
croak "Unimplemented";
|
|
}
|
|
|
|
# Set the char stream and reset the lexer
|
|
sub set_char_stream {
|
|
my ($self, $input) = @_;
|
|
|
|
$self->input(undef);
|
|
$self->reset();
|
|
$self->input($input);
|
|
}
|
|
|
|
sub get_char_stream {
|
|
my ($self) = @_;
|
|
return $self->input;
|
|
}
|
|
|
|
sub get_source_name {
|
|
my ($self) = @_;
|
|
return $self->input->get_source_name();
|
|
}
|
|
|
|
sub emit {
|
|
if (@_ == 1) {
|
|
my ($self) = @_;
|
|
# The standard method called to automatically emit a token at the
|
|
# outermost lexical rule. The token object should point into the
|
|
# char buffer start..stop. If there is a text override in 'text',
|
|
# use that to set the token's text. Override this method to emit
|
|
# custom Token objects.
|
|
my $t = ANTLR::Runtime::CommonToken->new({
|
|
input => $self->input,
|
|
type => $self->state->type,
|
|
channel => $self->state->channel,
|
|
start => $self->state->token_start_char_index,
|
|
stop => $self->get_char_index() - 1
|
|
});
|
|
|
|
$t->set_line($self->state->token_start_line);
|
|
$t->set_text($self->state->text);
|
|
$t->set_char_position_in_line($self->state->token_start_char_position_in_line);
|
|
$self->emit($t);
|
|
return $t;
|
|
} elsif (@_ == 2) {
|
|
my ($self, $token) = @_;
|
|
# Currently does not support multiple emits per nextToken invocation
|
|
# for efficiency reasons. Subclass and override this method and
|
|
# nextToken (to push tokens into a list and pull from that list rather
|
|
# than a single variable as this implementation does).
|
|
$self->state->token($token);
|
|
}
|
|
}
|
|
|
|
sub match {
|
|
my ($self, $s) = @_;
|
|
|
|
foreach my $c (split //, $s) {
|
|
if ($self->input->LA(1) ne $c) {
|
|
if ($self->state->backtracking > 0) {
|
|
$self->state->failed(1);
|
|
return;
|
|
}
|
|
my $mte = ANTLR::Runtime::MismatchedTokenException->new({
|
|
expecting => $c,
|
|
input => $self->input
|
|
});
|
|
$self->recover($mte);
|
|
croak $mte;
|
|
}
|
|
$self->input->consume();
|
|
$self->state->failed(0);
|
|
}
|
|
}
|
|
|
|
sub match_any {
|
|
my ($self) = @_;
|
|
|
|
$self->input->consume();
|
|
}
|
|
|
|
sub match_range {
|
|
my ($self, $a, $b) = @_;
|
|
|
|
if ($self->input->LA(1) lt $a || $self->input->LA(1) gt $b) {
|
|
if ($self->state->backtracking > 0) {
|
|
$self->state->failed(1);
|
|
return;
|
|
}
|
|
|
|
my $mre = ANTLR::Runtime::MismatchedRangeException($a, $b, $self->input);
|
|
$self->recover($mre);
|
|
croak $mre;
|
|
}
|
|
|
|
$self->input->consume();
|
|
$self->state->failed(0);
|
|
}
|
|
|
|
sub get_line {
|
|
my ($self) = @_;
|
|
|
|
return $self->input->get_line();
|
|
}
|
|
|
|
sub get_char_position_in_line {
|
|
my ($self) = @_;
|
|
|
|
return $self->input->get_char_position_in_line();
|
|
}
|
|
|
|
# What is the index of the current character of lookahead?
|
|
sub get_char_index {
|
|
my ($self) = @_;
|
|
|
|
return $self->input->index();
|
|
}
|
|
|
|
# Return the text matched so far for the current token or any
|
|
# text override.
|
|
sub get_text {
|
|
my ($self) = @_;
|
|
|
|
if (defined $self->state->text) {
|
|
return $self->state->text;
|
|
}
|
|
return $self->input->substring($self->state->token_start_char_index, $self->get_char_index() - 1);
|
|
}
|
|
|
|
# Set the complete text of this token; it wipes any previous
|
|
# changes to the text.
|
|
sub set_text {
|
|
my ($self, $text) = @_;
|
|
|
|
$self->state->text($text);
|
|
}
|
|
|
|
sub report_error {
|
|
Readonly my $usage => 'void report_error(RecognitionException e)';
|
|
croak $usage if @_ != 2;
|
|
my ($self, $e) = @_;
|
|
|
|
$self->display_recognition_error($self->get_token_names(), $e);
|
|
}
|
|
|
|
sub get_error_message {
|
|
my ($self, $e, $token_names) = @_;
|
|
|
|
my $msg;
|
|
if ($e->isa('ANTLR::Runtime::MismatchedTokenException')) {
|
|
$msg = 'mismatched character '
|
|
. $self->get_char_error_display($e->get_c())
|
|
. ' expecting '
|
|
. $self->get_char_error_display($e->expecting);
|
|
} elsif ($e->isa('ANTLR::Runtime::NoViableAltException')) {
|
|
$msg = 'no viable alternative at character ' . $self->get_char_error_display($e->get_c());
|
|
} elsif ($e->isa('ANTLR::Runtime::EarlyExitException')) {
|
|
$msg = 'required (...)+ loop did not match anything at character '
|
|
. $self->get_char_error_display($e->get_c());
|
|
} elsif ($e->isa('ANTLR::Runtime::MismatchedSetException')) {
|
|
$msg = 'mismatched character ' . $self->get_char_error_display($e->get_c())
|
|
. ' expecting set ' . $e->expecting;
|
|
} elsif ($e->isa('ANTLR::Runtime::MismatchedNotSetException')) {
|
|
$msg = 'mismatched character ' . $self->get_char_error_display($e->get_c())
|
|
. ' expecting set ' . $e->expecting;
|
|
} elsif ($e->isa('ANTLR::Runtime::MismatchedRangeException')) {
|
|
$msg = 'mismatched character ' . $self->get_char_error_display($e->get_c())
|
|
. ' expecting set ' . $self->get_char_error_display($e->a)
|
|
. '..' . $self->get_char_error_display($e->b);
|
|
} else {
|
|
$msg = $self->SUPER::get_error_message($e, $token_names);
|
|
}
|
|
return $msg;
|
|
}
|
|
|
|
sub get_char_error_display {
|
|
my ($self, $c) = @_;
|
|
|
|
my $s;
|
|
if ($c eq ANTLR::Runtime::Token->EOF) {
|
|
$s = '<EOF>';
|
|
} elsif ($c eq "\n") {
|
|
$s = '\n';
|
|
} elsif ($c eq "\t") {
|
|
$s = '\t';
|
|
} elsif ($c eq "\r") {
|
|
$s = '\r';
|
|
} else {
|
|
$s = $c;
|
|
}
|
|
|
|
return "'$s'";
|
|
}
|
|
|
|
# Lexers can normally match any char in it's vocabulary after matching
|
|
# a token, so do the easy thing and just kill a character and hope
|
|
# it all works out. You can instead use the rule invocation stack
|
|
# to do sophisticated error recovery if you are in a fragment rule.
|
|
sub recover {
|
|
my ($self, $re) = @_;
|
|
|
|
$self->input->consume();
|
|
}
|
|
|
|
sub trace_in {
|
|
my ($self, $rule_name, $rule_index) = @_;
|
|
|
|
my $input_symbol = $self->input->LT(1) . ' line=' . $self->get_line() . ':' . $self->get_char_position_in_line();
|
|
$self->SUPER::trace_in($rule_name, $rule_index, $input_symbol);
|
|
}
|
|
|
|
sub trace_out {
|
|
my ($self, $rule_name, $rule_index) = @_;
|
|
|
|
my $input_symbol = $self->input->LT(1) . ' line=' . $self->get_line() . ':' . $self->get_char_position_in_line();
|
|
$self->SUPER::trace_out($rule_name, $rule_index, $input_symbol);
|
|
}
|
|
|
|
no Moose;
|
|
__PACKAGE__->meta->make_immutable();
|
|
1;
|