// Listing metadata: 433 lines, 12 KiB, Rust.
use std::char;
use std::cmp::Ordering;
use std::fmt;
use std::ops;
use std::u32;

use crate::literal::LiteralSearcher;
use crate::prog::InstEmptyLook;
use crate::utf8::{decode_last_utf8, decode_utf8};

/// Represents a location in the input.
|
|
#[derive(Clone, Copy, Debug)]
|
|
pub struct InputAt {
|
|
pos: usize,
|
|
c: Char,
|
|
byte: Option<u8>,
|
|
len: usize,
|
|
}
|
|
|
|
impl InputAt {
|
|
/// Returns true iff this position is at the beginning of the input.
|
|
pub fn is_start(&self) -> bool {
|
|
self.pos == 0
|
|
}
|
|
|
|
/// Returns true iff this position is past the end of the input.
|
|
pub fn is_end(&self) -> bool {
|
|
self.c.is_none() && self.byte.is_none()
|
|
}
|
|
|
|
/// Returns the character at this position.
|
|
///
|
|
/// If this position is just before or after the input, then an absent
|
|
/// character is returned.
|
|
pub fn char(&self) -> Char {
|
|
self.c
|
|
}
|
|
|
|
/// Returns the byte at this position.
|
|
pub fn byte(&self) -> Option<u8> {
|
|
self.byte
|
|
}
|
|
|
|
/// Returns the UTF-8 width of the character at this position.
|
|
pub fn len(&self) -> usize {
|
|
self.len
|
|
}
|
|
|
|
/// Returns whether the UTF-8 width of the character at this position
|
|
/// is zero.
|
|
pub fn is_empty(&self) -> bool {
|
|
self.len == 0
|
|
}
|
|
|
|
/// Returns the byte offset of this position.
|
|
pub fn pos(&self) -> usize {
|
|
self.pos
|
|
}
|
|
|
|
/// Returns the byte offset of the next position in the input.
|
|
pub fn next_pos(&self) -> usize {
|
|
self.pos + self.len
|
|
}
|
|
}
|
|
|
|
/// An abstraction over input used in the matching engines.
|
|
pub trait Input: fmt::Debug {
|
|
/// Return an encoding of the position at byte offset `i`.
|
|
fn at(&self, i: usize) -> InputAt;
|
|
|
|
/// Return the Unicode character occurring next to `at`.
|
|
///
|
|
/// If no such character could be decoded, then `Char` is absent.
|
|
fn next_char(&self, at: InputAt) -> Char;
|
|
|
|
/// Return the Unicode character occurring previous to `at`.
|
|
///
|
|
/// If no such character could be decoded, then `Char` is absent.
|
|
fn previous_char(&self, at: InputAt) -> Char;
|
|
|
|
/// Return true if the given empty width instruction matches at the
|
|
/// input position given.
|
|
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
|
|
|
|
/// Scan the input for a matching prefix.
|
|
fn prefix_at(
|
|
&self,
|
|
prefixes: &LiteralSearcher,
|
|
at: InputAt,
|
|
) -> Option<InputAt>;
|
|
|
|
/// The number of bytes in the input.
|
|
fn len(&self) -> usize;
|
|
|
|
/// Whether the input is empty.
|
|
fn is_empty(&self) -> bool {
|
|
self.len() == 0
|
|
}
|
|
|
|
/// Return the given input as a sequence of bytes.
|
|
fn as_bytes(&self) -> &[u8];
|
|
}
|
|
|
|
impl<'a, T: Input> Input for &'a T {
|
|
fn at(&self, i: usize) -> InputAt {
|
|
(**self).at(i)
|
|
}
|
|
|
|
fn next_char(&self, at: InputAt) -> Char {
|
|
(**self).next_char(at)
|
|
}
|
|
|
|
fn previous_char(&self, at: InputAt) -> Char {
|
|
(**self).previous_char(at)
|
|
}
|
|
|
|
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
|
|
(**self).is_empty_match(at, empty)
|
|
}
|
|
|
|
fn prefix_at(
|
|
&self,
|
|
prefixes: &LiteralSearcher,
|
|
at: InputAt,
|
|
) -> Option<InputAt> {
|
|
(**self).prefix_at(prefixes, at)
|
|
}
|
|
|
|
fn len(&self) -> usize {
|
|
(**self).len()
|
|
}
|
|
|
|
fn as_bytes(&self) -> &[u8] {
|
|
(**self).as_bytes()
|
|
}
|
|
}
|
|
|
|
/// An input reader over characters.
///
/// This is a thin wrapper around the borrowed bytes of the search text. It
/// dereferences to that byte slice, so slice indexing and slice methods are
/// available directly on a `CharInput`.
#[derive(Clone, Copy, Debug)]
pub struct CharInput<'t>(&'t [u8]);

impl<'t> CharInput<'t> {
    /// Return a new character input reader for the given string.
    pub fn new(s: &'t [u8]) -> CharInput<'t> {
        CharInput(s)
    }
}

impl<'t> ops::Deref for CharInput<'t> {
    type Target = [u8];

    /// Borrow the wrapped bytes.
    fn deref(&self) -> &[u8] {
        self.0
    }
}

impl<'t> Input for CharInput<'t> {
|
|
fn at(&self, i: usize) -> InputAt {
|
|
if i >= self.len() {
|
|
InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
|
|
} else {
|
|
let c = decode_utf8(&self[i..]).map(|(c, _)| c).into();
|
|
InputAt { pos: i, c: c, byte: None, len: c.len_utf8() }
|
|
}
|
|
}
|
|
|
|
fn next_char(&self, at: InputAt) -> Char {
|
|
at.char()
|
|
}
|
|
|
|
fn previous_char(&self, at: InputAt) -> Char {
|
|
decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
|
|
}
|
|
|
|
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
|
|
use crate::prog::EmptyLook::*;
|
|
match empty.look {
|
|
StartLine => {
|
|
let c = self.previous_char(at);
|
|
at.pos() == 0 || c == '\n'
|
|
}
|
|
EndLine => {
|
|
let c = self.next_char(at);
|
|
at.pos() == self.len() || c == '\n'
|
|
}
|
|
StartText => at.pos() == 0,
|
|
EndText => at.pos() == self.len(),
|
|
WordBoundary => {
|
|
let (c1, c2) = (self.previous_char(at), self.next_char(at));
|
|
c1.is_word_char() != c2.is_word_char()
|
|
}
|
|
NotWordBoundary => {
|
|
let (c1, c2) = (self.previous_char(at), self.next_char(at));
|
|
c1.is_word_char() == c2.is_word_char()
|
|
}
|
|
WordBoundaryAscii => {
|
|
let (c1, c2) = (self.previous_char(at), self.next_char(at));
|
|
c1.is_word_byte() != c2.is_word_byte()
|
|
}
|
|
NotWordBoundaryAscii => {
|
|
let (c1, c2) = (self.previous_char(at), self.next_char(at));
|
|
c1.is_word_byte() == c2.is_word_byte()
|
|
}
|
|
}
|
|
}
|
|
|
|
fn prefix_at(
|
|
&self,
|
|
prefixes: &LiteralSearcher,
|
|
at: InputAt,
|
|
) -> Option<InputAt> {
|
|
prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
|
|
}
|
|
|
|
fn len(&self) -> usize {
|
|
self.0.len()
|
|
}
|
|
|
|
fn as_bytes(&self) -> &[u8] {
|
|
self.0
|
|
}
|
|
}
|
|
|
|
/// An input reader over bytes.
///
/// Dereferences to the wrapped byte slice, so slice indexing and slice
/// methods are available directly on a `ByteInput`.
#[derive(Clone, Copy, Debug)]
pub struct ByteInput<'t> {
    /// The bytes being searched.
    text: &'t [u8],
    /// When true, ASCII word-boundary assertions never match next to bytes
    /// that cannot be decoded as UTF-8.
    only_utf8: bool,
}

impl<'t> ByteInput<'t> {
    /// Return a new byte-based input reader for the given string.
    pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
        ByteInput { text, only_utf8 }
    }
}

impl<'t> ops::Deref for ByteInput<'t> {
    type Target = [u8];

    /// Borrow the wrapped bytes.
    fn deref(&self) -> &[u8] {
        self.text
    }
}

impl<'t> Input for ByteInput<'t> {
|
|
fn at(&self, i: usize) -> InputAt {
|
|
if i >= self.len() {
|
|
InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
|
|
} else {
|
|
InputAt {
|
|
pos: i,
|
|
c: None.into(),
|
|
byte: self.get(i).cloned(),
|
|
len: 1,
|
|
}
|
|
}
|
|
}
|
|
|
|
fn next_char(&self, at: InputAt) -> Char {
|
|
decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into()
|
|
}
|
|
|
|
fn previous_char(&self, at: InputAt) -> Char {
|
|
decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
|
|
}
|
|
|
|
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
|
|
use crate::prog::EmptyLook::*;
|
|
match empty.look {
|
|
StartLine => {
|
|
let c = self.previous_char(at);
|
|
at.pos() == 0 || c == '\n'
|
|
}
|
|
EndLine => {
|
|
let c = self.next_char(at);
|
|
at.pos() == self.len() || c == '\n'
|
|
}
|
|
StartText => at.pos() == 0,
|
|
EndText => at.pos() == self.len(),
|
|
WordBoundary => {
|
|
let (c1, c2) = (self.previous_char(at), self.next_char(at));
|
|
c1.is_word_char() != c2.is_word_char()
|
|
}
|
|
NotWordBoundary => {
|
|
let (c1, c2) = (self.previous_char(at), self.next_char(at));
|
|
c1.is_word_char() == c2.is_word_char()
|
|
}
|
|
WordBoundaryAscii => {
|
|
let (c1, c2) = (self.previous_char(at), self.next_char(at));
|
|
if self.only_utf8 {
|
|
// If we must match UTF-8, then we can't match word
|
|
// boundaries at invalid UTF-8.
|
|
if c1.is_none() && !at.is_start() {
|
|
return false;
|
|
}
|
|
if c2.is_none() && !at.is_end() {
|
|
return false;
|
|
}
|
|
}
|
|
c1.is_word_byte() != c2.is_word_byte()
|
|
}
|
|
NotWordBoundaryAscii => {
|
|
let (c1, c2) = (self.previous_char(at), self.next_char(at));
|
|
if self.only_utf8 {
|
|
// If we must match UTF-8, then we can't match word
|
|
// boundaries at invalid UTF-8.
|
|
if c1.is_none() && !at.is_start() {
|
|
return false;
|
|
}
|
|
if c2.is_none() && !at.is_end() {
|
|
return false;
|
|
}
|
|
}
|
|
c1.is_word_byte() == c2.is_word_byte()
|
|
}
|
|
}
|
|
}
|
|
|
|
fn prefix_at(
|
|
&self,
|
|
prefixes: &LiteralSearcher,
|
|
at: InputAt,
|
|
) -> Option<InputAt> {
|
|
prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
|
|
}
|
|
|
|
fn len(&self) -> usize {
|
|
self.text.len()
|
|
}
|
|
|
|
fn as_bytes(&self) -> &[u8] {
|
|
self.text
|
|
}
|
|
}
|
|
|
|
/// An inline representation of `Option<char>`.
///
/// This eliminates the need to do case analysis on `Option<char>` to determine
/// ordinality with other characters.
///
/// (The `Option<char>` is not related to encoding. Instead, it is used in the
/// matching engines to represent the beginning and ending boundaries of the
/// search text.)
#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct Char(u32);

impl fmt::Debug for Char {
    /// Formats like the wrapped `char`, or as `Empty` when the stored value
    /// is not a valid Unicode scalar value (which includes the absent
    /// marker).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match char::from_u32(self.0) {
            None => f.write_str("Empty"),
            Some(c) => write!(f, "{:?}", c),
        }
    }
}

impl Char {
|
|
/// Returns true iff the character is absent.
|
|
#[inline]
|
|
pub fn is_none(self) -> bool {
|
|
self.0 == u32::MAX
|
|
}
|
|
|
|
/// Returns the length of the character's UTF-8 encoding.
|
|
///
|
|
/// If the character is absent, then `1` is returned.
|
|
#[inline]
|
|
pub fn len_utf8(self) -> usize {
|
|
char::from_u32(self.0).map_or(1, |c| c.len_utf8())
|
|
}
|
|
|
|
/// Returns true iff the character is a word character.
|
|
///
|
|
/// If the character is absent, then false is returned.
|
|
pub fn is_word_char(self) -> bool {
|
|
// is_word_character can panic if the Unicode data for \w isn't
|
|
// available. However, our compiler ensures that if a Unicode word
|
|
// boundary is used, then the data must also be available. If it isn't,
|
|
// then the compiler returns an error.
|
|
char::from_u32(self.0).map_or(false, regex_syntax::is_word_character)
|
|
}
|
|
|
|
/// Returns true iff the byte is a word byte.
|
|
///
|
|
/// If the byte is absent, then false is returned.
|
|
pub fn is_word_byte(self) -> bool {
|
|
match char::from_u32(self.0) {
|
|
Some(c) if c <= '\u{7F}' => regex_syntax::is_word_byte(c as u8),
|
|
None | Some(_) => false,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl From<char> for Char {
|
|
fn from(c: char) -> Char {
|
|
Char(c as u32)
|
|
}
|
|
}
|
|
|
|
impl From<Option<char>> for Char {
|
|
fn from(c: Option<char>) -> Char {
|
|
c.map_or(Char(u32::MAX), |c| c.into())
|
|
}
|
|
}
|
|
|
|
impl PartialEq<char> for Char {
|
|
#[inline]
|
|
fn eq(&self, other: &char) -> bool {
|
|
self.0 == *other as u32
|
|
}
|
|
}
|
|
|
|
impl PartialEq<Char> for char {
|
|
#[inline]
|
|
fn eq(&self, other: &Char) -> bool {
|
|
*self as u32 == other.0
|
|
}
|
|
}
|
|
|
|
impl PartialOrd<char> for Char {
|
|
#[inline]
|
|
fn partial_cmp(&self, other: &char) -> Option<Ordering> {
|
|
self.0.partial_cmp(&(*other as u32))
|
|
}
|
|
}
|
|
|
|
impl PartialOrd<Char> for char {
|
|
#[inline]
|
|
fn partial_cmp(&self, other: &Char) -> Option<Ordering> {
|
|
(*self as u32).partial_cmp(&other.0)
|
|
}
|
|
}
|