506 lines
		
	
	
		
			24 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
			
		
		
	
	
			506 lines
		
	
	
		
			24 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
| # Copyright (C) 2016 and later: Unicode, Inc. and others.
 | |
| # License & terms of use: http://www.unicode.org/copyright.html
 | |
| #*****************************************************************************
 | |
| #
 | |
| #   Copyright (C) 2002-2015, International Business Machines Corporation and others.
 | |
| #   All Rights Reserved.
 | |
| #
 | |
| #*****************************************************************************
 | |
| #
 | |
| #  file:  regexcst.txt
 | |
| #  ICU Regular Expression Parser State Table
 | |
| #
 | |
| #     This state table is used when reading and parsing a regular expression pattern
 | |
| #     The pattern parser uses a state machine; the data in this file define the
 | |
| #     state transitions that occur for each input character.
 | |
| #
 | |
| #     *** This file defines the regex pattern grammar.   This is it.
 | |
| #     *** The determination of what is accepted is here.
 | |
| #
 | |
| #     This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
 | |
| #     that are then built with the rule parser.
 | |
| #
 | |
| 
 | |
| #
 | |
| # Here is the syntax of the state definitions in this file:
 | |
| #
 | |
| #
 | |
| #StateName:
 | |
| #   input-char           n next-state           ^push-state     action
 | |
| #   input-char           n next-state           ^push-state     action
 | |
| #       |                |   |                      |             |
 | |
| #       |                |   |                      |             |--- action to be performed by state machine
 | |
| #       |                |   |                      |                  See function RegexCompile::doParseActions()
 | |
| #       |                |   |                      |
 | |
| #       |                |   |                      |--- Push this named state onto the state stack.
 | |
| #       |                |   |                           Later, when next state is specified as "pop",
 | |
| #       |                |   |                           the pushed state will become the current state.
 | |
| #       |                |   |
 | |
| #       |                |   |--- Transition to this state if the current input character matches the input
 | |
| #       |                |        character or char class in the left hand column.  "pop" causes the next
 | |
| #       |                |        state to be popped from the state stack.
 | |
| #       |                |
 | |
| #       |                |--- When making the state transition specified on this line, advance to the next
 | |
| #       |                     character from the input only if 'n' appears here.
 | |
| #       |
 | |
| #       |--- Character or named character classes to test for.  If the current character being scanned
 | |
| #            matches, perform the actions and go to the state specified on this line.
 | |
| #            The input character is tested sequentially, in the order written.  The characters and
 | |
| #            character classes tested for do not need to be mutually exclusive.  The first match wins.
 | |
| #
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| #
 | |
| #  start state, scan position is at the beginning of the pattern.
 | |
| #
 | |
| start:
 | |
|    default                 term                                     doPatStart
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| #
 | |
| #  term.  At a position where we can accept the start of most items in a pattern.
 | |
| #
 | |
| term:
 | |
|     quoted               n expr-quant                               doLiteralChar
 | |
|     rule_char            n expr-quant                               doLiteralChar
 | |
|     '['                  n set-open       ^set-finish               doSetBegin
 | |
|     '('                  n open-paren
 | |
|     '.'                  n expr-quant                               doDotAny
 | |
|     '^'                  n expr-quant                               doCaret
 | |
|     '$'                  n expr-quant                               doDollar
 | |
|     '\'                  n backslash
 | |
|     '|'                  n  term                                    doOrOperator
 | |
|     ')'                  n  pop                                     doCloseParen
 | |
|     eof	                   term                                     doPatFinish
 | |
|     default                errorDeath                               doRuleError
 | |
| 
 | |
| 
 | |
| 
 | |
| #
 | |
| #   expr-quant    We've just finished scanning a term, now look for the optional
 | |
| #                 trailing quantifier - *, +, ?, *?,  etc.
 | |
| #
 | |
| expr-quant:
 | |
|     '*'                  n  quant-star
 | |
|     '+'                  n  quant-plus
 | |
|     '?'                  n  quant-opt
 | |
|     '{'                  n  interval-open                          doIntervalInit
 | |
|     '('                  n  open-paren-quant
 | |
|     default                 expr-cont
 | |
| 
 | |
| 
 | |
| #
 | |
| #  expr-cont      Expression, continuation.  At a point where additional terms are
 | |
| #                                            allowed, but not required.  No Quantifiers
 | |
| #
 | |
| expr-cont:
 | |
|     '|'                  n  term                                    doOrOperator
 | |
|     ')'                  n  pop                                     doCloseParen
 | |
|     default                 term
 | |
| 
 | |
| 
 | |
| #
 | |
| #   open-paren-quant   Special case handling for comments appearing before a quantifier,
 | |
| #                        e.g.   x(?#comment )*
 | |
| #                      Open parens from expr-quant come here; anything but a (?# comment
 | |
| #                      branches into the normal parenthesis sequence as quickly as possible.
 | |
| #
 | |
| open-paren-quant:
 | |
|     '?'                  n  open-paren-quant2                      doSuppressComments
 | |
|     default                 open-paren
 | |
| 
 | |
| open-paren-quant2:
 | |
|     '#'                  n  paren-comment   ^expr-quant
 | |
|     default                 open-paren-extended
 | |
| 
 | |
| 
 | |
| #
 | |
| #   open-paren    We've got an open paren.  We need to scan further to
 | |
| #                 determine what kind of group it is - plain (, (?:, (?>, or whatever.
 | |
| #
 | |
| open-paren:
 | |
|     '?'                  n  open-paren-extended                     doSuppressComments
 | |
|     default                 term            ^expr-quant             doOpenCaptureParen
 | |
| 
 | |
| open-paren-extended:
 | |
|     ':'                  n  term            ^expr-quant             doOpenNonCaptureParen  #  (?:
 | |
|     '>'                  n  term            ^expr-quant             doOpenAtomicParen      #  (?>
 | |
|     '='                  n  term            ^expr-cont              doOpenLookAhead        #  (?=
 | |
|     '!'                  n  term            ^expr-cont              doOpenLookAheadNeg     #  (?!
 | |
|     '<'                  n  open-paren-lookbehind
 | |
|     '#'                  n  paren-comment   ^term
 | |
|     'i'                     paren-flag                              doBeginMatchMode
 | |
|     'd'                     paren-flag                              doBeginMatchMode
 | |
|     'm'                     paren-flag                              doBeginMatchMode
 | |
|     's'                     paren-flag                              doBeginMatchMode
 | |
|     'u'                     paren-flag                              doBeginMatchMode
 | |
|     'w'                     paren-flag                              doBeginMatchMode
 | |
|     'x'                     paren-flag                              doBeginMatchMode
 | |
|     '-'                     paren-flag                              doBeginMatchMode
 | |
|     '('                  n  errorDeath                              doConditionalExpr
 | |
|     '{'                  n  errorDeath                              doPerlInline
 | |
|     default                 errorDeath                              doBadOpenParenType
 | |
| 
 | |
| open-paren-lookbehind:
 | |
|     '='                  n  term            ^expr-cont              doOpenLookBehind       #  (?<=
 | |
|     '!'                  n  term            ^expr-cont              doOpenLookBehindNeg    #  (?<!
 | |
|     ascii_letter            named-capture                           doBeginNamedCapture    #  (?<name
 | |
|     default                 errorDeath                              doBadOpenParenType
 | |
| 
 | |
| 
 | |
| #
 | |
| #   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
 | |
| #
 | |
| paren-comment:
 | |
|     ')'                  n  pop
 | |
|     eof		                errorDeath                              doMismatchedParenErr
 | |
|     default              n  paren-comment
 | |
| 
 | |
| #
 | |
| #  paren-flag    Scanned a (?ismx-ismx  flag setting
 | |
| #
 | |
| paren-flag:
 | |
|     'i'                  n  paren-flag                              doMatchMode
 | |
|     'd'                  n  paren-flag                              doMatchMode
 | |
|     'm'                  n  paren-flag                              doMatchMode
 | |
|     's'                  n  paren-flag                              doMatchMode
 | |
|     'u'                  n  paren-flag                              doMatchMode
 | |
|     'w'                  n  paren-flag                              doMatchMode
 | |
|     'x'                  n  paren-flag                              doMatchMode
 | |
|     '-'                  n  paren-flag                              doMatchMode
 | |
|     ')'                  n  term                                    doSetMatchMode
 | |
|     ':'                  n  term              ^expr-quant           doMatchModeParen
 | |
|     default                 errorDeath                              doBadModeFlag
 | |
| 
 | |
| #
 | |
| #  named-capture    (?<name> ... ), position currently on the name.
 | |
| #
 | |
| named-capture:
 | |
|     ascii_letter         n  named-capture                           doContinueNamedCapture
 | |
|     digit_char           n  named-capture                           doContinueNamedCapture
 | |
|     '>'                  n  term               ^expr-quant          doOpenCaptureParen      # common w non-named capture.
 | |
|     default                 errorDeath                              doBadNamedCapture
 | |
| 
 | |
| #
 | |
| #  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
 | |
| #                 between plain '*', '*?', '*+'
 | |
| #
 | |
| quant-star:
 | |
|      '?'                 n  expr-cont                               doNGStar               #  *?
 | |
|      '+'                 n  expr-cont                               doPossessiveStar       #  *+
 | |
|      default                expr-cont                               doStar
 | |
| 
 | |
| 
 | |
| #
 | |
| #  quant-plus     Scanning a '+' quantifier.  Need to look ahead to decide
 | |
| #                 between plain '+', '+?', '++'
 | |
| #
 | |
| quant-plus:
 | |
|      '?'                 n  expr-cont                               doNGPlus               #  +?
 | |
|      '+'                 n  expr-cont                               doPossessivePlus       #  ++
 | |
|      default                expr-cont                               doPlus                 #  +
 | |
| 
 | |
| 
 | |
| #
 | |
| #  quant-opt  Scanning a '?' quantifier.  Need to look ahead to decide
 | |
| #                  between plain '?', '??', '?+'
 | |
| #
 | |
| quant-opt:
 | |
|      '?'                 n  expr-cont                               doNGOpt                 #  ??
 | |
|      '+'                 n  expr-cont                               doPossessiveOpt         #  ?+
 | |
|      default                expr-cont                               doOpt                   #  ?
 | |
| 
 | |
| 
 | |
| #
 | |
| #   Interval         scanning a '{', the opening delimiter for an interval specification
 | |
| #                                   {number} or {min, max} or {min,}
 | |
| #
 | |
| interval-open:
 | |
|     digit_char              interval-lower
 | |
|     default                 errorDeath                              doIntervalError
 | |
| 
 | |
| interval-lower:
 | |
|     digit_char           n  interval-lower                          doIntevalLowerDigit
 | |
|     ','			         n  interval-upper
 | |
|     '}'                  n  interval-type                           doIntervalSame             # {n}
 | |
|     default                 errorDeath                              doIntervalError
 | |
| 
 | |
| interval-upper:
 | |
|     digit_char           n  interval-upper                          doIntervalUpperDigit
 | |
|     '}'                  n  interval-type
 | |
|     default                 errorDeath                              doIntervalError
 | |
| 
 | |
| interval-type:
 | |
|     '?'                  n  expr-cont                               doNGInterval                # {n,m}?
 | |
|     '+'                  n  expr-cont                               doPossessiveInterval        # {n,m}+
 | |
|     default                 expr-cont                               doInterval                  # {n,m}
 | |
| 
 | |
| 
 | |
| #
 | |
| #  backslash        #  Backslash.  Figure out which of the \thingies we have encountered.
 | |
| #                                  The low level next-char function will have preprocessed
 | |
| #                                  some of them already; those won't come here.
 | |
| backslash:
 | |
|    'A'                   n  term                                    doBackslashA
 | |
|    'B'                   n  term                                    doBackslashB
 | |
|    'b'                   n  term                                    doBackslashb
 | |
|    'd'                   n  expr-quant                              doBackslashd
 | |
|    'D'                   n  expr-quant                              doBackslashD
 | |
|    'G'                   n  term                                    doBackslashG
 | |
|    'h'                   n  expr-quant                              doBackslashh
 | |
|    'H'                   n  expr-quant                              doBackslashH
 | |
|    'k'                   n  named-backref
 | |
|    'N'                      expr-quant                              doNamedChar      #   \N{NAME}  named char
 | |
|    'p'                      expr-quant                              doProperty       #   \p{Lu}  style property
 | |
|    'P'                      expr-quant                              doProperty
 | |
|    'R'                   n  expr-quant                              doBackslashR
 | |
|    'Q'                   n  term                                    doEnterQuoteMode
 | |
|    'S'                   n  expr-quant                              doBackslashS
 | |
|    's'                   n  expr-quant                              doBackslashs
 | |
|    'v'                   n  expr-quant                              doBackslashv
 | |
|    'V'                   n  expr-quant                              doBackslashV
 | |
|    'W'                   n  expr-quant                              doBackslashW
 | |
|    'w'                   n  expr-quant                              doBackslashw
 | |
|    'X'                   n  expr-quant                              doBackslashX
 | |
|    'Z'                   n  term                                    doBackslashZ
 | |
|    'z'                   n  term                                    doBackslashz
 | |
|    digit_char            n  expr-quant                              doBackRef         #  Will scan multiple digits
 | |
|    eof                      errorDeath                              doEscapeError
 | |
|    default               n  expr-quant                              doEscapedLiteralChar
 | |
| 
 | |
| 
 | |
| # named-backref   Scanned \k
 | |
| #                 Leading to \k<captureName>
 | |
| #                 Failure to get the full sequence is an error.
 | |
| #
 | |
| named-backref:
 | |
|     '<'                  n  named-backref-2                         doBeginNamedBackRef
 | |
|     default                 errorDeath                              doBadNamedCapture
 | |
| 
 | |
| named-backref-2:
 | |
|     ascii_letter         n  named-backref-3                         doContinueNamedBackRef
 | |
|     default                 errorDeath                              doBadNamedCapture
 | |
| 
 | |
| named-backref-3:
 | |
|     ascii_letter         n  named-backref-3                         doContinueNamedBackRef
 | |
|     digit_char           n  named-backref-3                         doContinueNamedBackRef
 | |
|     '>'                  n  expr-quant                              doCompleteNamedBackRef
 | |
|     default                 errorDeath                              doBadNamedCapture
 | |
| 
 | |
| 
 | |
| #
 | |
| # [set expression] parsing,
 | |
| #    All states involved in parsing set expressions have names beginning with "set-"
 | |
| #
 | |
| 
 | |
| set-open:
 | |
|    '^'                   n  set-open2                               doSetNegate
 | |
|    ':'                      set-posix                               doSetPosixProp
 | |
|    default                  set-open2
 | |
| 
 | |
| set-open2:
 | |
|    ']'                   n  set-after-lit                           doSetLiteral
 | |
|    default                  set-start
 | |
| 
 | |
| #  set-posix:
 | |
| #                  scanned a '[:'  If it really is a [:property:], doSetPosixProp will have
 | |
| #                  moved the scan to the closing ']'.  If it wasn't a property
 | |
| #                  expression, the scan will still be at the opening ':', which should
 | |
| #                  be interpreted as a normal set expression.
 | |
| set-posix:
 | |
|     ']'                  n   pop                                    doSetEnd
 | |
|     ':'                      set-start
 | |
|     default                  errorDeath                             doRuleError  # should not be possible.
 | |
| 
 | |
| #
 | |
| #   set-start   after the [ and special case leading characters (^ and/or ]) but before
 | |
| #               everything else.   A '-' is literal at this point.
 | |
| #
 | |
| set-start:
 | |
|     ']'                  n  pop                                     doSetEnd
 | |
|     '['                  n  set-open      ^set-after-set            doSetBeginUnion
 | |
|     '\'                  n  set-escape
 | |
|     '-'                  n  set-start-dash
 | |
|     '&'                  n  set-start-amp
 | |
|     default              n  set-after-lit                           doSetLiteral
 | |
| 
 | |
| #    set-start-dash    Turn "[--" into a syntax error.
 | |
| #                           "[-x" is good, - and x are literals.
 | |
| #
 | |
| set-start-dash:
 | |
|     '-'                     errorDeath                              doRuleError
 | |
|     default                 set-after-lit                           doSetAddDash
 | |
| 
 | |
| #    set-start-amp     Turn "[&&" into a syntax error.
 | |
| #                           "[&x" is good, & and x are literals.
 | |
| #
 | |
| set-start-amp:
 | |
|     '&'                     errorDeath                              doRuleError
 | |
|     default                 set-after-lit                           doSetAddAmp
 | |
| 
 | |
| #
 | |
| #   set-after-lit    The last thing scanned was a literal character within a set.
 | |
| #                    Can be followed by anything.  Single '-' or '&' are
 | |
| #                    literals in this context, not operators.
 | |
| set-after-lit:
 | |
|     ']'                  n  pop                                     doSetEnd
 | |
|     '['                  n  set-open      ^set-after-set            doSetBeginUnion
 | |
|     '-'                  n  set-lit-dash
 | |
|     '&'                  n  set-lit-amp
 | |
|     '\'                  n  set-escape
 | |
|     eof                     errorDeath                              doSetNoCloseError
 | |
|     default              n  set-after-lit                           doSetLiteral
 | |
| 
 | |
| set-after-set:
 | |
|     ']'                  n  pop                                     doSetEnd
 | |
|     '['                  n  set-open      ^set-after-set            doSetBeginUnion
 | |
|     '-'                  n  set-set-dash
 | |
|     '&'                  n  set-set-amp
 | |
|     '\'                  n  set-escape
 | |
|     eof                     errorDeath                              doSetNoCloseError
 | |
|     default              n  set-after-lit                           doSetLiteral
 | |
| 
 | |
| #
 | |
| #   set-after-range    The last thing scanned was a range (a-b) or a set-like
 | |
| #                      escape such as \w, \s or \d within a set.
 | |
| #                      A following single '-' or '&' is a literal here;
 | |
| #                      "--" and "&&" are still operators.
 | |
| set-after-range:
 | |
|     ']'                  n  pop                                     doSetEnd
 | |
|     '['                  n  set-open      ^set-after-set            doSetBeginUnion
 | |
|     '-'                  n  set-range-dash
 | |
|     '&'                  n  set-range-amp
 | |
|     '\'                  n  set-escape
 | |
|     eof                     errorDeath                              doSetNoCloseError
 | |
|     default              n  set-after-lit                           doSetLiteral
 | |
|     
 | |
| 
 | |
| # set-after-op
 | |
| #     After a --  or &&
 | |
| #     It is an error to close a set at this point.
 | |
| #
 | |
| set-after-op:
 | |
|     '['                  n  set-open         ^set-after-set         doSetBeginUnion
 | |
|     ']'                     errorDeath                              doSetOpError
 | |
|     '\'                  n  set-escape
 | |
|     default              n  set-after-lit                           doSetLiteral
 | |
| 
 | |
| #
 | |
| #   set-set-amp
 | |
| #      Have scanned [[set]&
 | |
| #      Could be a '&' intersection operator, if a set follows.
 | |
| #      Could be the start of a '&&' operator.
 | |
| #      Otherwise is a literal.
 | |
| set-set-amp:
 | |
|     '['                  n  set-open      ^set-after-set           doSetBeginIntersection1
 | |
|     '&'                  n  set-after-op                           doSetIntersection2
 | |
|     default                 set-after-lit                          doSetAddAmp
 | |
| 
 | |
| 
 | |
| # set-lit-amp   Have scanned "[literals&"
 | |
| #               Could be a start of "&&" operator or a literal
 | |
| #               In [abc&[def]],   the '&' is a literal
 | |
| #
 | |
| set-lit-amp:
 | |
|     '&'                  n  set-after-op                            doSetIntersection2
 | |
|     default                 set-after-lit                           doSetAddAmp
 | |
| 
 | |
| 
 | |
| #
 | |
| #  set-set-dash
 | |
| #      Have scanned [set]-
 | |
| #      Could be a '-' difference operator, if a [set] follows.
 | |
| #      Could be the start of a '--' operator.
 | |
| #      Otherwise is a literal.
 | |
| set-set-dash:
 | |
|     '['                  n  set-open      ^set-after-set           doSetBeginDifference1
 | |
|     '-'                  n  set-after-op                           doSetDifference2
 | |
|     default                 set-after-lit                          doSetAddDash
 | |
| 
 | |
| 
 | |
| #
 | |
| #  set-range-dash
 | |
| #      scanned  a-b-  or \w-
 | |
| #         any set or range like item where the trailing single '-' should
 | |
| #         be literal, not a set difference operation.
 | |
| #         A trailing "--" is still a difference operator.
 | |
| set-range-dash:
 | |
|     '-'                  n  set-after-op                           doSetDifference2
 | |
|     default                 set-after-lit                          doSetAddDash
 | |
| 
 | |
| 
 | |
| #
 | |
| #  set-range-amp
 | |
| #      scanned  a-b&  or \w&
 | |
| #         any set or range like item where the trailing single '&' should
 | |
| #         be literal, not a set intersection operation.
 | |
| #         A trailing "&&" is still an intersection operator.
 | |
| set-range-amp:
 | |
|     '&'                  n  set-after-op                           doSetIntersection2
 | |
|     default                 set-after-lit                          doSetAddAmp
 | |
| 
 | |
| 
 | |
| #  set-lit-dash
 | |
| #     Have scanned "[literals-" Could be a range or a -- operator or a literal
 | |
| #     In [abc-[def]], the '-' is a literal (confirmed with a Java test)
 | |
| #        [abc-\p{xx}  the '-' is an error
 | |
| #        [abc-]       the '-' is a literal
 | |
| #        [ab-xy]      the '-' is a range
 | |
| #
 | |
| set-lit-dash:
 | |
|     '-'                  n  set-after-op                            doSetDifference2
 | |
|     '['                     set-after-lit                           doSetAddDash
 | |
|     ']'                     set-after-lit                           doSetAddDash
 | |
|     '\'                  n  set-lit-dash-escape
 | |
|     default              n  set-after-range                         doSetRange
 | |
| 
 | |
| # set-lit-dash-escape
 | |
| #
 | |
| #    scanned "[literal-\"
 | |
| #    Could be a range, if the \ introduces an escaped literal char or a named char.
 | |
| #    Otherwise it is an error.
 | |
| #
 | |
| set-lit-dash-escape:
 | |
|    's'                      errorDeath                             doSetOpError
 | |
|    'S'                      errorDeath                             doSetOpError
 | |
|    'w'                      errorDeath                             doSetOpError
 | |
|    'W'                      errorDeath                             doSetOpError
 | |
|    'd'                      errorDeath                             doSetOpError
 | |
|    'D'                      errorDeath                             doSetOpError
 | |
|    'N'                      set-after-range                        doSetNamedRange
 | |
|    default               n  set-after-range                        doSetRange
 | |
| 
 | |
|    
 | |
| #
 | |
| #  set-escape
 | |
| #       Common back-slash escape processing within set expressions
 | |
| #
 | |
| set-escape:
 | |
|    'p'                      set-after-set                           doSetProp
 | |
|    'P'                      set-after-set                           doSetProp
 | |
|    'N'                      set-after-lit                           doSetNamedChar
 | |
|    's'                   n  set-after-range                         doSetBackslash_s
 | |
|    'S'                   n  set-after-range                         doSetBackslash_S
 | |
|    'w'                   n  set-after-range                         doSetBackslash_w
 | |
|    'W'                   n  set-after-range                         doSetBackslash_W
 | |
|    'd'                   n  set-after-range                         doSetBackslash_d
 | |
|    'D'                   n  set-after-range                         doSetBackslash_D
 | |
|    'h'                   n  set-after-range                         doSetBackslash_h
 | |
|    'H'                   n  set-after-range                         doSetBackslash_H
 | |
|    'v'                   n  set-after-range                         doSetBackslash_v
 | |
|    'V'                   n  set-after-range                         doSetBackslash_V
 | |
|    default               n  set-after-lit                           doSetLiteralEscaped 
 | |
| 
 | |
| #
 | |
| # set-finish
 | |
| #     Have just encountered the final ']' that completes a [set], and
 | |
| #     arrived here via a pop.  From here, we exit the set parsing world, and go
 | |
| #     back to generic regular expression parsing.
 | |
| #
 | |
| set-finish:
 | |
|     default                 expr-quant                              doSetFinish
 | |
| 
 | |
| 
 | |
| #
 | |
| # errorDeath.   This state is specified as the next state whenever a syntax error
 | |
| #               in the pattern is detected.  Barring bugs, the state machine will never
 | |
| #               actually get here, but will stop because of the action associated with the error.
 | |
| #               But, just in case, this state asks the state machine to exit.
 | |
| errorDeath:
 | |
|     default              n errorDeath                               doExit
 | |
| 
 | |
| 
 |