358 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			358 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C++
		
	
	
	
| // © 2016 and later: Unicode, Inc. and others.
 | |
| // License & terms of use: http://www.unicode.org/copyright.html
 | |
| /*
 | |
| **********************************************************************
 | |
| * Copyright (C) 1999-2011, International Business Machines Corporation
 | |
| * and others. All Rights Reserved.
 | |
| **********************************************************************
 | |
| *   Date        Name        Description
 | |
| *   11/17/99    aliu        Creation.
 | |
| **********************************************************************
 | |
| */
 | |
| #ifndef RBT_PARS_H
 | |
| #define RBT_PARS_H
 | |
| 
 | |
| #include "unicode/utypes.h"
 | |
| 
 | |
| #if !UCONFIG_NO_TRANSLITERATION
 | |
| #ifdef __cplusplus
 | |
| 
 | |
| #include "unicode/uobject.h"
 | |
| #include "unicode/parseerr.h"
 | |
| #include "unicode/unorm.h"
 | |
| #include "rbt.h"
 | |
| #include "hash.h"
 | |
| #include "uvector.h"
 | |
| 
 | |
| U_NAMESPACE_BEGIN
 | |
| 
 | |
| // Forward declarations: within this header these types appear only as
 | |
| // pointers, references, or in a friend declaration, so their complete
 | |
| // definitions are not required here.
 | |
| class TransliterationRuleData;
 | |
| class UnicodeFunctor;
 | |
| class ParseData;
 | |
| class RuleHalf;
 | |
| class ParsePosition;
 | |
| class StringMatcher;
 | |
| 
 | |
| /**
 | |
|  * Parser for transliterator rule text.  parse() fills in the public data
 | |
|  * members (dataVector, idBlockVector, compoundFilter); the private data
 | |
|  * members hold state used while parsing.
 | |
|  *
 | |
|  * Copying is disallowed: the copy constructor and assignment operator are
 | |
|  * declared private and have no implementation.
 | |
|  */
 | |
| class TransliteratorParser : public UMemory {
 | |
| 
 | |
|  public:
 | |
| 
 | |
|     /**
 | |
|      * PUBLIC data member.
 | |
|      * A Vector of TransliterationRuleData objects, one for each discrete group
 | |
|      * of rules in the rule set
 | |
|      */
 | |
|     UVector dataVector;
 | |
| 
 | |
|     /**
 | |
|      * PUBLIC data member.
 | |
|      * A Vector of UnicodeStrings containing all of the ID blocks in the rule set
 | |
|      */
 | |
|     UVector idBlockVector;
 | |
| 
 | |
|     /**
 | |
|      * PUBLIC data member containing the parsed compound filter, if any.
 | |
|      * See orphanCompoundFilter() for handing it to the caller.
 | |
|      */
 | |
|     UnicodeSet* compoundFilter;
 | |
| 
 | |
|  private:
 | |
| 
 | |
|     /**
 | |
|      * The current data object for which we are parsing rules
 | |
|      */
 | |
|     TransliterationRuleData* curData;
 | |
| 
 | |
|     /**
 | |
|      * Direction of the rules being parsed.
 | |
|      * NOTE(review): presumably the direction passed to parse() -- confirm
 | |
|      * against the implementation.
 | |
|      */
 | |
|     UTransDirection direction;
 | |
| 
 | |
|     /**
 | |
|      * Parse error information.
 | |
|      */
 | |
|     UParseError parseError;
 | |
| 
 | |
|     /**
 | |
|      * Temporary symbol table used during parsing.
 | |
|      */
 | |
|     ParseData* parseData;
 | |
| 
 | |
|     /**
 | |
|      * Temporary vector of matcher variables.  When parsing is complete, this
 | |
|      * is copied into the array data.variables.  As with data.variables,
 | |
|      * element 0 corresponds to character data.variablesBase.
 | |
|      */
 | |
|     UVector variablesVector;
 | |
| 
 | |
|     /**
 | |
|      * Temporary table of variable names.  When parsing is complete, this is
 | |
|      * copied into data.variableNames.
 | |
|      */
 | |
|     Hashtable variableNames;    
 | |
|     
 | |
|     /**
 | |
|      * String of standins for segments.  Used during the parsing of a single
 | |
|      * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
 | |
|      * to StringMatcher object segmentObjects.elementAt(0), etc.
 | |
|      */
 | |
|     UnicodeString segmentStandins;
 | |
| 
 | |
|     /**
 | |
|      * Vector of StringMatcher objects for segments.  Used during the
 | |
|      * parsing of a single rule.  
 | |
|      * segmentStandins.charAt(0) is the standin for "$1" and corresponds
 | |
|      * to StringMatcher object segmentObjects.elementAt(0), etc.
 | |
|      */
 | |
|     UVector segmentObjects;
 | |
| 
 | |
|     /**
 | |
|      * The next available stand-in for variables.  This starts at some point in
 | |
|      * the private use area (discovered dynamically) and increments up toward
 | |
|      * <code>variableLimit</code>.  At any point during parsing, available
 | |
|      * variables are <code>variableNext..variableLimit-1</code>.
 | |
|      */
 | |
|     UChar variableNext;
 | |
| 
 | |
|     /**
 | |
|      * The limit of the stand-ins available for variables, i.e. one past the
 | |
|      * last available stand-in.  This is discovered dynamically.  At any
 | |
|      * point during parsing, available variables are
 | |
|      * <code>variableNext..variableLimit-1</code>.
 | |
|      */
 | |
|     UChar variableLimit;
 | |
| 
 | |
|     /**
 | |
|      * When we encounter an undefined variable, we do not immediately signal
 | |
|      * an error, in case we are defining this variable, e.g., "$a = [a-z];".
 | |
|      * Instead, we save the name of the undefined variable, and substitute
 | |
|      * in the placeholder char variableLimit - 1, and decrement
 | |
|      * variableLimit.
 | |
|      */
 | |
|     UnicodeString undefinedVariableName;
 | |
| 
 | |
|     /**
 | |
|      * The stand-in character for the 'dot' set, represented by '.' in
 | |
|      * patterns.  This is allocated the first time it is needed, and
 | |
|      * reused thereafter.
 | |
|      */
 | |
|     UChar dotStandIn;
 | |
| 
 | |
| public:
 | |
| 
 | |
|     /**
 | |
|      * Constructor.
 | |
|      * @param statusReturn  error code reference; presumably set to a failure
 | |
|      *                      code if construction fails -- confirm.
 | |
|      */
 | |
|     TransliteratorParser(UErrorCode &statusReturn);
 | |
| 
 | |
|     /**
 | |
|      * Destructor.
 | |
|      */
 | |
|     ~TransliteratorParser();
 | |
| 
 | |
|     /**
 | |
|      * Parse the given string as a sequence of rules, separated by newline
 | |
|      * characters ('\n'), and cause this object to implement those rules.  Any
 | |
|      * previous rules are discarded.  Typically this method is called exactly
 | |
|      * once after construction.
 | |
|      *
 | |
|      * Parse the given rules, in the given direction.  After this call
 | |
|      * returns, query the public data members for results.  The caller
 | |
|      * owns the 'data' and 'compoundFilter' data members after this
 | |
|      * call returns.
 | |
|      *
 | |
|      * NOTE(review): the first paragraph says the rules are separated by
 | |
|      * newline characters while the parameter description below says ';'
 | |
|      * -- confirm which separator applies.
 | |
|      * NOTE(review): no member named 'data' is declared in this class;
 | |
|      * presumably dataVector is meant -- confirm.
 | |
|      * @param rules      rules, separated by ';'
 | |
|      * @param direction  either FORWARD or REVERSE.
 | |
|      * @param pe         Struct to receive information on position 
 | |
|      *                   of error if an error is encountered
 | |
|      * @param ec         Output param set to success/failure code.
 | |
|      */
 | |
|     void parse(const UnicodeString& rules,
 | |
|                UTransDirection direction,
 | |
|                UParseError& pe,
 | |
|                UErrorCode& ec);
 | |
| 
 | |
|     /**
 | |
|      * Return the compound filter parsed by parse().  Caller owns result.
 | |
|      * @return the compound filter parsed by parse().
 | |
|      */ 
 | |
|     UnicodeSet* orphanCompoundFilter();
 | |
| 
 | |
| private:
 | |
| 
 | |
|     /**
 | |
|      * Parse the given rules, in the given direction.
 | |
|      * NOTE(review): the previous comment here described returning a
 | |
|      * representation of the transliterator as source rules, which does not
 | |
|      * match this signature (void return, const input string).  Presumably
 | |
|      * this is the worker called by parse() -- confirm.
 | |
|      * @param rules      the rules to parse (input).
 | |
|      * @param direction  either FORWARD or REVERSE.
 | |
|      * @param status     error code reference; presumably set to a failure
 | |
|      *                   code on error -- confirm.
 | |
|      */
 | |
|     void parseRules(const UnicodeString& rules,
 | |
|                     UTransDirection direction,
 | |
|                     UErrorCode& status);
 | |
| 
 | |
|     /**
 | |
|      * MAIN PARSER.  Parse the next rule in the given rule string, starting
 | |
|      * at pos.  Return the index after the last character parsed.  Do not
 | |
|      * parse characters at or after limit.
 | |
|      *
 | |
|      * Important:  The character at pos must be a non-whitespace character
 | |
|      * that is not the comment character.
 | |
|      *
 | |
|      * This method handles quoting, escaping, and whitespace removal.  It
 | |
|      * parses the end-of-rule character.  It recognizes context and cursor
 | |
|      * indicators.  Once it does a lexical breakdown of the rule at pos, it
 | |
|      * creates a rule object and adds it to our rule list.
 | |
|      * @param rule       the rule string to parse (input).
 | |
|      * @param pos        the starting position.
 | |
|      * @param limit      pointer past the last character of the rule.
 | |
|      * @param status     error code reference; presumably set to a failure
 | |
|      *                   code on error -- confirm.
 | |
|      * @return           the index after the last character parsed.
 | |
|      */
 | |
|     int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
 | |
| 
 | |
|     /**
 | |
|      * Set the variable range to [start, end] (inclusive).
 | |
|      * @param start    the start value of the range.
 | |
|      * @param end      the end value of the range.
 | |
|      * @param status   error code reference; presumably set to a failure
 | |
|      *                 code on error -- confirm.
 | |
|      */
 | |
|     void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
 | |
| 
 | |
|     /**
 | |
|      * Assert that the given character is NOT within the variable range.
 | |
|      * If it is, return false.  This is necessary to ensure that the
 | |
|      * variable range does not overlap characters used in a rule.
 | |
|      * @param ch     the given character.
 | |
|      * @return       True, if the given character is NOT within the variable range.
 | |
|      */
 | |
|     UBool checkVariableRange(UChar32 ch) const;
 | |
| 
 | |
|     /**
 | |
|      * Set the maximum backup to 'backup', in response to a pragma
 | |
|      * statement.
 | |
|      * @param backup    the new value to be set.
 | |
|      */
 | |
|     void pragmaMaximumBackup(int32_t backup);
 | |
| 
 | |
|     /**
 | |
|      * Begin normalizing all rules using the given mode, in response
 | |
|      * to a pragma statement.
 | |
|      * @param mode    the given mode.
 | |
|      */
 | |
|     void pragmaNormalizeRules(UNormalizationMode mode);
 | |
| 
 | |
|     /**
 | |
|      * Return true if the given rule looks like a pragma.
 | |
|      * @param rule the rule string to examine.
 | |
|      * @param pos offset to the first non-whitespace character
 | |
|      * of the rule.
 | |
|      * @param limit pointer past the last character of the rule.
 | |
|      * @return true if the given rule looks like a pragma.
 | |
|      */
 | |
|     static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
 | |
| 
 | |
|     /**
 | |
|      * Parse a pragma.  This method assumes resemblesPragma() has
 | |
|      * already returned true.
 | |
|      * @param rule the rule string containing the pragma.
 | |
|      * @param pos offset to the first non-whitespace character
 | |
|      * of the rule.
 | |
|      * @param limit pointer past the last character of the rule.
 | |
|      * @param status error code reference; presumably set to a failure
 | |
|      * code on error -- confirm.
 | |
|      * @return the position index after the final ';' of the pragma,
 | |
|      * or -1 on failure.
 | |
|      */
 | |
|     int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
 | |
| 
 | |
|     /**
 | |
|      * Called by main parser upon syntax error.  Search the rule string
 | |
|      * for the probable end of the rule.  Of course, if the error is that
 | |
|      * the end of rule marker is missing, then the rule end will not be found.
 | |
|      * In any case the rule start will be correctly reported.
 | |
|      * @param parseErrorCode error code.
 | |
|      * @param msg error description.
 | |
|      *        NOTE(review): the second parameter is unnamed in this
 | |
|      *        declaration; 'msg' is only the name used by this comment --
 | |
|      *        confirm its meaning against the implementation.
 | |
|      * @param start position of first character of current rule.
 | |
|      * @param status error code reference; presumably receives the error
 | |
|      *        being reported -- confirm.
 | |
|      * @return start position of first character of current rule.
 | |
|      */
 | |
|     int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
 | |
|                         UErrorCode& status);
 | |
| 
 | |
|     /**
 | |
|      * Parse a UnicodeSet out, store it, and return the stand-in character
 | |
|      * used to represent it.
 | |
|      *
 | |
|      * @param rule    the rule for UnicodeSet.
 | |
|      * @param pos     the position in pattern at which to start parsing.
 | |
|      * @param status  error code reference; presumably set to a failure
 | |
|      *                code on error -- confirm.
 | |
|      * @return        the stand-in character used to represent it.
 | |
|      */
 | |
|     UChar parseSet(const UnicodeString& rule,
 | |
|                    ParsePosition& pos,
 | |
|                    UErrorCode& status);
 | |
| 
 | |
|     /**
 | |
|      * Generate and return a stand-in for a new UnicodeFunctor.  Store
 | |
|      * the matcher (adopt it).
 | |
|      * @param adopted the UnicodeFunctor to be adopted.
 | |
|      * @param status  error code reference; presumably set to a failure
 | |
|      *                code on error -- confirm.
 | |
|      * @return        a stand-in for a new UnicodeFunctor.
 | |
|      */
 | |
|     UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
 | |
| 
 | |
|     /**
 | |
|      * Return the standin for segment seg (1-based).
 | |
|      * @param seg    the given segment.
 | |
|      * @param status error code reference; presumably set to a failure
 | |
|      *               code on error -- confirm.
 | |
|      * @return       the standIn character for the given segment.
 | |
|      */
 | |
|     UChar getSegmentStandin(int32_t seg, UErrorCode& status);
 | |
| 
 | |
|     /**
 | |
|      * Set the object for segment seg (1-based).
 | |
|      * @param seg      the given segment.
 | |
|      * @param adopted  the StringMatcher to be adopted.
 | |
|      * @param status   error code reference; presumably set to a failure
 | |
|      *                 code on error -- confirm.
 | |
|      */
 | |
|     void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
 | |
| 
 | |
|     /**
 | |
|      * Return the stand-in for the dot set.  It is allocated the first
 | |
|      * time and reused thereafter.
 | |
|      * @param status  error code reference; presumably set to a failure
 | |
|      *                code on error -- confirm.
 | |
|      * @return    the stand-in for the dot set.
 | |
|      */
 | |
|     UChar getDotStandIn(UErrorCode& status);
 | |
| 
 | |
|     /**
 | |
|      * Append the value of the given variable name to the given
 | |
|      * UnicodeString.
 | |
|      * @param name    the name of the variable whose value is appended.
 | |
|      * @param buf     the given UnicodeString to append to.
 | |
|      * @param status  error code reference; presumably set to a failure
 | |
|      *                code on error -- confirm.
 | |
|      */
 | |
|     void appendVariableDef(const UnicodeString& name,
 | |
|                            UnicodeString& buf,
 | |
|                            UErrorCode& status);
 | |
| 
 | |
|     /**
 | |
|      * Glue method to get around access restrictions in C++.
 | |
|      * (The declaration below is commented out.)
 | |
|      */
 | |
|     /*static Transliterator* createBasicInstance(const UnicodeString& id,
 | |
|                                                const UnicodeString* canonID);*/
 | |
| 
 | |
|     friend class RuleHalf;
 | |
| 
 | |
|     // Disallowed methods; no impl.
 | |
|     /**
 | |
|      * Copy constructor
 | |
|      */
 | |
|     TransliteratorParser(const TransliteratorParser&);
 | |
|     
 | |
|     /**
 | |
|      * Assignment operator
 | |
|      */
 | |
|     TransliteratorParser& operator=(const TransliteratorParser&);
 | |
| };
 | |
| 
 | |
| U_NAMESPACE_END
 | |
| 
 | |
| #endif /* #ifdef __cplusplus */
 | |
| 
 | |
| /**
 | |
|  * Strip/convert the following from the transliterator rules:
 | |
|  *   - comments
 | |
|  *   - newlines
 | |
|  *   - white space at the beginning and end of a line
 | |
|  *   - \u escape notation (unescaped)
 | |
|  *
 | |
|  * The target buffer must be the same size as the source.
 | |
|  * NOTE(review): presumably "at least as large as the source" is what is
 | |
|  * actually required -- confirm against the implementation.
 | |
|  *
 | |
|  * This declaration sits outside the __cplusplus-only section of this header
 | |
|  * and is declared with U_CAPI.
 | |
|  *
 | |
|  * @param source     the rule text to process.
 | |
|  * @param sourceLen  length of source (presumably in UChars -- confirm).
 | |
|  * @param target     buffer that receives the processed rules.
 | |
|  * @param status     pointer to an error code; presumably set to a failure
 | |
|  *                   code on error -- confirm.
 | |
|  * @return           presumably the length of the text written to target --
 | |
|  *                   confirm against the implementation.
 | |
|  * @internal
 | |
|  */
 | |
| U_CAPI int32_t
 | |
| utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
 | |
| 
 | |
| #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 | |
| 
 | |
| #endif
 |