// Source listing metadata: 253 lines, 8.6 KiB, C++
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 * Copyright (C) 2001-2011, International Business Machines Corporation
 * and others. All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   07/23/01    aliu        Creation.
 **********************************************************************
 */
| #ifndef STRMATCH_H
 | |
| #define STRMATCH_H
 | |
| 
 | |
| #include "unicode/utypes.h"
 | |
| 
 | |
| #if !UCONFIG_NO_TRANSLITERATION
 | |
| 
 | |
| #include "unicode/unistr.h"
 | |
| #include "unicode/unifunct.h"
 | |
| #include "unicode/unimatch.h"
 | |
| #include "unicode/unirepl.h"
 | |
| 
 | |
| U_NAMESPACE_BEGIN
 | |
| 
 | |
| class TransliterationRuleData;
 | |
| 
 | |
| /**
 | |
|  * An object that matches a fixed input string, implementing the
 | |
|  * UnicodeMatcher API.  This object also implements the
 | |
|  * UnicodeReplacer API, allowing it to emit the matched text as
 | |
|  * output.  Since the match text may contain flexible match elements,
 | |
|  * such as UnicodeSets, the emitted text is not the match pattern, but
 | |
|  * instead a substring of the actual matched text.  Following
 | |
|  * convention, the output text is the leftmost match seen up to this
 | |
|  * point.
 | |
|  *
 | |
|  * A StringMatcher may represent a segment, in which case it has a
 | |
|  * positive segment number.  This affects how the matcher converts
 | |
|  * itself to a pattern but does not otherwise affect its function.
 | |
|  *
 | |
|  * A StringMatcher that is not a segment should not be used as a
 | |
|  * UnicodeReplacer.
 | |
|  */
 | |
| class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
 | |
| 
 | |
|  public:
 | |
| 
 | |
|     /**
 | |
|      * Construct a matcher that matches the given pattern string.
 | |
|      * @param string the pattern to be matched, possibly containing
 | |
|      * stand-ins that represent nested UnicodeMatcher objects.
 | |
|      * @param start inclusive start index of text to be replaced
 | |
|      * @param limit exclusive end index of text to be replaced;
 | |
|      * must be greater than or equal to start
 | |
|      * @param segmentNum the segment number from 1..n, or 0 if this is
 | |
|      * not a segment.
 | |
|      * @param data context object mapping stand-ins to
 | |
|      * UnicodeMatcher objects.
 | |
|      */
 | |
|     StringMatcher(const UnicodeString& string,
 | |
|                   int32_t start,
 | |
|                   int32_t limit,
 | |
|                   int32_t segmentNum,
 | |
|                   const TransliterationRuleData& data);
 | |
| 
 | |
|     /**
 | |
|      * Copy constructor
 | |
|      * @param o  the object to be copied.
 | |
|      */
 | |
|     StringMatcher(const StringMatcher& o);
 | |
|         
 | |
|     /**
 | |
|      * Destructor
 | |
|      */
 | |
|     virtual ~StringMatcher();
 | |
| 
 | |
|     /**
 | |
|      * Implement UnicodeFunctor
 | |
|      * @return a copy of the object.
 | |
|      */
 | |
|     virtual StringMatcher* clone() const override;
 | |
| 
 | |
|     /**
 | |
|      * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
 | |
|      * and return the pointer.
 | |
|      * @return the UnicodeMatcher point.
 | |
|      */
 | |
|     virtual UnicodeMatcher* toMatcher() const override;
 | |
| 
 | |
|     /**
 | |
|      * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
 | |
|      * and return the pointer.
 | |
|      * @return the UnicodeReplacer pointer.
 | |
|      */
 | |
|     virtual UnicodeReplacer* toReplacer() const override;
 | |
| 
 | |
|     /**
 | |
|      * Implement UnicodeMatcher
 | |
|      * @param text the text to be matched
 | |
|      * @param offset on input, the index into text at which to begin
 | |
|      * matching.  On output, the limit of the matched text.  The
 | |
|      * number of matched characters is the output value of offset
 | |
|      * minus the input value.  Offset should always point to the
 | |
|      * HIGH SURROGATE (leading code unit) of a pair of surrogates,
 | |
|      * both on entry and upon return.
 | |
|      * @param limit the limit index of text to be matched.  Greater
 | |
|      * than offset for a forward direction match, less than offset for
 | |
|      * a backward direction match.  The last character to be
 | |
|      * considered for matching will be text.charAt(limit-1) in the
 | |
|      * forward direction or text.charAt(limit+1) in the backward
 | |
|      * direction.
 | |
|      * @param incremental  if true, then assume further characters may
 | |
|      * be inserted at limit and check for partial matching.  Otherwise
 | |
|      * assume the text as given is complete.
 | |
|      * @return a match degree value indicating a full match, a partial
 | |
|      * match, or a mismatch.  If incremental is false then
 | |
|      * U_PARTIAL_MATCH should never be returned.
 | |
|      */
 | |
|     virtual UMatchDegree matches(const Replaceable& text,
 | |
|                                  int32_t& offset,
 | |
|                                  int32_t limit,
 | |
|                                  UBool incremental) override;
 | |
| 
 | |
|     /**
 | |
|      * Implement UnicodeMatcher
 | |
|      * @param result            Output param to receive the pattern.
 | |
|      * @param escapeUnprintable if True then escape the unprintable characters.
 | |
|      * @return                  A reference to 'result'.
 | |
|      */
 | |
|     virtual UnicodeString& toPattern(UnicodeString& result,
 | |
|                                      UBool escapeUnprintable = false) const override;
 | |
| 
 | |
|     /**
 | |
|      * Implement UnicodeMatcher
 | |
|      * Returns true if this matcher will match a character c, where c
 | |
|      * & 0xFF == v, at offset, in the forward direction (with limit >
 | |
|      * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
 | |
|      * indexing.
 | |
|      * @param v    the given value
 | |
|      * @return     true if this matcher will match a character c, 
 | |
|      *             where c & 0xFF == v
 | |
|      */
 | |
|     virtual UBool matchesIndexValue(uint8_t v) const override;
 | |
| 
 | |
|     /**
 | |
|      * Implement UnicodeMatcher
 | |
|      */
 | |
|     virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override;
 | |
| 
 | |
|     /**
 | |
|      * Implement UnicodeFunctor
 | |
|      */
 | |
|     virtual void setData(const TransliterationRuleData*) override;
 | |
| 
 | |
|     /**
 | |
|      * Replace characters in 'text' from 'start' to 'limit' with the
 | |
|      * output text of this object.  Update the 'cursor' parameter to
 | |
|      * give the cursor position and return the length of the
 | |
|      * replacement text.
 | |
|      *
 | |
|      * @param text the text to be matched
 | |
|      * @param start inclusive start index of text to be replaced
 | |
|      * @param limit exclusive end index of text to be replaced;
 | |
|      * must be greater than or equal to start
 | |
|      * @param cursor output parameter for the cursor position.
 | |
|      * Not all replacer objects will update this, but in a complete
 | |
|      * tree of replacer objects, representing the entire output side
 | |
|      * of a transliteration rule, at least one must update it.
 | |
|      * @return the number of 16-bit code units in the text replacing
 | |
|      * the characters at offsets start..(limit-1) in text
 | |
|      */
 | |
|     virtual int32_t replace(Replaceable& text,
 | |
|                             int32_t start,
 | |
|                             int32_t limit,
 | |
|                             int32_t& cursor) override;
 | |
| 
 | |
|     /**
 | |
|      * Returns a string representation of this replacer.  If the
 | |
|      * result of calling this function is passed to the appropriate
 | |
|      * parser, typically TransliteratorParser, it will produce another
 | |
|      * replacer that is equal to this one.
 | |
|      * @param result the string to receive the pattern.  Previous
 | |
|      * contents will be deleted.
 | |
|      * @param escapeUnprintable if true then convert unprintable
 | |
|      * character to their hex escape representations, \\uxxxx or
 | |
|      * \\Uxxxxxxxx.  Unprintable characters are defined by
 | |
|      * Utility.isUnprintable().
 | |
|      * @return a reference to 'result'.
 | |
|      */
 | |
|     virtual UnicodeString& toReplacerPattern(UnicodeString& result,
 | |
|                                              UBool escapeUnprintable) const override;
 | |
| 
 | |
|     /**
 | |
|      * Remove any match data.  This must be called before performing a
 | |
|      * set of matches with this segment.
 | |
|      */
 | |
|     void resetMatch();
 | |
| 
 | |
|     /**
 | |
|      * ICU "poor man's RTTI", returns a UClassID for the actual class.
 | |
|      */
 | |
|     virtual UClassID getDynamicClassID() const override;
 | |
| 
 | |
|     /**
 | |
|      * ICU "poor man's RTTI", returns a UClassID for this class.
 | |
|      */
 | |
|     static UClassID U_EXPORT2 getStaticClassID();
 | |
| 
 | |
|     /**
 | |
|      * Union the set of all characters that may output by this object
 | |
|      * into the given set.
 | |
|      * @param toUnionTo the set into which to union the output characters
 | |
|      */
 | |
|     virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const override;
 | |
| 
 | |
|  private:
 | |
| 
 | |
|     /**
 | |
|      * The text to be matched.
 | |
|      */
 | |
|     UnicodeString pattern;
 | |
| 
 | |
|     /**
 | |
|      * Context object that maps stand-ins to matcher and replacer
 | |
|      * objects.
 | |
|      */
 | |
|     const TransliterationRuleData* data;
 | |
| 
 | |
|     /**
 | |
|      * The segment number, 1-based, or 0 if not a segment.
 | |
|      */
 | |
|     int32_t segmentNumber;
 | |
| 
 | |
|     /**
 | |
|      * Start offset, in the match text, of the <em>rightmost</em>
 | |
|      * match.
 | |
|      */
 | |
|     int32_t matchStart;
 | |
| 
 | |
|     /**
 | |
|      * Limit offset, in the match text, of the <em>rightmost</em>
 | |
|      * match.
 | |
|      */
 | |
|     int32_t matchLimit;
 | |
| 
 | |
| };
 | |
| 
 | |
| U_NAMESPACE_END
 | |
| 
 | |
| #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 | |
| 
 | |
| #endif
 |