297 lines
		
	
	
		
			8.6 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			297 lines
		
	
	
		
			8.6 KiB
		
	
	
	
		
			C++
		
	
	
	
| // © 2016 and later: Unicode, Inc. and others.
 | |
| // License & terms of use: http://www.unicode.org/copyright.html
 | |
| /*
 | |
| **********************************************************************
 | |
| *   Copyright (c) 2001-2012, International Business Machines Corporation
 | |
| *   and others.  All Rights Reserved.
 | |
| **********************************************************************
 | |
| *   Date        Name        Description
 | |
| *   07/23/01    aliu        Creation.
 | |
| **********************************************************************
 | |
| */
 | |
| 
 | |
| #include "unicode/utypes.h"
 | |
| 
 | |
| #if !UCONFIG_NO_TRANSLITERATION
 | |
| 
 | |
| #include "strmatch.h"
 | |
| #include "rbt_data.h"
 | |
| #include "util.h"
 | |
| #include "unicode/uniset.h"
 | |
| #include "unicode/utf16.h"
 | |
| 
 | |
| U_NAMESPACE_BEGIN
 | |
| 
 | |
| UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
 | |
| 
 | |
| StringMatcher::StringMatcher(const UnicodeString& theString,
 | |
|                              int32_t start,
 | |
|                              int32_t limit,
 | |
|                              int32_t segmentNum,
 | |
|                              const TransliterationRuleData& theData) :
 | |
|     data(&theData),
 | |
|     segmentNumber(segmentNum),
 | |
|     matchStart(-1),
 | |
|     matchLimit(-1)
 | |
| {
 | |
|     theString.extractBetween(start, limit, pattern);
 | |
| }
 | |
| 
 | |
| StringMatcher::StringMatcher(const StringMatcher& o) :
 | |
|     UnicodeFunctor(o),
 | |
|     UnicodeMatcher(o),
 | |
|     UnicodeReplacer(o),
 | |
|     pattern(o.pattern),
 | |
|     data(o.data),
 | |
|     segmentNumber(o.segmentNumber),
 | |
|     matchStart(o.matchStart),
 | |
|     matchLimit(o.matchLimit)
 | |
| {
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Destructor
 | |
|  */
 | |
| StringMatcher::~StringMatcher() {
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Implement UnicodeFunctor
 | |
|  */
 | |
| StringMatcher* StringMatcher::clone() const {
 | |
|     return new StringMatcher(*this);
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
 | |
|  * and return the pointer.
 | |
|  */
 | |
| UnicodeMatcher* StringMatcher::toMatcher() const {
 | |
|   StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
 | |
|   UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this);
 | |
|   
 | |
|   return nonconst_base;
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
 | |
|  * and return the pointer.
 | |
|  */
 | |
| UnicodeReplacer* StringMatcher::toReplacer() const {
 | |
|   StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
 | |
|   UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this);
 | |
|   
 | |
|   return nonconst_base;
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Implement UnicodeMatcher
 | |
|  */
 | |
| UMatchDegree StringMatcher::matches(const Replaceable& text,
 | |
|                                     int32_t& offset,
 | |
|                                     int32_t limit,
 | |
|                                     UBool incremental) {
 | |
|     int32_t i;
 | |
|     int32_t cursor = offset;
 | |
|     if (limit < cursor) {
 | |
|         // Match in the reverse direction
 | |
|         for (i=pattern.length()-1; i>=0; --i) {
 | |
|             UChar keyChar = pattern.charAt(i);
 | |
|             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
 | |
|             if (subm == 0) {
 | |
|                 if (cursor > limit &&
 | |
|                     keyChar == text.charAt(cursor)) {
 | |
|                     --cursor;
 | |
|                 } else {
 | |
|                     return U_MISMATCH;
 | |
|                 }
 | |
|             } else {
 | |
|                 UMatchDegree m =
 | |
|                     subm->matches(text, cursor, limit, incremental);
 | |
|                 if (m != U_MATCH) {
 | |
|                     return m;
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|         // Record the match position, but adjust for a normal
 | |
|         // forward start, limit, and only if a prior match does not
 | |
|         // exist -- we want the rightmost match.
 | |
|         if (matchStart < 0) {
 | |
|             matchStart = cursor+1;
 | |
|             matchLimit = offset+1;
 | |
|         }
 | |
|     } else {
 | |
|         for (i=0; i<pattern.length(); ++i) {
 | |
|             if (incremental && cursor == limit) {
 | |
|                 // We've reached the context limit without a mismatch and
 | |
|                 // without completing our match.
 | |
|                 return U_PARTIAL_MATCH;
 | |
|             }
 | |
|             UChar keyChar = pattern.charAt(i);
 | |
|             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
 | |
|             if (subm == 0) {
 | |
|                 // Don't need the cursor < limit check if
 | |
|                 // incremental is TRUE (because it's done above); do need
 | |
|                 // it otherwise.
 | |
|                 if (cursor < limit &&
 | |
|                     keyChar == text.charAt(cursor)) {
 | |
|                     ++cursor;
 | |
|                 } else {
 | |
|                     return U_MISMATCH;
 | |
|                 }
 | |
|             } else {
 | |
|                 UMatchDegree m =
 | |
|                     subm->matches(text, cursor, limit, incremental);
 | |
|                 if (m != U_MATCH) {
 | |
|                     return m;
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|         // Record the match position
 | |
|         matchStart = offset;
 | |
|         matchLimit = cursor;
 | |
|     }
 | |
| 
 | |
|     offset = cursor;
 | |
|     return U_MATCH;
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Implement UnicodeMatcher
 | |
|  */
 | |
| UnicodeString& StringMatcher::toPattern(UnicodeString& result,
 | |
|                                         UBool escapeUnprintable) const
 | |
| {
 | |
|     result.truncate(0);
 | |
|     UnicodeString str, quoteBuf;
 | |
|     if (segmentNumber > 0) {
 | |
|         result.append((UChar)40); /*(*/
 | |
|     }
 | |
|     for (int32_t i=0; i<pattern.length(); ++i) {
 | |
|         UChar keyChar = pattern.charAt(i);
 | |
|         const UnicodeMatcher* m = data->lookupMatcher(keyChar);
 | |
|         if (m == 0) {
 | |
|             ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
 | |
|         } else {
 | |
|             ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
 | |
|                          TRUE, escapeUnprintable, quoteBuf);
 | |
|         }
 | |
|     }
 | |
|     if (segmentNumber > 0) {
 | |
|         result.append((UChar)41); /*)*/
 | |
|     }
 | |
|     // Flush quoteBuf out to result
 | |
|     ICU_Utility::appendToRule(result, -1,
 | |
|                               TRUE, escapeUnprintable, quoteBuf);
 | |
|     return result;
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Implement UnicodeMatcher
 | |
|  */
 | |
| UBool StringMatcher::matchesIndexValue(uint8_t v) const {
 | |
|     if (pattern.length() == 0) {
 | |
|         return TRUE;
 | |
|     }
 | |
|     UChar32 c = pattern.char32At(0);
 | |
|     const UnicodeMatcher *m = data->lookupMatcher(c);
 | |
|     return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Implement UnicodeMatcher
 | |
|  */
 | |
| void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
 | |
|     UChar32 ch;
 | |
|     for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
 | |
|         ch = pattern.char32At(i);
 | |
|         const UnicodeMatcher* matcher = data->lookupMatcher(ch);
 | |
|         if (matcher == NULL) {
 | |
|             toUnionTo.add(ch);
 | |
|         } else {
 | |
|             matcher->addMatchSetTo(toUnionTo);
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * UnicodeReplacer API
 | |
|  */
 | |
| int32_t StringMatcher::replace(Replaceable& text,
 | |
|                                int32_t start,
 | |
|                                int32_t limit,
 | |
|                                int32_t& /*cursor*/) {
 | |
|     
 | |
|     int32_t outLen = 0;
 | |
|     
 | |
|     // Copy segment with out-of-band data
 | |
|     int32_t dest = limit;
 | |
|     // If there was no match, that means that a quantifier
 | |
|     // matched zero-length.  E.g., x (a)* y matched "xy".
 | |
|     if (matchStart >= 0) {
 | |
|         if (matchStart != matchLimit) {
 | |
|             text.copy(matchStart, matchLimit, dest);
 | |
|             outLen = matchLimit - matchStart;
 | |
|         }
 | |
|     }
 | |
|     
 | |
|     text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
 | |
|     
 | |
|     return outLen;
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * UnicodeReplacer API
 | |
|  */
 | |
| UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
 | |
|                                                 UBool /*escapeUnprintable*/) const {
 | |
|     // assert(segmentNumber > 0);
 | |
|     rule.truncate(0);
 | |
|     rule.append((UChar)0x0024 /*$*/);
 | |
|     ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
 | |
|     return rule;
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Remove any match info.  This must be called before performing a
 | |
|  * set of matches with this segment.
 | |
|  */
 | |
|  void StringMatcher::resetMatch() {
 | |
|     matchStart = matchLimit = -1;
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Union the set of all characters that may output by this object
 | |
|  * into the given set.
 | |
|  * @param toUnionTo the set into which to union the output characters
 | |
|  */
 | |
| void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
 | |
|     // The output of this replacer varies; it is the source text between
 | |
|     // matchStart and matchLimit.  Since this varies depending on the
 | |
|     // input text, we can't compute it here.  We can either do nothing
 | |
|     // or we can add ALL characters to the set.  It's probably more useful
 | |
|     // to do nothing.
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Implement UnicodeFunctor
 | |
|  */
 | |
| void StringMatcher::setData(const TransliterationRuleData* d) {
 | |
|     data = d;
 | |
|     int32_t i = 0;
 | |
|     while (i<pattern.length()) {
 | |
|         UChar32 c = pattern.char32At(i);
 | |
|         UnicodeFunctor* f = data->lookup(c);
 | |
|         if (f != NULL) {
 | |
|             f->setData(data);
 | |
|         }
 | |
|         i += U16_LENGTH(c);
 | |
|     }
 | |
| }
 | |
| 
 | |
| U_NAMESPACE_END
 | |
| 
 | |
| #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 | |
| 
 | |
| //eof
 |