294 lines
		
	
	
		
			9.3 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			294 lines
		
	
	
		
			9.3 KiB
		
	
	
	
		
			C++
		
	
	
	
| // © 2016 and later: Unicode, Inc. and others.
 | |
| // License & terms of use: http://www.unicode.org/copyright.html
 | |
| /*
 | |
|  **********************************************************************
 | |
|  *   Copyright (c) 2001-2011, International Business Machines
 | |
|  *   Corporation and others.  All Rights Reserved.
 | |
|  **********************************************************************
 | |
|  *   Date        Name        Description
 | |
|  *   11/19/2001  aliu        Creation.
 | |
|  **********************************************************************
 | |
|  */
 | |
| 
 | |
| #include "unicode/utypes.h"
 | |
| 
 | |
| #if !UCONFIG_NO_TRANSLITERATION
 | |
| 
 | |
| #include "unicode/uchar.h"
 | |
| #include "unicode/utf16.h"
 | |
| #include "unesctrn.h"
 | |
| #include "util.h"
 | |
| 
 | |
| #include "cmemory.h"
 | |
| 
 | |
| U_NAMESPACE_BEGIN
 | |
| 
 | |
| /**
 | |
|  * Special character marking the end of the spec[] array.
 | |
|  */
 | |
| static const UChar END = 0xFFFF;
 | |
| 
 | |
| // Unicode: "U+10FFFF" hex, min=4, max=6
 | |
| static const UChar SPEC_Unicode[] = {
 | |
|     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
 | |
|     END
 | |
| };
 | |
| 
 | |
| // Java: "\\uFFFF" hex, min=4, max=4
 | |
| static const UChar SPEC_Java[] = {
 | |
|     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
 | |
|     END
 | |
| };
 | |
| 
 | |
| // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
 | |
| static const UChar SPEC_C[] = {
 | |
|     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
 | |
|     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
 | |
|     END
 | |
| };
 | |
| 
 | |
| // XML: "" hex, min=1, max=6
 | |
| static const UChar SPEC_XML[] = {
 | |
|     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
 | |
|     END
 | |
| };
 | |
| 
 | |
| // XML10: "" dec, min=1, max=7 (not really "Hex-Any")
 | |
| static const UChar SPEC_XML10[] = {
 | |
|     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
 | |
|     END
 | |
| };
 | |
| 
 | |
| // Perl: "\\x{263A}" hex, min=1, max=6
 | |
| static const UChar SPEC_Perl[] = {
 | |
|     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
 | |
|     END
 | |
| };
 | |
| 
 | |
| // All: Java, C, Perl, XML, XML10, Unicode
 | |
| static const UChar SPEC_Any[] = {
 | |
|     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
 | |
|     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
 | |
|     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
 | |
|     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
 | |
|     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
 | |
|     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
 | |
|     END
 | |
| };
 | |
| 
 | |
| UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
 | |
| 
 | |
| static UChar* copySpec(const UChar* spec) {
 | |
|     int32_t len = 0;
 | |
|     while (spec[len] != END) {
 | |
|         ++len;
 | |
|     }
 | |
|     ++len;
 | |
|     UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
 | |
|     // Check for memory allocation error. 
 | |
|     if (result != NULL) {
 | |
|     	uprv_memcpy(result, spec, (size_t)len*sizeof(result[0]));
 | |
|     }
 | |
|     return result;
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Factory methods.  Ignore the context.
 | |
|  */
 | |
| static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
 | |
|     return new UnescapeTransliterator(ID, SPEC_Unicode);
 | |
| }
 | |
| static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
 | |
|     return new UnescapeTransliterator(ID, SPEC_Java);
 | |
| }
 | |
| static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
 | |
|     return new UnescapeTransliterator(ID, SPEC_C);
 | |
| }
 | |
| static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
 | |
|     return new UnescapeTransliterator(ID, SPEC_XML);
 | |
| }
 | |
| static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
 | |
|     return new UnescapeTransliterator(ID, SPEC_XML10);
 | |
| }
 | |
| static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
 | |
|     return new UnescapeTransliterator(ID, SPEC_Perl);
 | |
| }
 | |
| static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
 | |
|     return new UnescapeTransliterator(ID, SPEC_Any);
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Registers standard variants with the system.  Called by
 | |
|  * Transliterator during initialization.
 | |
|  */
 | |
| void UnescapeTransliterator::registerIDs() {
 | |
|     Token t = integerToken(0);
 | |
| 
 | |
|     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
 | |
| 
 | |
|     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
 | |
| 
 | |
|     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
 | |
| 
 | |
|     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
 | |
| 
 | |
|     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
 | |
| 
 | |
|     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
 | |
| 
 | |
|     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Constructor.  Takes the encoded spec array.
 | |
|  */
 | |
| UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
 | |
|                                                const UChar *newSpec) :
 | |
|     Transliterator(newID, NULL)
 | |
| {
 | |
|     this->spec = copySpec(newSpec);
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Copy constructor.
 | |
|  */
 | |
| UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
 | |
|     Transliterator(o) {
 | |
|     this->spec = copySpec(o.spec);
 | |
| }
 | |
| 
 | |
| UnescapeTransliterator::~UnescapeTransliterator() {
 | |
|     uprv_free(spec);
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Transliterator API.
 | |
|  */
 | |
| UnescapeTransliterator* UnescapeTransliterator::clone() const {
 | |
|     return new UnescapeTransliterator(*this);
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Implements {@link Transliterator#handleTransliterate}.
 | |
|  */
 | |
| void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
 | |
|                                                  UBool isIncremental) const {
 | |
|     int32_t start = pos.start;
 | |
|     int32_t limit = pos.limit;
 | |
|     int32_t i, j, ipat;
 | |
| 
 | |
|     while (start < limit) {
 | |
|         // Loop over the forms in spec[].  Exit this loop when we
 | |
|         // match one of the specs.  Exit the outer loop if a
 | |
|         // partial match is detected and isIncremental is true.
 | |
|         for (j=0, ipat=0; spec[ipat] != END; ++j) {
 | |
| 
 | |
|             // Read the header
 | |
|             int32_t prefixLen = spec[ipat++];
 | |
|             int32_t suffixLen = spec[ipat++];
 | |
|             int8_t  radix     = (int8_t) spec[ipat++];
 | |
|             int32_t minDigits = spec[ipat++];
 | |
|             int32_t maxDigits = spec[ipat++];
 | |
| 
 | |
|             // s is a copy of start that is advanced over the
 | |
|             // characters as we parse them.
 | |
|             int32_t s = start;
 | |
|             UBool match = TRUE;
 | |
| 
 | |
|             for (i=0; i<prefixLen; ++i) {
 | |
|                 if (s >= limit) {
 | |
|                     if (i > 0) {
 | |
|                         // We've already matched a character.  This is
 | |
|                         // a partial match, so we return if in
 | |
|                         // incremental mode.  In non-incremental mode,
 | |
|                         // go to the next spec.
 | |
|                         if (isIncremental) {
 | |
|                             goto exit;
 | |
|                         }
 | |
|                         match = FALSE;
 | |
|                         break;
 | |
|                     }
 | |
|                 }
 | |
|                 UChar c = text.charAt(s++);
 | |
|                 if (c != spec[ipat + i]) {
 | |
|                     match = FALSE;
 | |
|                     break;
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             if (match) {
 | |
|                 UChar32 u = 0;
 | |
|                 int32_t digitCount = 0;
 | |
|                 for (;;) {
 | |
|                     if (s >= limit) {
 | |
|                         // Check for partial match in incremental mode.
 | |
|                         if (s > start && isIncremental) {
 | |
|                             goto exit;
 | |
|                         }
 | |
|                         break;
 | |
|                     }
 | |
|                     UChar32 ch = text.char32At(s);
 | |
|                     int32_t digit = u_digit(ch, radix);
 | |
|                     if (digit < 0) {
 | |
|                         break;
 | |
|                     }
 | |
|                     s += U16_LENGTH(ch);
 | |
|                     u = (u * radix) + digit;
 | |
|                     if (++digitCount == maxDigits) {
 | |
|                         break;
 | |
|                     }
 | |
|                 }
 | |
| 
 | |
|                 match = (digitCount >= minDigits);
 | |
| 
 | |
|                 if (match) {
 | |
|                     for (i=0; i<suffixLen; ++i) {
 | |
|                         if (s >= limit) {
 | |
|                             // Check for partial match in incremental mode.
 | |
|                             if (s > start && isIncremental) {
 | |
|                                 goto exit;
 | |
|                             }
 | |
|                             match = FALSE;
 | |
|                             break;
 | |
|                         }
 | |
|                         UChar c = text.charAt(s++);
 | |
|                         if (c != spec[ipat + prefixLen + i]) {
 | |
|                             match = FALSE;
 | |
|                             break;
 | |
|                         }
 | |
|                     }
 | |
| 
 | |
|                     if (match) {
 | |
|                         // At this point, we have a match
 | |
|                         UnicodeString str(u);
 | |
|                         text.handleReplaceBetween(start, s, str);
 | |
|                         limit -= s - start - str.length();
 | |
|                         // The following break statement leaves the
 | |
|                         // loop that is traversing the forms in
 | |
|                         // spec[].  We then parse the next input
 | |
|                         // character.
 | |
|                         break;
 | |
|                     }
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             ipat += prefixLen + suffixLen;
 | |
|         }
 | |
| 
 | |
|         if (start < limit) {
 | |
|             start += U16_LENGTH(text.char32At(start));
 | |
|         }
 | |
|     }
 | |
| 
 | |
|   exit:
 | |
|     pos.contextLimit += limit - pos.limit;
 | |
|     pos.limit = limit;
 | |
|     pos.start = start;
 | |
| }
 | |
| 
 | |
| U_NAMESPACE_END
 | |
| 
 | |
| #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 | |
| 
 | |
| //eof
 |