446 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			446 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			C++
		
	
	
	
| // © 2017 and later: Unicode, Inc. and others.
 | |
| // License & terms of use: http://www.unicode.org/copyright.html
 | |
| 
 | |
| #include "unicode/utypes.h"
 | |
| 
 | |
| #if !UCONFIG_NO_FORMATTING
 | |
| 
 | |
| #include "number_affixutils.h"
 | |
| #include "unicode/utf16.h"
 | |
| #include "unicode/uniset.h"
 | |
| 
 | |
| using namespace icu;
 | |
| using namespace icu::number;
 | |
| using namespace icu::number::impl;
 | |
| 
 | |
// Out-of-line definitions of the defaulted destructors for TokenConsumer and
// SymbolProvider, the two callback types used by AffixUtils further down in this file
// (consumeToken() and getSymbol() respectively).
// NOTE(review): presumably both are declared in number_affixutils.h -- confirm there.
| TokenConsumer::~TokenConsumer() = default;
 | |
| SymbolProvider::~SymbolProvider() = default;
 | |
| 
 | |
| int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) {
 | |
|     AffixPatternState state = STATE_BASE;
 | |
|     int32_t offset = 0;
 | |
|     int32_t length = 0;
 | |
|     for (; offset < patternString.length();) {
 | |
|         UChar32 cp = patternString.char32At(offset);
 | |
| 
 | |
|         switch (state) {
 | |
|             case STATE_BASE:
 | |
|                 if (cp == u'\'') {
 | |
|                     // First quote
 | |
|                     state = STATE_FIRST_QUOTE;
 | |
|                 } else {
 | |
|                     // Unquoted symbol
 | |
|                     length++;
 | |
|                 }
 | |
|                 break;
 | |
|             case STATE_FIRST_QUOTE:
 | |
|                 if (cp == u'\'') {
 | |
|                     // Repeated quote
 | |
|                     length++;
 | |
|                     state = STATE_BASE;
 | |
|                 } else {
 | |
|                     // Quoted code point
 | |
|                     length++;
 | |
|                     state = STATE_INSIDE_QUOTE;
 | |
|                 }
 | |
|                 break;
 | |
|             case STATE_INSIDE_QUOTE:
 | |
|                 if (cp == u'\'') {
 | |
|                     // End of quoted sequence
 | |
|                     state = STATE_AFTER_QUOTE;
 | |
|                 } else {
 | |
|                     // Quoted code point
 | |
|                     length++;
 | |
|                 }
 | |
|                 break;
 | |
|             case STATE_AFTER_QUOTE:
 | |
|                 if (cp == u'\'') {
 | |
|                     // Double quote inside of quoted sequence
 | |
|                     length++;
 | |
|                     state = STATE_INSIDE_QUOTE;
 | |
|                 } else {
 | |
|                     // Unquoted symbol
 | |
|                     length++;
 | |
|                 }
 | |
|                 break;
 | |
|             default:
 | |
|                 UPRV_UNREACHABLE_EXIT;
 | |
|         }
 | |
| 
 | |
|         offset += U16_LENGTH(cp);
 | |
|     }
 | |
| 
 | |
|     switch (state) {
 | |
|         case STATE_FIRST_QUOTE:
 | |
|         case STATE_INSIDE_QUOTE:
 | |
|             status = U_ILLEGAL_ARGUMENT_ERROR;
 | |
|             break;
 | |
|         default:
 | |
|             break;
 | |
|     }
 | |
| 
 | |
|     return length;
 | |
| }
 | |
| 
 | |
| UnicodeString AffixUtils::escape(const UnicodeString &input) {
 | |
|     AffixPatternState state = STATE_BASE;
 | |
|     int32_t offset = 0;
 | |
|     UnicodeString output;
 | |
|     for (; offset < input.length();) {
 | |
|         UChar32 cp = input.char32At(offset);
 | |
| 
 | |
|         switch (cp) {
 | |
|             case u'\'':
 | |
|                 output.append(u"''", -1);
 | |
|                 break;
 | |
| 
 | |
|             case u'-':
 | |
|             case u'+':
 | |
|             case u'%':
 | |
|             case u'‰':
 | |
|             case u'¤':
 | |
|                 if (state == STATE_BASE) {
 | |
|                     output.append(u'\'');
 | |
|                     output.append(cp);
 | |
|                     state = STATE_INSIDE_QUOTE;
 | |
|                 } else {
 | |
|                     output.append(cp);
 | |
|                 }
 | |
|                 break;
 | |
| 
 | |
|             default:
 | |
|                 if (state == STATE_INSIDE_QUOTE) {
 | |
|                     output.append(u'\'');
 | |
|                     output.append(cp);
 | |
|                     state = STATE_BASE;
 | |
|                 } else {
 | |
|                     output.append(cp);
 | |
|                 }
 | |
|                 break;
 | |
|         }
 | |
|         offset += U16_LENGTH(cp);
 | |
|     }
 | |
| 
 | |
|     if (state == STATE_INSIDE_QUOTE) {
 | |
|         output.append(u'\'');
 | |
|     }
 | |
| 
 | |
|     return output;
 | |
| }
 | |
| 
 | |
| Field AffixUtils::getFieldForType(AffixPatternType type) {
 | |
|     switch (type) {
 | |
|         case TYPE_MINUS_SIGN:
 | |
|             return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
 | |
|         case TYPE_PLUS_SIGN:
 | |
|             return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
 | |
|         case TYPE_APPROXIMATELY_SIGN:
 | |
|             // TODO: Introduce a new field for the approximately sign?
 | |
|             return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
 | |
|         case TYPE_PERCENT:
 | |
|             return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD};
 | |
|         case TYPE_PERMILLE:
 | |
|             return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD};
 | |
|         case TYPE_CURRENCY_SINGLE:
 | |
|             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
 | |
|         case TYPE_CURRENCY_DOUBLE:
 | |
|             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
 | |
|         case TYPE_CURRENCY_TRIPLE:
 | |
|             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
 | |
|         case TYPE_CURRENCY_QUAD:
 | |
|             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
 | |
|         case TYPE_CURRENCY_QUINT:
 | |
|             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
 | |
|         case TYPE_CURRENCY_OVERFLOW:
 | |
|             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
 | |
|         default:
 | |
|             UPRV_UNREACHABLE_EXIT;
 | |
|     }
 | |
| }
 | |
| 
 | |
| int32_t
 | |
| AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position,
 | |
|                      const SymbolProvider &provider, Field field, UErrorCode &status) {
 | |
|     int32_t length = 0;
 | |
|     AffixTag tag;
 | |
|     while (hasNext(tag, affixPattern)) {
 | |
|         tag = nextToken(tag, affixPattern, status);
 | |
|         if (U_FAILURE(status)) { return length; }
 | |
|         if (tag.type == TYPE_CURRENCY_OVERFLOW) {
 | |
|             // Don't go to the provider for this special case
 | |
|             length += output.insertCodePoint(
 | |
|                 position + length,
 | |
|                 0xFFFD,
 | |
|                 {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD},
 | |
|                 status);
 | |
|         } else if (tag.type < 0) {
 | |
|             length += output.insert(
 | |
|                     position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status);
 | |
|         } else {
 | |
|             length += output.insertCodePoint(position + length, tag.codePoint, field, status);
 | |
|         }
 | |
|     }
 | |
|     return length;
 | |
| }
 | |
| 
 | |
| int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern,
 | |
|                                             const SymbolProvider &provider, UErrorCode &status) {
 | |
|     int32_t length = 0;
 | |
|     AffixTag tag;
 | |
|     while (hasNext(tag, affixPattern)) {
 | |
|         tag = nextToken(tag, affixPattern, status);
 | |
|         if (U_FAILURE(status)) { return length; }
 | |
|         if (tag.type == TYPE_CURRENCY_OVERFLOW) {
 | |
|             length += 1;
 | |
|         } else if (tag.type < 0) {
 | |
|             length += provider.getSymbol(tag.type).length();
 | |
|         } else {
 | |
|             length += U16_LENGTH(tag.codePoint);
 | |
|         }
 | |
|     }
 | |
|     return length;
 | |
| }
 | |
| 
 | |
| bool
 | |
| AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) {
 | |
|     if (affixPattern.length() == 0) {
 | |
|         return false;
 | |
|     }
 | |
|     AffixTag tag;
 | |
|     while (hasNext(tag, affixPattern)) {
 | |
|         tag = nextToken(tag, affixPattern, status);
 | |
|         if (U_FAILURE(status)) { return false; }
 | |
|         if (tag.type == type) {
 | |
|             return true;
 | |
|         }
 | |
|     }
 | |
|     return false;
 | |
| }
 | |
| 
 | |
| bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) {
 | |
|     if (affixPattern.length() == 0) {
 | |
|         return false;
 | |
|     }
 | |
|     AffixTag tag;
 | |
|     while (hasNext(tag, affixPattern)) {
 | |
|         tag = nextToken(tag, affixPattern, status);
 | |
|         if (U_FAILURE(status)) { return false; }
 | |
|         if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) {
 | |
|             return true;
 | |
|         }
 | |
|     }
 | |
|     return false;
 | |
| }
 | |
| 
 | |
| UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type,
 | |
|                                       char16_t replacementChar, UErrorCode &status) {
 | |
|     UnicodeString output(affixPattern); // copy
 | |
|     if (affixPattern.length() == 0) {
 | |
|         return output;
 | |
|     }
 | |
|     AffixTag tag;
 | |
|     while (hasNext(tag, affixPattern)) {
 | |
|         tag = nextToken(tag, affixPattern, status);
 | |
|         if (U_FAILURE(status)) { return output; }
 | |
|         if (tag.type == type) {
 | |
|             output.replace(tag.offset - 1, 1, replacementChar);
 | |
|         }
 | |
|     }
 | |
|     return output;
 | |
| }
 | |
| 
 | |
| bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
 | |
|                                                   const UnicodeSet& ignorables, UErrorCode& status) {
 | |
|     if (affixPattern.length() == 0) {
 | |
|         return true;
 | |
|     }
 | |
|     AffixTag tag;
 | |
|     while (hasNext(tag, affixPattern)) {
 | |
|         tag = nextToken(tag, affixPattern, status);
 | |
|         if (U_FAILURE(status)) { return false; }
 | |
|         if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) {
 | |
|             return false;
 | |
|         }
 | |
|     }
 | |
|     return true;
 | |
| }
 | |
| 
 | |
| void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
 | |
|                                      UErrorCode& status) {
 | |
|     if (affixPattern.length() == 0) {
 | |
|         return;
 | |
|     }
 | |
|     AffixTag tag;
 | |
|     while (hasNext(tag, affixPattern)) {
 | |
|         tag = nextToken(tag, affixPattern, status);
 | |
|         if (U_FAILURE(status)) { return; }
 | |
|         consumer.consumeToken(tag.type, tag.codePoint, status);
 | |
|         if (U_FAILURE(status)) { return; }
 | |
|     }
 | |
| }
 | |
| 
 | |
| AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) {
 | |
|     int32_t offset = tag.offset;
 | |
|     int32_t state = tag.state;
 | |
|     for (; offset < patternString.length();) {
 | |
|         UChar32 cp = patternString.char32At(offset);
 | |
|         int32_t count = U16_LENGTH(cp);
 | |
| 
 | |
|         switch (state) {
 | |
|             case STATE_BASE:
 | |
|                 switch (cp) {
 | |
|                     case u'\'':
 | |
|                         state = STATE_FIRST_QUOTE;
 | |
|                         offset += count;
 | |
|                         // continue to the next code point
 | |
|                         break;
 | |
|                     case u'-':
 | |
|                         return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0);
 | |
|                     case u'+':
 | |
|                         return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0);
 | |
|                     case u'~':
 | |
|                         return makeTag(offset + count, TYPE_APPROXIMATELY_SIGN, STATE_BASE, 0);
 | |
|                     case u'%':
 | |
|                         return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0);
 | |
|                     case u'‰':
 | |
|                         return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0);
 | |
|                     case u'¤':
 | |
|                         state = STATE_FIRST_CURR;
 | |
|                         offset += count;
 | |
|                         // continue to the next code point
 | |
|                         break;
 | |
|                     default:
 | |
|                         return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
 | |
|                 }
 | |
|                 break;
 | |
|             case STATE_FIRST_QUOTE:
 | |
|                 if (cp == u'\'') {
 | |
|                     return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
 | |
|                 } else {
 | |
|                     return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
 | |
|                 }
 | |
|             case STATE_INSIDE_QUOTE:
 | |
|                 if (cp == u'\'') {
 | |
|                     state = STATE_AFTER_QUOTE;
 | |
|                     offset += count;
 | |
|                     // continue to the next code point
 | |
|                     break;
 | |
|                 } else {
 | |
|                     return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
 | |
|                 }
 | |
|             case STATE_AFTER_QUOTE:
 | |
|                 if (cp == u'\'') {
 | |
|                     return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
 | |
|                 } else {
 | |
|                     state = STATE_BASE;
 | |
|                     // re-evaluate this code point
 | |
|                     break;
 | |
|                 }
 | |
|             case STATE_FIRST_CURR:
 | |
|                 if (cp == u'¤') {
 | |
|                     state = STATE_SECOND_CURR;
 | |
|                     offset += count;
 | |
|                     // continue to the next code point
 | |
|                     break;
 | |
|                 } else {
 | |
|                     return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
 | |
|                 }
 | |
|             case STATE_SECOND_CURR:
 | |
|                 if (cp == u'¤') {
 | |
|                     state = STATE_THIRD_CURR;
 | |
|                     offset += count;
 | |
|                     // continue to the next code point
 | |
|                     break;
 | |
|                 } else {
 | |
|                     return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
 | |
|                 }
 | |
|             case STATE_THIRD_CURR:
 | |
|                 if (cp == u'¤') {
 | |
|                     state = STATE_FOURTH_CURR;
 | |
|                     offset += count;
 | |
|                     // continue to the next code point
 | |
|                     break;
 | |
|                 } else {
 | |
|                     return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
 | |
|                 }
 | |
|             case STATE_FOURTH_CURR:
 | |
|                 if (cp == u'¤') {
 | |
|                     state = STATE_FIFTH_CURR;
 | |
|                     offset += count;
 | |
|                     // continue to the next code point
 | |
|                     break;
 | |
|                 } else {
 | |
|                     return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
 | |
|                 }
 | |
|             case STATE_FIFTH_CURR:
 | |
|                 if (cp == u'¤') {
 | |
|                     state = STATE_OVERFLOW_CURR;
 | |
|                     offset += count;
 | |
|                     // continue to the next code point
 | |
|                     break;
 | |
|                 } else {
 | |
|                     return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
 | |
|                 }
 | |
|             case STATE_OVERFLOW_CURR:
 | |
|                 if (cp == u'¤') {
 | |
|                     offset += count;
 | |
|                     // continue to the next code point and loop back to this state
 | |
|                     break;
 | |
|                 } else {
 | |
|                     return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
 | |
|                 }
 | |
|             default:
 | |
|                 UPRV_UNREACHABLE_EXIT;
 | |
|         }
 | |
|     }
 | |
|     // End of string
 | |
|     switch (state) {
 | |
|         case STATE_BASE:
 | |
|             // No more tokens in string.
 | |
|             return {-1};
 | |
|         case STATE_FIRST_QUOTE:
 | |
|         case STATE_INSIDE_QUOTE:
 | |
|             // For consistent behavior with the JDK and ICU 58, set an error here.
 | |
|             status = U_ILLEGAL_ARGUMENT_ERROR;
 | |
|             return {-1};
 | |
|         case STATE_AFTER_QUOTE:
 | |
|             // No more tokens in string.
 | |
|             return {-1};
 | |
|         case STATE_FIRST_CURR:
 | |
|             return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
 | |
|         case STATE_SECOND_CURR:
 | |
|             return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
 | |
|         case STATE_THIRD_CURR:
 | |
|             return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
 | |
|         case STATE_FOURTH_CURR:
 | |
|             return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
 | |
|         case STATE_FIFTH_CURR:
 | |
|             return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
 | |
|         case STATE_OVERFLOW_CURR:
 | |
|             return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
 | |
|         default:
 | |
|             UPRV_UNREACHABLE_EXIT;
 | |
|     }
 | |
| }
 | |
| 
 | |
| bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) {
 | |
|     // First check for the {-1} and default initializer syntax.
 | |
|     if (tag.offset < 0) {
 | |
|         return false;
 | |
|     } else if (tag.offset == 0) {
 | |
|         return string.length() > 0;
 | |
|     }
 | |
|     // The rest of the fields are safe to use now.
 | |
|     // Special case: the last character in string is an end quote.
 | |
|     if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 &&
 | |
|         string.charAt(tag.offset) == u'\'') {
 | |
|         return false;
 | |
|     } else if (tag.state != STATE_BASE) {
 | |
|         return true;
 | |
|     } else {
 | |
|         return tag.offset < string.length();
 | |
|     }
 | |
| }
 | |
| 
 | |
| #endif /* #if !UCONFIG_NO_FORMATTING */
 |