460 lines
		
	
	
		
			17 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			460 lines
		
	
	
		
			17 KiB
		
	
	
	
		
			C++
		
	
	
	
| // © 2018 and later: Unicode, Inc. and others.
 | |
| // License & terms of use: http://www.unicode.org/copyright.html
 | |
| 
 | |
| #include "unicode/utypes.h"
 | |
| 
 | |
| #if !UCONFIG_NO_FORMATTING
 | |
| 
 | |
| // Allow implicit conversion from char16_t* to UnicodeString for this file:
 | |
| // Helpful in toString methods and elsewhere.
 | |
| #define UNISTR_FROM_STRING_EXPLICIT
 | |
| 
 | |
| #include "numparse_types.h"
 | |
| #include "numparse_decimal.h"
 | |
| #include "static_unicode_sets.h"
 | |
| #include "numparse_utils.h"
 | |
| #include "unicode/uchar.h"
 | |
| #include "putilimp.h"
 | |
| #include "number_decimalquantity.h"
 | |
| #include "string_segment.h"
 | |
| 
 | |
| using namespace icu;
 | |
| using namespace icu::numparse;
 | |
| using namespace icu::numparse::impl;
 | |
| 
 | |
| 
 | |
| DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Grouper& grouper,
 | |
|                                parse_flags_t parseFlags) {
 | |
|     if (0 != (parseFlags & PARSE_FLAG_MONETARY_SEPARATORS)) {
 | |
|         groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol);
 | |
|         decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol);
 | |
|     } else {
 | |
|         groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol);
 | |
|         decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol);
 | |
|     }
 | |
|     bool strictSeparators = 0 != (parseFlags & PARSE_FLAG_STRICT_SEPARATORS);
 | |
|     unisets::Key groupingKey = strictSeparators ? unisets::STRICT_ALL_SEPARATORS
 | |
|                                                 : unisets::ALL_SEPARATORS;
 | |
| 
 | |
|     // Attempt to find separators in the static cache
 | |
| 
 | |
|     groupingUniSet = unisets::get(groupingKey);
 | |
|     unisets::Key decimalKey = unisets::chooseFrom(
 | |
|             decimalSeparator,
 | |
|             strictSeparators ? unisets::STRICT_COMMA : unisets::COMMA,
 | |
|             strictSeparators ? unisets::STRICT_PERIOD : unisets::PERIOD);
 | |
|     if (decimalKey >= 0) {
 | |
|         decimalUniSet = unisets::get(decimalKey);
 | |
|     } else if (!decimalSeparator.isEmpty()) {
 | |
|         auto* set = new UnicodeSet();
 | |
|         set->add(decimalSeparator.char32At(0));
 | |
|         set->freeze();
 | |
|         decimalUniSet = set;
 | |
|         fLocalDecimalUniSet.adoptInstead(set);
 | |
|     } else {
 | |
|         decimalUniSet = unisets::get(unisets::EMPTY);
 | |
|     }
 | |
| 
 | |
|     if (groupingKey >= 0 && decimalKey >= 0) {
 | |
|         // Everything is available in the static cache
 | |
|         separatorSet = groupingUniSet;
 | |
|         leadSet = unisets::get(
 | |
|                 strictSeparators ? unisets::DIGITS_OR_ALL_SEPARATORS
 | |
|                                  : unisets::DIGITS_OR_STRICT_ALL_SEPARATORS);
 | |
|     } else {
 | |
|         auto* set = new UnicodeSet();
 | |
|         set->addAll(*groupingUniSet);
 | |
|         set->addAll(*decimalUniSet);
 | |
|         set->freeze();
 | |
|         separatorSet = set;
 | |
|         fLocalSeparatorSet.adoptInstead(set);
 | |
|         leadSet = nullptr;
 | |
|     }
 | |
| 
 | |
|     UChar32 cpZero = symbols.getCodePointZero();
 | |
|     if (cpZero == -1 || !u_isdigit(cpZero) || u_digit(cpZero, 10) != 0) {
 | |
|         // Uncommon case: okay to allocate.
 | |
|         auto digitStrings = new UnicodeString[10];
 | |
|         fLocalDigitStrings.adoptInstead(digitStrings);
 | |
|         for (int32_t i = 0; i <= 9; i++) {
 | |
|             digitStrings[i] = symbols.getConstDigitSymbol(i);
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     requireGroupingMatch = 0 != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE);
 | |
|     groupingDisabled = 0 != (parseFlags & PARSE_FLAG_GROUPING_DISABLED);
 | |
|     integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY);
 | |
|     grouping1 = grouper.getPrimary();
 | |
|     grouping2 = grouper.getSecondary();
 | |
| 
 | |
|     // Fraction grouping parsing is disabled for now but could be enabled later.
 | |
|     // See https://unicode-org.atlassian.net/browse/ICU-10794
 | |
|     // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
 | |
| }
 | |
| 
 | |
| bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
 | |
|     return match(segment, result, 0, status);
 | |
| }
 | |
| 
 | |
| bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign,
 | |
|                            UErrorCode&) const {
 | |
|     if (result.seenNumber() && exponentSign == 0) {
 | |
|         // A number has already been consumed.
 | |
|         return false;
 | |
|     } else if (exponentSign != 0) {
 | |
|         // scientific notation always comes after the number
 | |
|         U_ASSERT(!result.quantity.bogus);
 | |
|     }
 | |
| 
 | |
|     // Initial offset before any character consumption.
 | |
|     int32_t initialOffset = segment.getOffset();
 | |
| 
 | |
|     // Return value: whether to ask for more characters.
 | |
|     bool maybeMore = false;
 | |
| 
 | |
|     // All digits consumed so far.
 | |
|     number::impl::DecimalQuantity digitsConsumed;
 | |
|     digitsConsumed.bogus = true;
 | |
| 
 | |
|     // The total number of digits after the decimal place, used for scaling the result.
 | |
|     int32_t digitsAfterDecimalPlace = 0;
 | |
| 
 | |
|     // The actual grouping and decimal separators used in the string.
 | |
|     // If non-null, we have seen that token.
 | |
|     UnicodeString actualGroupingString;
 | |
|     UnicodeString actualDecimalString;
 | |
|     actualGroupingString.setToBogus();
 | |
|     actualDecimalString.setToBogus();
 | |
| 
 | |
|     // Information for two groups: the previous group and the current group.
 | |
|     //
 | |
|     // Each group has three pieces of information:
 | |
|     //
 | |
|     // Offset: the string position of the beginning of the group, including a leading separator
 | |
|     // if there was a leading separator. This is needed in case we need to rewind the parse to
 | |
|     // that position.
 | |
|     //
 | |
|     // Separator type:
 | |
|     // 0 => beginning of string
 | |
|     // 1 => lead separator is a grouping separator
 | |
|     // 2 => lead separator is a decimal separator
 | |
|     //
 | |
|     // Count: the number of digits in the group. If -1, the group has been validated.
 | |
|     int32_t currGroupOffset = 0;
 | |
|     int32_t currGroupSepType = 0;
 | |
|     int32_t currGroupCount = 0;
 | |
|     int32_t prevGroupOffset = -1;
 | |
|     int32_t prevGroupSepType = -1;
 | |
|     int32_t prevGroupCount = -1;
 | |
| 
 | |
|     while (segment.length() > 0) {
 | |
|         maybeMore = false;
 | |
| 
 | |
|         // Attempt to match a digit.
 | |
|         int8_t digit = -1;
 | |
| 
 | |
|         // Try by code point digit value.
 | |
|         UChar32 cp = segment.getCodePoint();
 | |
|         if (u_isdigit(cp)) {
 | |
|             segment.adjustOffset(U16_LENGTH(cp));
 | |
|             digit = static_cast<int8_t>(u_digit(cp, 10));
 | |
|         }
 | |
| 
 | |
|         // Try by digit string.
 | |
|         if (digit == -1 && !fLocalDigitStrings.isNull()) {
 | |
|             for (int32_t i = 0; i < 10; i++) {
 | |
|                 const UnicodeString& str = fLocalDigitStrings[i];
 | |
|                 if (str.isEmpty()) {
 | |
|                     continue;
 | |
|                 }
 | |
|                 int32_t overlap = segment.getCommonPrefixLength(str);
 | |
|                 if (overlap == str.length()) {
 | |
|                     segment.adjustOffset(overlap);
 | |
|                     digit = static_cast<int8_t>(i);
 | |
|                     break;
 | |
|                 }
 | |
|                 maybeMore = maybeMore || (overlap == segment.length());
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         if (digit >= 0) {
 | |
|             // Digit was found.
 | |
|             if (digitsConsumed.bogus) {
 | |
|                 digitsConsumed.bogus = false;
 | |
|                 digitsConsumed.clear();
 | |
|             }
 | |
|             digitsConsumed.appendDigit(digit, 0, true);
 | |
|             currGroupCount++;
 | |
|             if (!actualDecimalString.isBogus()) {
 | |
|                 digitsAfterDecimalPlace++;
 | |
|             }
 | |
|             continue;
 | |
|         }
 | |
| 
 | |
|         // Attempt to match a literal grouping or decimal separator.
 | |
|         bool isDecimal = false;
 | |
|         bool isGrouping = false;
 | |
| 
 | |
|         // 1) Attempt the decimal separator string literal.
 | |
|         // if (we have not seen a decimal separator yet) { ... }
 | |
|         if (actualDecimalString.isBogus() && !decimalSeparator.isEmpty()) {
 | |
|             int32_t overlap = segment.getCommonPrefixLength(decimalSeparator);
 | |
|             maybeMore = maybeMore || (overlap == segment.length());
 | |
|             if (overlap == decimalSeparator.length()) {
 | |
|                 isDecimal = true;
 | |
|                 actualDecimalString = decimalSeparator;
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         // 2) Attempt to match the actual grouping string literal.
 | |
|         if (!actualGroupingString.isBogus()) {
 | |
|             int32_t overlap = segment.getCommonPrefixLength(actualGroupingString);
 | |
|             maybeMore = maybeMore || (overlap == segment.length());
 | |
|             if (overlap == actualGroupingString.length()) {
 | |
|                 isGrouping = true;
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         // 2.5) Attempt to match a new the grouping separator string literal.
 | |
|         // if (we have not seen a grouping or decimal separator yet) { ... }
 | |
|         if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus() &&
 | |
|             !groupingSeparator.isEmpty()) {
 | |
|             int32_t overlap = segment.getCommonPrefixLength(groupingSeparator);
 | |
|             maybeMore = maybeMore || (overlap == segment.length());
 | |
|             if (overlap == groupingSeparator.length()) {
 | |
|                 isGrouping = true;
 | |
|                 actualGroupingString = groupingSeparator;
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         // 3) Attempt to match a decimal separator from the equivalence set.
 | |
|         // if (we have not seen a decimal separator yet) { ... }
 | |
|         // The !isGrouping is to confirm that we haven't yet matched the current character.
 | |
|         if (!isGrouping && actualDecimalString.isBogus()) {
 | |
|             if (decimalUniSet->contains(cp)) {
 | |
|                 isDecimal = true;
 | |
|                 actualDecimalString = UnicodeString(cp);
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         // 4) Attempt to match a grouping separator from the equivalence set.
 | |
|         // if (we have not seen a grouping or decimal separator yet) { ... }
 | |
|         if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
 | |
|             if (groupingUniSet->contains(cp)) {
 | |
|                 isGrouping = true;
 | |
|                 actualGroupingString = UnicodeString(cp);
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         // Leave if we failed to match this as a separator.
 | |
|         if (!isDecimal && !isGrouping) {
 | |
|             break;
 | |
|         }
 | |
| 
 | |
|         // Check for conditions when we don't want to accept the separator.
 | |
|         if (isDecimal && integerOnly) {
 | |
|             break;
 | |
|         } else if (currGroupSepType == 2 && isGrouping) {
 | |
|             // Fraction grouping
 | |
|             break;
 | |
|         }
 | |
| 
 | |
|         // Validate intermediate grouping sizes.
 | |
|         bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
 | |
|         bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
 | |
|         if (!prevValidSecondary || (isDecimal && !currValidPrimary)) {
 | |
|             // Invalid grouping sizes.
 | |
|             if (isGrouping && currGroupCount == 0) {
 | |
|                 // Trailing grouping separators: these are taken care of below
 | |
|                 U_ASSERT(currGroupSepType == 1);
 | |
|             } else if (requireGroupingMatch) {
 | |
|                 // Strict mode: reject the parse
 | |
|                 digitsConsumed.clear();
 | |
|                 digitsConsumed.bogus = true;
 | |
|             }
 | |
|             break;
 | |
|         } else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) {
 | |
|             break;
 | |
|         } else {
 | |
|             // Grouping sizes OK so far.
 | |
|             prevGroupOffset = currGroupOffset;
 | |
|             prevGroupCount = currGroupCount;
 | |
|             if (isDecimal) {
 | |
|                 // Do not validate this group any more.
 | |
|                 prevGroupSepType = -1;
 | |
|             } else {
 | |
|                 prevGroupSepType = currGroupSepType;
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         // OK to accept the separator.
 | |
|         // Special case: don't update currGroup if it is empty; this allows two grouping
 | |
|         // separators in a row in lenient mode.
 | |
|         if (currGroupCount != 0) {
 | |
|             currGroupOffset = segment.getOffset();
 | |
|         }
 | |
|         currGroupSepType = isGrouping ? 1 : 2;
 | |
|         currGroupCount = 0;
 | |
|         if (isGrouping) {
 | |
|             segment.adjustOffset(actualGroupingString.length());
 | |
|         } else {
 | |
|             segment.adjustOffset(actualDecimalString.length());
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     // End of main loop.
 | |
|     // Back up if there was a trailing grouping separator.
 | |
|     // Shift prev -> curr so we can check it as a final group.
 | |
|     if (currGroupSepType != 2 && currGroupCount == 0) {
 | |
|         maybeMore = true;
 | |
|         segment.setOffset(currGroupOffset);
 | |
|         currGroupOffset = prevGroupOffset;
 | |
|         currGroupSepType = prevGroupSepType;
 | |
|         currGroupCount = prevGroupCount;
 | |
|         prevGroupOffset = -1;
 | |
|         prevGroupSepType = 0;
 | |
|         prevGroupCount = 1;
 | |
|     }
 | |
| 
 | |
|     // Validate final grouping sizes.
 | |
|     bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
 | |
|     bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
 | |
|     if (!requireGroupingMatch) {
 | |
|         // The cases we need to handle here are lone digits.
 | |
|         // Examples: "1,1"  "1,1,"  "1,1,1"  "1,1,1,"  ",1" (all parse as 1)
 | |
|         // See more examples in numberformattestspecification.txt
 | |
|         int32_t digitsToRemove = 0;
 | |
|         if (!prevValidSecondary) {
 | |
|             segment.setOffset(prevGroupOffset);
 | |
|             digitsToRemove += prevGroupCount;
 | |
|             digitsToRemove += currGroupCount;
 | |
|         } else if (!currValidPrimary && (prevGroupSepType != 0 || prevGroupCount != 0)) {
 | |
|             maybeMore = true;
 | |
|             segment.setOffset(currGroupOffset);
 | |
|             digitsToRemove += currGroupCount;
 | |
|         }
 | |
|         if (digitsToRemove != 0) {
 | |
|             digitsConsumed.adjustMagnitude(-digitsToRemove);
 | |
|             digitsConsumed.truncate();
 | |
|         }
 | |
|         prevValidSecondary = true;
 | |
|         currValidPrimary = true;
 | |
|     }
 | |
|     if (currGroupSepType != 2 && (!prevValidSecondary || !currValidPrimary)) {
 | |
|         // Grouping failure.
 | |
|         digitsConsumed.bogus = true;
 | |
|     }
 | |
| 
 | |
|     // Strings that start with a separator but have no digits,
 | |
|     // or strings that failed a grouping size check.
 | |
|     if (digitsConsumed.bogus) {
 | |
|         maybeMore = maybeMore || (segment.length() == 0);
 | |
|         segment.setOffset(initialOffset);
 | |
|         return maybeMore;
 | |
|     }
 | |
| 
 | |
|     // We passed all inspections. Start post-processing.
 | |
| 
 | |
|     // Adjust for fraction part.
 | |
|     digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);
 | |
| 
 | |
|     // Set the digits, either normal or exponent.
 | |
|     if (exponentSign != 0 && segment.getOffset() != initialOffset) {
 | |
|         bool overflow = false;
 | |
|         if (digitsConsumed.fitsInLong()) {
 | |
|             int64_t exponentLong = digitsConsumed.toLong(false);
 | |
|             U_ASSERT(exponentLong >= 0);
 | |
|             if (exponentLong <= INT32_MAX) {
 | |
|                 auto exponentInt = static_cast<int32_t>(exponentLong);
 | |
|                 if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) {
 | |
|                     overflow = true;
 | |
|                 }
 | |
|             } else {
 | |
|                 overflow = true;
 | |
|             }
 | |
|         } else {
 | |
|             overflow = true;
 | |
|         }
 | |
|         if (overflow) {
 | |
|             if (exponentSign == -1) {
 | |
|                 // Set to zero
 | |
|                 result.quantity.clear();
 | |
|             } else {
 | |
|                 // Set to infinity
 | |
|                 result.quantity.bogus = true;
 | |
|                 result.flags |= FLAG_INFINITY;
 | |
|             }
 | |
|         }
 | |
|     } else {
 | |
|         result.quantity = digitsConsumed;
 | |
|     }
 | |
| 
 | |
|     // Set other information into the result and return.
 | |
|     if (!actualDecimalString.isBogus()) {
 | |
|         result.flags |= FLAG_HAS_DECIMAL_SEPARATOR;
 | |
|     }
 | |
|     result.setCharsConsumed(segment);
 | |
|     return segment.length() == 0 || maybeMore;
 | |
| }
 | |
| 
 | |
| bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const {
 | |
|     if (requireGroupingMatch) {
 | |
|         if (sepType == -1) {
 | |
|             // No such group (prevGroup before first shift).
 | |
|             return true;
 | |
|         } else if (sepType == 0) {
 | |
|             // First group.
 | |
|             if (isPrimary) {
 | |
|                 // No grouping separators is OK.
 | |
|                 return true;
 | |
|             } else {
 | |
|                 return count != 0 && count <= grouping2;
 | |
|             }
 | |
|         } else if (sepType == 1) {
 | |
|             // Middle group.
 | |
|             if (isPrimary) {
 | |
|                 return count == grouping1;
 | |
|             } else {
 | |
|                 return count == grouping2;
 | |
|             }
 | |
|         } else {
 | |
|             U_ASSERT(sepType == 2);
 | |
|             // After the decimal separator.
 | |
|             return true;
 | |
|         }
 | |
|     } else {
 | |
|         if (sepType == 1) {
 | |
|             // #11230: don't accept middle groups with only 1 digit.
 | |
|             return count != 1;
 | |
|         } else {
 | |
|             return true;
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
 | |
|     // The common case uses a static leadSet for efficiency.
 | |
|     if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
 | |
|         return segment.startsWith(*leadSet);
 | |
|     }
 | |
|     if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) {
 | |
|         return true;
 | |
|     }
 | |
|     if (fLocalDigitStrings.isNull()) {
 | |
|         return false;
 | |
|     }
 | |
|     for (int32_t i = 0; i < 10; i++) {
 | |
|         if (segment.startsWith(fLocalDigitStrings[i])) {
 | |
|             return true;
 | |
|         }
 | |
|     }
 | |
|     return false;
 | |
| }
 | |
| 
 | |
| UnicodeString DecimalMatcher::toString() const {
 | |
|     return u"<Decimal>";
 | |
| }
 | |
| 
 | |
| 
 | |
| #endif /* #if !UCONFIG_NO_FORMATTING */
 |