882 lines
		
	
	
		
			31 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			882 lines
		
	
	
		
			31 KiB
		
	
	
	
		
			C++
		
	
	
	
| // © 2016 and later: Unicode, Inc. and others.
 | |
| // License & terms of use: http://www.unicode.org/copyright.html
 | |
| /*
 | |
| *******************************************************************************
 | |
| * Copyright (C) 2013-2015, International Business Machines
 | |
| * Corporation and others.  All Rights Reserved.
 | |
| *******************************************************************************
 | |
| * collationruleparser.cpp
 | |
| *
 | |
| * (replaced the former ucol_tok.cpp)
 | |
| *
 | |
| * created on: 2013apr10
 | |
| * created by: Markus W. Scherer
 | |
| */
 | |
| 
 | |
| #include "unicode/utypes.h"
 | |
| 
 | |
| #if !UCONFIG_NO_COLLATION
 | |
| 
 | |
| #include "unicode/normalizer2.h"
 | |
| #include "unicode/parseerr.h"
 | |
| #include "unicode/uchar.h"
 | |
| #include "unicode/ucol.h"
 | |
| #include "unicode/uloc.h"
 | |
| #include "unicode/unistr.h"
 | |
| #include "unicode/utf16.h"
 | |
| #include "charstr.h"
 | |
| #include "cmemory.h"
 | |
| #include "collation.h"
 | |
| #include "collationdata.h"
 | |
| #include "collationruleparser.h"
 | |
| #include "collationsettings.h"
 | |
| #include "collationtailoring.h"
 | |
| #include "cstring.h"
 | |
| #include "patternprops.h"
 | |
| #include "uassert.h"
 | |
| #include "uvectr32.h"
 | |
| 
 | |
| U_NAMESPACE_BEGIN
 | |
| 
 | |
| namespace {
 | |
| 
 | |
| static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 };  // "[before"
 | |
| const int32_t BEFORE_LENGTH = 7;
 | |
| 
 | |
| }  // namespace
 | |
| 
 | |
| CollationRuleParser::Sink::~Sink() {}
 | |
| 
 | |
| void
 | |
| CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
 | |
| 
 | |
| void
 | |
| CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
 | |
| 
 | |
| CollationRuleParser::Importer::~Importer() {}
 | |
| 
 | |
| CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
 | |
|         : nfd(*Normalizer2::getNFDInstance(errorCode)),
 | |
|           nfc(*Normalizer2::getNFCInstance(errorCode)),
 | |
|           rules(NULL), baseData(base), settings(NULL),
 | |
|           parseError(NULL), errorReason(NULL),
 | |
|           sink(NULL), importer(NULL),
 | |
|           ruleIndex(0) {
 | |
| }
 | |
| 
 | |
| CollationRuleParser::~CollationRuleParser() {
 | |
| }
 | |
| 
 | |
| void
 | |
| CollationRuleParser::parse(const UnicodeString &ruleString,
 | |
|                            CollationSettings &outSettings,
 | |
|                            UParseError *outParseError,
 | |
|                            UErrorCode &errorCode) {
 | |
|     if(U_FAILURE(errorCode)) { return; }
 | |
|     settings = &outSettings;
 | |
|     parseError = outParseError;
 | |
|     if(parseError != NULL) {
 | |
|         parseError->line = 0;
 | |
|         parseError->offset = -1;
 | |
|         parseError->preContext[0] = 0;
 | |
|         parseError->postContext[0] = 0;
 | |
|     }
 | |
|     errorReason = NULL;
 | |
|     parse(ruleString, errorCode);
 | |
| }
 | |
| 
 | |
| void
 | |
| CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
 | |
|     if(U_FAILURE(errorCode)) { return; }
 | |
|     rules = &ruleString;
 | |
|     ruleIndex = 0;
 | |
| 
 | |
|     while(ruleIndex < rules->length()) {
 | |
|         UChar c = rules->charAt(ruleIndex);
 | |
|         if(PatternProps::isWhiteSpace(c)) {
 | |
|             ++ruleIndex;
 | |
|             continue;
 | |
|         }
 | |
|         switch(c) {
 | |
|         case 0x26:  // '&'
 | |
|             parseRuleChain(errorCode);
 | |
|             break;
 | |
|         case 0x5b:  // '['
 | |
|             parseSetting(errorCode);
 | |
|             break;
 | |
|         case 0x23:  // '#' starts a comment, until the end of the line
 | |
|             ruleIndex = skipComment(ruleIndex + 1);
 | |
|             break;
 | |
|         case 0x40:  // '@' is equivalent to [backwards 2]
 | |
|             settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
 | |
|                               UCOL_ON, 0, errorCode);
 | |
|             ++ruleIndex;
 | |
|             break;
 | |
|         case 0x21:  // '!' used to turn on Thai/Lao character reversal
 | |
|             // Accept but ignore. The root collator has contractions
 | |
|             // that are equivalent to the character reversal, where appropriate.
 | |
|             ++ruleIndex;
 | |
|             break;
 | |
|         default:
 | |
|             setParseError("expected a reset or setting or comment", errorCode);
 | |
|             break;
 | |
|         }
 | |
|         if(U_FAILURE(errorCode)) { return; }
 | |
|     }
 | |
| }
 | |
| 
 | |
| void
 | |
| CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
 | |
|     int32_t resetStrength = parseResetAndPosition(errorCode);
 | |
|     UBool isFirstRelation = TRUE;
 | |
|     for(;;) {
 | |
|         int32_t result = parseRelationOperator(errorCode);
 | |
|         if(U_FAILURE(errorCode)) { return; }
 | |
|         if(result < 0) {
 | |
|             if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
 | |
|                 // '#' starts a comment, until the end of the line
 | |
|                 ruleIndex = skipComment(ruleIndex + 1);
 | |
|                 continue;
 | |
|             }
 | |
|             if(isFirstRelation) {
 | |
|                 setParseError("reset not followed by a relation", errorCode);
 | |
|             }
 | |
|             return;
 | |
|         }
 | |
|         int32_t strength = result & STRENGTH_MASK;
 | |
|         if(resetStrength < UCOL_IDENTICAL) {
 | |
|             // reset-before rule chain
 | |
|             if(isFirstRelation) {
 | |
|                 if(strength != resetStrength) {
 | |
|                     setParseError("reset-before strength differs from its first relation", errorCode);
 | |
|                     return;
 | |
|                 }
 | |
|             } else {
 | |
|                 if(strength < resetStrength) {
 | |
|                     setParseError("reset-before strength followed by a stronger relation", errorCode);
 | |
|                     return;
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|         int32_t i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
 | |
|         if((result & STARRED_FLAG) == 0) {
 | |
|             parseRelationStrings(strength, i, errorCode);
 | |
|         } else {
 | |
|             parseStarredCharacters(strength, i, errorCode);
 | |
|         }
 | |
|         if(U_FAILURE(errorCode)) { return; }
 | |
|         isFirstRelation = FALSE;
 | |
|     }
 | |
| }
 | |
| 
 | |
| int32_t
 | |
| CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
 | |
|     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
 | |
|     int32_t i = skipWhiteSpace(ruleIndex + 1);
 | |
|     int32_t j;
 | |
|     UChar c;
 | |
|     int32_t resetStrength;
 | |
|     if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
 | |
|             (j = i + BEFORE_LENGTH) < rules->length() &&
 | |
|             PatternProps::isWhiteSpace(rules->charAt(j)) &&
 | |
|             ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
 | |
|             0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
 | |
|             rules->charAt(j + 1) == 0x5d) {
 | |
|         // &[before n] with n=1 or 2 or 3
 | |
|         resetStrength = UCOL_PRIMARY + (c - 0x31);
 | |
|         i = skipWhiteSpace(j + 2);
 | |
|     } else {
 | |
|         resetStrength = UCOL_IDENTICAL;
 | |
|     }
 | |
|     if(i >= rules->length()) {
 | |
|         setParseError("reset without position", errorCode);
 | |
|         return UCOL_DEFAULT;
 | |
|     }
 | |
|     UnicodeString str;
 | |
|     if(rules->charAt(i) == 0x5b) {  // '['
 | |
|         i = parseSpecialPosition(i, str, errorCode);
 | |
|     } else {
 | |
|         i = parseTailoringString(i, str, errorCode);
 | |
|     }
 | |
|     sink->addReset(resetStrength, str, errorReason, errorCode);
 | |
|     if(U_FAILURE(errorCode)) { setErrorContext(); }
 | |
|     ruleIndex = i;
 | |
|     return resetStrength;
 | |
| }
 | |
| 
 | |
| int32_t
 | |
| CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
 | |
|     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
 | |
|     ruleIndex = skipWhiteSpace(ruleIndex);
 | |
|     if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
 | |
|     int32_t strength;
 | |
|     int32_t i = ruleIndex;
 | |
|     UChar c = rules->charAt(i++);
 | |
|     switch(c) {
 | |
|     case 0x3c:  // '<'
 | |
|         if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<
 | |
|             ++i;
 | |
|             if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<
 | |
|                 ++i;
 | |
|                 if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<<
 | |
|                     ++i;
 | |
|                     strength = UCOL_QUATERNARY;
 | |
|                 } else {
 | |
|                     strength = UCOL_TERTIARY;
 | |
|                 }
 | |
|             } else {
 | |
|                 strength = UCOL_SECONDARY;
 | |
|             }
 | |
|         } else {
 | |
|             strength = UCOL_PRIMARY;
 | |
|         }
 | |
|         if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
 | |
|             ++i;
 | |
|             strength |= STARRED_FLAG;
 | |
|         }
 | |
|         break;
 | |
|     case 0x3b:  // ';' same as <<
 | |
|         strength = UCOL_SECONDARY;
 | |
|         break;
 | |
|     case 0x2c:  // ',' same as <<<
 | |
|         strength = UCOL_TERTIARY;
 | |
|         break;
 | |
|     case 0x3d:  // '='
 | |
|         strength = UCOL_IDENTICAL;
 | |
|         if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
 | |
|             ++i;
 | |
|             strength |= STARRED_FLAG;
 | |
|         }
 | |
|         break;
 | |
|     default:
 | |
|         return UCOL_DEFAULT;
 | |
|     }
 | |
|     return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
 | |
| }
 | |
| 
 | |
| void
 | |
| CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
 | |
|     // Parse
 | |
|     //     prefix | str / extension
 | |
|     // where prefix and extension are optional.
 | |
|     UnicodeString prefix, str, extension;
 | |
|     i = parseTailoringString(i, str, errorCode);
 | |
|     if(U_FAILURE(errorCode)) { return; }
 | |
|     UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
 | |
|     if(next == 0x7c) {  // '|' separates the context prefix from the string.
 | |
|         prefix = str;
 | |
|         i = parseTailoringString(i + 1, str, errorCode);
 | |
|         if(U_FAILURE(errorCode)) { return; }
 | |
|         next = (i < rules->length()) ? rules->charAt(i) : 0;
 | |
|     }
 | |
|     if(next == 0x2f) {  // '/' separates the string from the extension.
 | |
|         i = parseTailoringString(i + 1, extension, errorCode);
 | |
|     }
 | |
|     if(!prefix.isEmpty()) {
 | |
|         UChar32 prefix0 = prefix.char32At(0);
 | |
|         UChar32 c = str.char32At(0);
 | |
|         if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
 | |
|             setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
 | |
|                           errorCode);
 | |
|             return;
 | |
|         }
 | |
|     }
 | |
|     sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
 | |
|     if(U_FAILURE(errorCode)) { setErrorContext(); }
 | |
|     ruleIndex = i;
 | |
| }
 | |
| 
 | |
| void
 | |
| CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
 | |
|     UnicodeString empty, raw;
 | |
|     i = parseString(skipWhiteSpace(i), raw, errorCode);
 | |
|     if(U_FAILURE(errorCode)) { return; }
 | |
|     if(raw.isEmpty()) {
 | |
|         setParseError("missing starred-relation string", errorCode);
 | |
|         return;
 | |
|     }
 | |
|     UChar32 prev = -1;
 | |
|     int32_t j = 0;
 | |
|     for(;;) {
 | |
|         while(j < raw.length()) {
 | |
|             UChar32 c = raw.char32At(j);
 | |
|             if(!nfd.isInert(c)) {
 | |
|                 setParseError("starred-relation string is not all NFD-inert", errorCode);
 | |
|                 return;
 | |
|             }
 | |
|             sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
 | |
|             if(U_FAILURE(errorCode)) {
 | |
|                 setErrorContext();
 | |
|                 return;
 | |
|             }
 | |
|             j += U16_LENGTH(c);
 | |
|             prev = c;
 | |
|         }
 | |
|         if(i >= rules->length() || rules->charAt(i) != 0x2d) {  // '-'
 | |
|             break;
 | |
|         }
 | |
|         if(prev < 0) {
 | |
|             setParseError("range without start in starred-relation string", errorCode);
 | |
|             return;
 | |
|         }
 | |
|         i = parseString(i + 1, raw, errorCode);
 | |
|         if(U_FAILURE(errorCode)) { return; }
 | |
|         if(raw.isEmpty()) {
 | |
|             setParseError("range without end in starred-relation string", errorCode);
 | |
|             return;
 | |
|         }
 | |
|         UChar32 c = raw.char32At(0);
 | |
|         if(c < prev) {
 | |
|             setParseError("range start greater than end in starred-relation string", errorCode);
 | |
|             return;
 | |
|         }
 | |
|         // range prev-c
 | |
|         UnicodeString s;
 | |
|         while(++prev <= c) {
 | |
|             if(!nfd.isInert(prev)) {
 | |
|                 setParseError("starred-relation string range is not all NFD-inert", errorCode);
 | |
|                 return;
 | |
|             }
 | |
|             if(U_IS_SURROGATE(prev)) {
 | |
|                 setParseError("starred-relation string range contains a surrogate", errorCode);
 | |
|                 return;
 | |
|             }
 | |
|             if(0xfffd <= prev && prev <= 0xffff) {
 | |
|                 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
 | |
|                 return;
 | |
|             }
 | |
|             s.setTo(prev);
 | |
|             sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
 | |
|             if(U_FAILURE(errorCode)) {
 | |
|                 setErrorContext();
 | |
|                 return;
 | |
|             }
 | |
|         }
 | |
|         prev = -1;
 | |
|         j = U16_LENGTH(c);
 | |
|     }
 | |
|     ruleIndex = skipWhiteSpace(i);
 | |
| }
 | |
| 
 | |
| int32_t
 | |
| CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
 | |
|     i = parseString(skipWhiteSpace(i), raw, errorCode);
 | |
|     if(U_SUCCESS(errorCode) && raw.isEmpty()) {
 | |
|         setParseError("missing relation string", errorCode);
 | |
|     }
 | |
|     return skipWhiteSpace(i);
 | |
| }
 | |
| 
 | |
| int32_t
 | |
| CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
 | |
|     if(U_FAILURE(errorCode)) { return i; }
 | |
|     raw.remove();
 | |
|     while(i < rules->length()) {
 | |
|         UChar32 c = rules->charAt(i++);
 | |
|         if(isSyntaxChar(c)) {
 | |
|             if(c == 0x27) {  // apostrophe
 | |
|                 if(i < rules->length() && rules->charAt(i) == 0x27) {
 | |
|                     // Double apostrophe, encodes a single one.
 | |
|                     raw.append((UChar)0x27);
 | |
|                     ++i;
 | |
|                     continue;
 | |
|                 }
 | |
|                 // Quote literal text until the next single apostrophe.
 | |
|                 for(;;) {
 | |
|                     if(i == rules->length()) {
 | |
|                         setParseError("quoted literal text missing terminating apostrophe", errorCode);
 | |
|                         return i;
 | |
|                     }
 | |
|                     c = rules->charAt(i++);
 | |
|                     if(c == 0x27) {
 | |
|                         if(i < rules->length() && rules->charAt(i) == 0x27) {
 | |
|                             // Double apostrophe inside quoted literal text,
 | |
|                             // still encodes a single apostrophe.
 | |
|                             ++i;
 | |
|                         } else {
 | |
|                             break;
 | |
|                         }
 | |
|                     }
 | |
|                     raw.append((UChar)c);
 | |
|                 }
 | |
|             } else if(c == 0x5c) {  // backslash
 | |
|                 if(i == rules->length()) {
 | |
|                     setParseError("backslash escape at the end of the rule string", errorCode);
 | |
|                     return i;
 | |
|                 }
 | |
|                 c = rules->char32At(i);
 | |
|                 raw.append(c);
 | |
|                 i += U16_LENGTH(c);
 | |
|             } else {
 | |
|                 // Any other syntax character terminates a string.
 | |
|                 --i;
 | |
|                 break;
 | |
|             }
 | |
|         } else if(PatternProps::isWhiteSpace(c)) {
 | |
|             // Unquoted white space terminates a string.
 | |
|             --i;
 | |
|             break;
 | |
|         } else {
 | |
|             raw.append((UChar)c);
 | |
|         }
 | |
|     }
 | |
|     for(int32_t j = 0; j < raw.length();) {
 | |
|         UChar32 c = raw.char32At(j);
 | |
|         if(U_IS_SURROGATE(c)) {
 | |
|             setParseError("string contains an unpaired surrogate", errorCode);
 | |
|             return i;
 | |
|         }
 | |
|         if(0xfffd <= c && c <= 0xffff) {
 | |
|             setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
 | |
|             return i;
 | |
|         }
 | |
|         j += U16_LENGTH(c);
 | |
|     }
 | |
|     return i;
 | |
| }
 | |
| 
 | |
namespace {

// Names of the special reset positions accepted inside [brackets].
// The array index of a name is added to POS_BASE by parseSpecialPosition(),
// so the order of the entries must not change.
static const char *const positions[] = {
    "first tertiary ignorable", "last tertiary ignorable",
    "first secondary ignorable", "last secondary ignorable",
    "first primary ignorable", "last primary ignorable",
    "first variable", "last variable",
    "first regular", "last regular",
    "first implicit", "last implicit",
    "first trailing", "last trailing"
};

}  // namespace
 | |
| 
 | |
| int32_t
 | |
| CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
 | |
|     if(U_FAILURE(errorCode)) { return 0; }
 | |
|     UnicodeString raw;
 | |
|     int32_t j = readWords(i + 1, raw);
 | |
|     if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) {  // words end with ]
 | |
|         ++j;
 | |
|         for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
 | |
|             if(raw == UnicodeString(positions[pos], -1, US_INV)) {
 | |
|                 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
 | |
|                 return j;
 | |
|             }
 | |
|         }
 | |
|         if(raw == UNICODE_STRING_SIMPLE("top")) {
 | |
|             str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
 | |
|             return j;
 | |
|         }
 | |
|         if(raw == UNICODE_STRING_SIMPLE("variable top")) {
 | |
|             str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
 | |
|             return j;
 | |
|         }
 | |
|     }
 | |
|     setParseError("not a valid special reset position", errorCode);
 | |
|     return i;
 | |
| }
 | |
| 
 | |
| void
 | |
| CollationRuleParser::parseSetting(UErrorCode &errorCode) {
 | |
|     if(U_FAILURE(errorCode)) { return; }
 | |
|     UnicodeString raw;
 | |
|     int32_t i = ruleIndex + 1;
 | |
|     int32_t j = readWords(i, raw);
 | |
|     if(j <= i || raw.isEmpty()) {
 | |
|         setParseError("expected a setting/option at '['", errorCode);
 | |
|     }
 | |
|     if(rules->charAt(j) == 0x5d) {  // words end with ]
 | |
|         ++j;
 | |
|         if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
 | |
|                 (raw.length() == 7 || raw.charAt(7) == 0x20)) {
 | |
|             parseReordering(raw, errorCode);
 | |
|             ruleIndex = j;
 | |
|             return;
 | |
|         }
 | |
|         if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
 | |
|             settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
 | |
|                               UCOL_ON, 0, errorCode);
 | |
|             ruleIndex = j;
 | |
|             return;
 | |
|         }
 | |
|         UnicodeString v;
 | |
|         int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
 | |
|         if(valueIndex >= 0) {
 | |
|             v.setTo(raw, valueIndex + 1);
 | |
|             raw.truncate(valueIndex);
 | |
|         }
 | |
|         if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
 | |
|             int32_t value = UCOL_DEFAULT;
 | |
|             UChar c = v.charAt(0);
 | |
|             if(0x31 <= c && c <= 0x34) {  // 1..4
 | |
|                 value = UCOL_PRIMARY + (c - 0x31);
 | |
|             } else if(c == 0x49) {  // 'I'
 | |
|                 value = UCOL_IDENTICAL;
 | |
|             }
 | |
|             if(value != UCOL_DEFAULT) {
 | |
|                 settings->setStrength(value, 0, errorCode);
 | |
|                 ruleIndex = j;
 | |
|                 return;
 | |
|             }
 | |
|         } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
 | |
|             UColAttributeValue value = UCOL_DEFAULT;
 | |
|             if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
 | |
|                 value = UCOL_NON_IGNORABLE;
 | |
|             } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
 | |
|                 value = UCOL_SHIFTED;
 | |
|             }
 | |
|             if(value != UCOL_DEFAULT) {
 | |
|                 settings->setAlternateHandling(value, 0, errorCode);
 | |
|                 ruleIndex = j;
 | |
|                 return;
 | |
|             }
 | |
|         } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
 | |
|             int32_t value = UCOL_DEFAULT;
 | |
|             if(v == UNICODE_STRING_SIMPLE("space")) {
 | |
|                 value = CollationSettings::MAX_VAR_SPACE;
 | |
|             } else if(v == UNICODE_STRING_SIMPLE("punct")) {
 | |
|                 value = CollationSettings::MAX_VAR_PUNCT;
 | |
|             } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
 | |
|                 value = CollationSettings::MAX_VAR_SYMBOL;
 | |
|             } else if(v == UNICODE_STRING_SIMPLE("currency")) {
 | |
|                 value = CollationSettings::MAX_VAR_CURRENCY;
 | |
|             }
 | |
|             if(value != UCOL_DEFAULT) {
 | |
|                 settings->setMaxVariable(value, 0, errorCode);
 | |
|                 settings->variableTop = baseData->getLastPrimaryForGroup(
 | |
|                     UCOL_REORDER_CODE_FIRST + value);
 | |
|                 U_ASSERT(settings->variableTop != 0);
 | |
|                 ruleIndex = j;
 | |
|                 return;
 | |
|             }
 | |
|         } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
 | |
|             UColAttributeValue value = UCOL_DEFAULT;
 | |
|             if(v == UNICODE_STRING_SIMPLE("off")) {
 | |
|                 value = UCOL_OFF;
 | |
|             } else if(v == UNICODE_STRING_SIMPLE("lower")) {
 | |
|                 value = UCOL_LOWER_FIRST;
 | |
|             } else if(v == UNICODE_STRING_SIMPLE("upper")) {
 | |
|                 value = UCOL_UPPER_FIRST;
 | |
|             }
 | |
|             if(value != UCOL_DEFAULT) {
 | |
|                 settings->setCaseFirst(value, 0, errorCode);
 | |
|                 ruleIndex = j;
 | |
|                 return;
 | |
|             }
 | |
|         } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
 | |
|             UColAttributeValue value = getOnOffValue(v);
 | |
|             if(value != UCOL_DEFAULT) {
 | |
|                 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
 | |
|                 ruleIndex = j;
 | |
|                 return;
 | |
|             }
 | |
|         } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
 | |
|             UColAttributeValue value = getOnOffValue(v);
 | |
|             if(value != UCOL_DEFAULT) {
 | |
|                 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
 | |
|                 ruleIndex = j;
 | |
|                 return;
 | |
|             }
 | |
|         } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
 | |
|             UColAttributeValue value = getOnOffValue(v);
 | |
|             if(value != UCOL_DEFAULT) {
 | |
|                 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
 | |
|                 ruleIndex = j;
 | |
|                 return;
 | |
|             }
 | |
|         } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
 | |
|             UColAttributeValue value = getOnOffValue(v);
 | |
|             if(value != UCOL_DEFAULT) {
 | |
|                 if(value == UCOL_ON) {
 | |
|                     setParseError("[hiraganaQ on] is not supported", errorCode);
 | |
|                 }
 | |
|                 ruleIndex = j;
 | |
|                 return;
 | |
|             }
 | |
|         } else if(raw == UNICODE_STRING_SIMPLE("import")) {
 | |
|             CharString lang;
 | |
|             lang.appendInvariantChars(v, errorCode);
 | |
|             if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
 | |
|             // BCP 47 language tag -> ICU locale ID
 | |
|             char localeID[ULOC_FULLNAME_CAPACITY];
 | |
|             int32_t parsedLength;
 | |
|             int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
 | |
|                                                  &parsedLength, &errorCode);
 | |
|             if(U_FAILURE(errorCode) ||
 | |
|                     parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
 | |
|                 errorCode = U_ZERO_ERROR;
 | |
|                 setParseError("expected language tag in [import langTag]", errorCode);
 | |
|                 return;
 | |
|             }
 | |
|             // localeID minus all keywords
 | |
|             char baseID[ULOC_FULLNAME_CAPACITY];
 | |
|             length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
 | |
|             if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
 | |
|                 errorCode = U_ZERO_ERROR;
 | |
|                 setParseError("expected language tag in [import langTag]", errorCode);
 | |
|                 return;
 | |
|             }
 | |
|             if(length == 0) {
 | |
|                 uprv_strcpy(baseID, "root");
 | |
|             } else if(*baseID == '_') {
 | |
|                 uprv_memmove(baseID + 3, baseID, length + 1);
 | |
|                 uprv_memcpy(baseID, "und", 3);
 | |
|             }
 | |
|             // @collation=type, or length=0 if not specified
 | |
|             char collationType[ULOC_KEYWORDS_CAPACITY];
 | |
|             length = uloc_getKeywordValue(localeID, "collation",
 | |
|                                           collationType, ULOC_KEYWORDS_CAPACITY,
 | |
|                                           &errorCode);
 | |
|             if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
 | |
|                 errorCode = U_ZERO_ERROR;
 | |
|                 setParseError("expected language tag in [import langTag]", errorCode);
 | |
|                 return;
 | |
|             }
 | |
|             if(importer == NULL) {
 | |
|                 setParseError("[import langTag] is not supported", errorCode);
 | |
|             } else {
 | |
|                 UnicodeString importedRules;
 | |
|                 importer->getRules(baseID, length > 0 ? collationType : "standard",
 | |
|                                    importedRules, errorReason, errorCode);
 | |
|                 if(U_FAILURE(errorCode)) {
 | |
|                     if(errorReason == NULL) {
 | |
|                         errorReason = "[import langTag] failed";
 | |
|                     }
 | |
|                     setErrorContext();
 | |
|                     return;
 | |
|                 }
 | |
|                 const UnicodeString *outerRules = rules;
 | |
|                 int32_t outerRuleIndex = ruleIndex;
 | |
|                 parse(importedRules, errorCode);
 | |
|                 if(U_FAILURE(errorCode)) {
 | |
|                     if(parseError != NULL) {
 | |
|                         parseError->offset = outerRuleIndex;
 | |
|                     }
 | |
|                 }
 | |
|                 rules = outerRules;
 | |
|                 ruleIndex = j;
 | |
|             }
 | |
|             return;
 | |
|         }
 | |
|     } else if(rules->charAt(j) == 0x5b) {  // words end with [
 | |
|         UnicodeSet set;
 | |
|         j = parseUnicodeSet(j, set, errorCode);
 | |
|         if(U_FAILURE(errorCode)) { return; }
 | |
|         if(raw == UNICODE_STRING_SIMPLE("optimize")) {
 | |
|             sink->optimize(set, errorReason, errorCode);
 | |
|             if(U_FAILURE(errorCode)) { setErrorContext(); }
 | |
|             ruleIndex = j;
 | |
|             return;
 | |
|         } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
 | |
|             sink->suppressContractions(set, errorReason, errorCode);
 | |
|             if(U_FAILURE(errorCode)) { setErrorContext(); }
 | |
|             ruleIndex = j;
 | |
|             return;
 | |
|         }
 | |
|     }
 | |
|     setParseError("not a valid setting/option", errorCode);
 | |
| }
 | |
| 
 | |
| void
 | |
| CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
 | |
|     if(U_FAILURE(errorCode)) { return; }
 | |
|     int32_t i = 7;  // after "reorder"
 | |
|     if(i == raw.length()) {
 | |
|         // empty [reorder] with no codes
 | |
|         settings->resetReordering();
 | |
|         return;
 | |
|     }
 | |
|     // Parse the codes in [reorder aa bb cc].
 | |
|     UVector32 reorderCodes(errorCode);
 | |
|     if(U_FAILURE(errorCode)) { return; }
 | |
|     CharString word;
 | |
|     while(i < raw.length()) {
 | |
|         ++i;  // skip the word-separating space
 | |
|         int32_t limit = raw.indexOf((UChar)0x20, i);
 | |
|         if(limit < 0) { limit = raw.length(); }
 | |
|         word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
 | |
|         if(U_FAILURE(errorCode)) { return; }
 | |
|         int32_t code = getReorderCode(word.data());
 | |
|         if(code < 0) {
 | |
|             setParseError("unknown script or reorder code", errorCode);
 | |
|             return;
 | |
|         }
 | |
|         reorderCodes.addElement(code, errorCode);
 | |
|         if(U_FAILURE(errorCode)) { return; }
 | |
|         i = limit;
 | |
|     }
 | |
|     settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
 | |
| }
 | |
| 
 | |
// Names of the special (non-script) reorder groups. getReorderCode() maps
// the entry at index i to UCOL_REORDER_CODE_FIRST + i, so the order matters.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
 | |
| 
 | |
| int32_t
 | |
| CollationRuleParser::getReorderCode(const char *word) {
 | |
|     for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
 | |
|         if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
 | |
|             return UCOL_REORDER_CODE_FIRST + i;
 | |
|         }
 | |
|     }
 | |
|     int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
 | |
|     if(script >= 0) {
 | |
|         return script;
 | |
|     }
 | |
|     if(uprv_stricmp(word, "others") == 0) {
 | |
|         return UCOL_REORDER_CODE_OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
 | |
|     }
 | |
|     return -1;
 | |
| }
 | |
| 
 | |
| UColAttributeValue
 | |
| CollationRuleParser::getOnOffValue(const UnicodeString &s) {
 | |
|     if(s == UNICODE_STRING_SIMPLE("on")) {
 | |
|         return UCOL_ON;
 | |
|     } else if(s == UNICODE_STRING_SIMPLE("off")) {
 | |
|         return UCOL_OFF;
 | |
|     } else {
 | |
|         return UCOL_DEFAULT;
 | |
|     }
 | |
| }
 | |
| 
 | |
| int32_t
 | |
| CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
 | |
|     // Collect a UnicodeSet pattern between a balanced pair of [brackets].
 | |
|     int32_t level = 0;
 | |
|     int32_t j = i;
 | |
|     for(;;) {
 | |
|         if(j == rules->length()) {
 | |
|             setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
 | |
|             return j;
 | |
|         }
 | |
|         UChar c = rules->charAt(j++);
 | |
|         if(c == 0x5b) {  // '['
 | |
|             ++level;
 | |
|         } else if(c == 0x5d) {  // ']'
 | |
|             if(--level == 0) { break; }
 | |
|         }
 | |
|     }
 | |
|     set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
 | |
|     if(U_FAILURE(errorCode)) {
 | |
|         errorCode = U_ZERO_ERROR;
 | |
|         setParseError("not a valid UnicodeSet pattern", errorCode);
 | |
|         return j;
 | |
|     }
 | |
|     j = skipWhiteSpace(j);
 | |
|     if(j == rules->length() || rules->charAt(j) != 0x5d) {
 | |
|         setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
 | |
|         return j;
 | |
|     }
 | |
|     return ++j;
 | |
| }
 | |
| 
 | |
| int32_t
 | |
| CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
 | |
|     static const UChar sp = 0x20;
 | |
|     raw.remove();
 | |
|     i = skipWhiteSpace(i);
 | |
|     for(;;) {
 | |
|         if(i >= rules->length()) { return 0; }
 | |
|         UChar c = rules->charAt(i);
 | |
|         if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
 | |
|             if(raw.isEmpty()) { return i; }
 | |
|             if(raw.endsWith(&sp, 1)) {  // remove trailing space
 | |
|                 raw.truncate(raw.length() - 1);
 | |
|             }
 | |
|             return i;
 | |
|         }
 | |
|         if(PatternProps::isWhiteSpace(c)) {
 | |
|             raw.append(sp);
 | |
|             i = skipWhiteSpace(i + 1);
 | |
|         } else {
 | |
|             raw.append(c);
 | |
|             ++i;
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| int32_t
 | |
| CollationRuleParser::skipComment(int32_t i) const {
 | |
|     // skip to past the newline
 | |
|     while(i < rules->length()) {
 | |
|         UChar c = rules->charAt(i++);
 | |
|         // LF or FF or CR or NEL or LS or PS
 | |
|         if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
 | |
|             // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
 | |
|             // NLF (new line function) = CR or LF or CR+LF or NEL.
 | |
|             // No need to collect all of CR+LF because a following LF will be ignored anyway.
 | |
|             break;
 | |
|         }
 | |
|     }
 | |
|     return i;
 | |
| }
 | |
| 
 | |
| void
 | |
| CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
 | |
|     if(U_FAILURE(errorCode)) { return; }
 | |
|     // Error code consistent with the old parser (from ca. 2001),
 | |
|     // rather than U_PARSE_ERROR;
 | |
|     errorCode = U_INVALID_FORMAT_ERROR;
 | |
|     errorReason = reason;
 | |
|     if(parseError != NULL) { setErrorContext(); }
 | |
| }
 | |
| 
 | |
| void
 | |
| CollationRuleParser::setErrorContext() {
 | |
|     if(parseError == NULL) { return; }
 | |
| 
 | |
|     // Note: This relies on the calling code maintaining the ruleIndex
 | |
|     // at a position that is useful for debugging.
 | |
|     // For example, at the beginning of a reset or relation etc.
 | |
|     parseError->offset = ruleIndex;
 | |
|     parseError->line = 0;  // We are not counting line numbers.
 | |
| 
 | |
|     // before ruleIndex
 | |
|     int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
 | |
|     if(start < 0) {
 | |
|         start = 0;
 | |
|     } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
 | |
|         ++start;
 | |
|     }
 | |
|     int32_t length = ruleIndex - start;
 | |
|     rules->extract(start, length, parseError->preContext);
 | |
|     parseError->preContext[length] = 0;
 | |
| 
 | |
|     // starting from ruleIndex
 | |
|     length = rules->length() - ruleIndex;
 | |
|     if(length >= U_PARSE_CONTEXT_LEN) {
 | |
|         length = U_PARSE_CONTEXT_LEN - 1;
 | |
|         if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
 | |
|             --length;
 | |
|         }
 | |
|     }
 | |
|     rules->extract(ruleIndex, length, parseError->postContext);
 | |
|     parseError->postContext[length] = 0;
 | |
| }
 | |
| 
 | |
| UBool
 | |
| CollationRuleParser::isSyntaxChar(UChar32 c) {
 | |
|     return 0x21 <= c && c <= 0x7e &&
 | |
|             (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
 | |
|             (0x5b <= c && c <= 0x60) || (0x7b <= c));
 | |
| }
 | |
| 
 | |
| int32_t
 | |
| CollationRuleParser::skipWhiteSpace(int32_t i) const {
 | |
|     while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
 | |
|         ++i;
 | |
|     }
 | |
|     return i;
 | |
| }
 | |
| 
 | |
| U_NAMESPACE_END
 | |
| 
 | |
| #endif  // !UCONFIG_NO_COLLATION
 |