357 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			357 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			C++
		
	
	
	
| // © 2016 and later: Unicode, Inc. and others.
 | |
| // License & terms of use: http://www.unicode.org/copyright.html
 | |
| /*
 | |
| *******************************************************************************
 | |
| * Copyright (C) 1996-2015, International Business Machines
 | |
| * Corporation and others.  All Rights Reserved.
 | |
| *******************************************************************************
 | |
| * collationcompare.cpp
 | |
| *
 | |
| * created on: 2012feb14 with new and old collation code
 | |
| * created by: Markus W. Scherer
 | |
| */
 | |
| 
 | |
| #include "unicode/utypes.h"
 | |
| 
 | |
| #if !UCONFIG_NO_COLLATION
 | |
| 
 | |
| #include "unicode/ucol.h"
 | |
| #include "cmemory.h"
 | |
| #include "collation.h"
 | |
| #include "collationcompare.h"
 | |
| #include "collationiterator.h"
 | |
| #include "collationsettings.h"
 | |
| #include "uassert.h"
 | |
| 
 | |
| U_NAMESPACE_BEGIN
 | |
| 
 | |
| UCollationResult
 | |
| CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
 | |
|                                         const CollationSettings &settings,
 | |
|                                         UErrorCode &errorCode) {
 | |
|     if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
 | |
| 
 | |
|     int32_t options = settings.options;
 | |
|     uint32_t variableTop;
 | |
|     if((options & CollationSettings::ALTERNATE_MASK) == 0) {
 | |
|         variableTop = 0;
 | |
|     } else {
 | |
|         // +1 so that we can use "<" and primary ignorables test out early.
 | |
|         variableTop = settings.variableTop + 1;
 | |
|     }
 | |
|     UBool anyVariable = FALSE;
 | |
| 
 | |
|     // Fetch CEs, compare primaries, store secondary & tertiary weights.
 | |
|     for(;;) {
 | |
|         // We fetch CEs until we get a non-ignorable primary or reach the end.
 | |
|         uint32_t leftPrimary;
 | |
|         do {
 | |
|             int64_t ce = left.nextCE(errorCode);
 | |
|             leftPrimary = (uint32_t)(ce >> 32);
 | |
|             if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
 | |
|                 // Variable CE, shift it to quaternary level.
 | |
|                 // Ignore all following primary ignorables, and shift further variable CEs.
 | |
|                 anyVariable = TRUE;
 | |
|                 do {
 | |
|                     // Store only the primary of the variable CE.
 | |
|                     left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
 | |
|                     for(;;) {
 | |
|                         ce = left.nextCE(errorCode);
 | |
|                         leftPrimary = (uint32_t)(ce >> 32);
 | |
|                         if(leftPrimary == 0) {
 | |
|                             left.setCurrentCE(0);
 | |
|                         } else {
 | |
|                             break;
 | |
|                         }
 | |
|                     }
 | |
|                 } while(leftPrimary < variableTop &&
 | |
|                         leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
 | |
|             }
 | |
|         } while(leftPrimary == 0);
 | |
| 
 | |
|         uint32_t rightPrimary;
 | |
|         do {
 | |
|             int64_t ce = right.nextCE(errorCode);
 | |
|             rightPrimary = (uint32_t)(ce >> 32);
 | |
|             if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
 | |
|                 // Variable CE, shift it to quaternary level.
 | |
|                 // Ignore all following primary ignorables, and shift further variable CEs.
 | |
|                 anyVariable = TRUE;
 | |
|                 do {
 | |
|                     // Store only the primary of the variable CE.
 | |
|                     right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
 | |
|                     for(;;) {
 | |
|                         ce = right.nextCE(errorCode);
 | |
|                         rightPrimary = (uint32_t)(ce >> 32);
 | |
|                         if(rightPrimary == 0) {
 | |
|                             right.setCurrentCE(0);
 | |
|                         } else {
 | |
|                             break;
 | |
|                         }
 | |
|                     }
 | |
|                 } while(rightPrimary < variableTop &&
 | |
|                         rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
 | |
|             }
 | |
|         } while(rightPrimary == 0);
 | |
| 
 | |
|         if(leftPrimary != rightPrimary) {
 | |
|             // Return the primary difference, with script reordering.
 | |
|             if(settings.hasReordering()) {
 | |
|                 leftPrimary = settings.reorder(leftPrimary);
 | |
|                 rightPrimary = settings.reorder(rightPrimary);
 | |
|             }
 | |
|             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
 | |
|         }
 | |
|         if(leftPrimary == Collation::NO_CE_PRIMARY) { break; }
 | |
|     }
 | |
|     if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
 | |
| 
 | |
|     // Compare the buffered secondary & tertiary weights.
 | |
|     // We might skip the secondary level but continue with the case level
 | |
|     // which is turned on separately.
 | |
|     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
 | |
|         if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
 | |
|             int32_t leftIndex = 0;
 | |
|             int32_t rightIndex = 0;
 | |
|             for(;;) {
 | |
|                 uint32_t leftSecondary;
 | |
|                 do {
 | |
|                     leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16;
 | |
|                 } while(leftSecondary == 0);
 | |
| 
 | |
|                 uint32_t rightSecondary;
 | |
|                 do {
 | |
|                     rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16;
 | |
|                 } while(rightSecondary == 0);
 | |
| 
 | |
|                 if(leftSecondary != rightSecondary) {
 | |
|                     return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
 | |
|                 }
 | |
|                 if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; }
 | |
|             }
 | |
|         } else {
 | |
|             // The backwards secondary level compares secondary weights backwards
 | |
|             // within segments separated by the merge separator (U+FFFE, weight 02).
 | |
|             int32_t leftStart = 0;
 | |
|             int32_t rightStart = 0;
 | |
|             for(;;) {
 | |
|                 // Find the merge separator or the NO_CE terminator.
 | |
|                 uint32_t p;
 | |
|                 int32_t leftLimit = leftStart;
 | |
|                 while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) >
 | |
|                             Collation::MERGE_SEPARATOR_PRIMARY ||
 | |
|                         p == 0) {
 | |
|                     ++leftLimit;
 | |
|                 }
 | |
|                 int32_t rightLimit = rightStart;
 | |
|                 while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) >
 | |
|                             Collation::MERGE_SEPARATOR_PRIMARY ||
 | |
|                         p == 0) {
 | |
|                     ++rightLimit;
 | |
|                 }
 | |
| 
 | |
|                 // Compare the segments.
 | |
|                 int32_t leftIndex = leftLimit;
 | |
|                 int32_t rightIndex = rightLimit;
 | |
|                 for(;;) {
 | |
|                     int32_t leftSecondary = 0;
 | |
|                     while(leftSecondary == 0 && leftIndex > leftStart) {
 | |
|                         leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16;
 | |
|                     }
 | |
| 
 | |
|                     int32_t rightSecondary = 0;
 | |
|                     while(rightSecondary == 0 && rightIndex > rightStart) {
 | |
|                         rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16;
 | |
|                     }
 | |
| 
 | |
|                     if(leftSecondary != rightSecondary) {
 | |
|                         return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
 | |
|                     }
 | |
|                     if(leftSecondary == 0) { break; }
 | |
|                 }
 | |
| 
 | |
|                 // Did we reach the end of either string?
 | |
|                 // Both strings have the same number of merge separators,
 | |
|                 // or else there would have been a primary-level difference.
 | |
|                 U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
 | |
|                 if(p == Collation::NO_CE_PRIMARY) { break; }
 | |
|                 // Skip both merge separators and continue.
 | |
|                 leftStart = leftLimit + 1;
 | |
|                 rightStart = rightLimit + 1;
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     if((options & CollationSettings::CASE_LEVEL) != 0) {
 | |
|         int32_t strength = CollationSettings::getStrength(options);
 | |
|         int32_t leftIndex = 0;
 | |
|         int32_t rightIndex = 0;
 | |
|         for(;;) {
 | |
|             uint32_t leftCase, leftLower32, rightCase;
 | |
|             if(strength == UCOL_PRIMARY) {
 | |
|                 // Primary+caseLevel: Ignore case level weights of primary ignorables.
 | |
|                 // Otherwise we would get a-umlaut > a
 | |
|                 // which is not desirable for accent-insensitive sorting.
 | |
|                 // Check for (lower 32 bits) == 0 as well because variable CEs are stored
 | |
|                 // with only primary weights.
 | |
|                 int64_t ce;
 | |
|                 do {
 | |
|                     ce = left.getCE(leftIndex++);
 | |
|                     leftCase = (uint32_t)ce;
 | |
|                 } while((uint32_t)(ce >> 32) == 0 || leftCase == 0);
 | |
|                 leftLower32 = leftCase;
 | |
|                 leftCase &= 0xc000;
 | |
| 
 | |
|                 do {
 | |
|                     ce = right.getCE(rightIndex++);
 | |
|                     rightCase = (uint32_t)ce;
 | |
|                 } while((uint32_t)(ce >> 32) == 0 || rightCase == 0);
 | |
|                 rightCase &= 0xc000;
 | |
|             } else {
 | |
|                 // Secondary+caseLevel: By analogy with the above,
 | |
|                 // ignore case level weights of secondary ignorables.
 | |
|                 //
 | |
|                 // Note: A tertiary CE has uppercase case bits (0.0.ut)
 | |
|                 // to keep tertiary+caseFirst well-formed.
 | |
|                 //
 | |
|                 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
 | |
|                 // Otherwise a tertiary CE's uppercase would be no greater than
 | |
|                 // a primary/secondary CE's uppercase.
 | |
|                 // (See UCA well-formedness condition 2.)
 | |
|                 // We could construct a special case weight higher than uppercase,
 | |
|                 // but it's simpler to always ignore case weights of secondary ignorables,
 | |
|                 // turning 0.0.ut into 0.0.0.t.
 | |
|                 // (See LDML Collation, Case Parameters.)
 | |
|                 do {
 | |
|                     leftCase = (uint32_t)left.getCE(leftIndex++);
 | |
|                 } while(leftCase <= 0xffff);
 | |
|                 leftLower32 = leftCase;
 | |
|                 leftCase &= 0xc000;
 | |
| 
 | |
|                 do {
 | |
|                     rightCase = (uint32_t)right.getCE(rightIndex++);
 | |
|                 } while(rightCase <= 0xffff);
 | |
|                 rightCase &= 0xc000;
 | |
|             }
 | |
| 
 | |
|             // No need to handle NO_CE and MERGE_SEPARATOR specially:
 | |
|             // There is one case weight for each previous-level weight,
 | |
|             // so level length differences were handled there.
 | |
|             if(leftCase != rightCase) {
 | |
|                 if((options & CollationSettings::UPPER_FIRST) == 0) {
 | |
|                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
 | |
|                 } else {
 | |
|                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
 | |
|                 }
 | |
|             }
 | |
|             if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; }
 | |
|         }
 | |
|     }
 | |
|     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
 | |
| 
 | |
|     uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
 | |
| 
 | |
|     int32_t leftIndex = 0;
 | |
|     int32_t rightIndex = 0;
 | |
|     uint32_t anyQuaternaries = 0;
 | |
|     for(;;) {
 | |
|         uint32_t leftLower32, leftTertiary;
 | |
|         do {
 | |
|             leftLower32 = (uint32_t)left.getCE(leftIndex++);
 | |
|             anyQuaternaries |= leftLower32;
 | |
|             U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
 | |
|                      (leftLower32 & 0xc0c0) == 0);
 | |
|             leftTertiary = leftLower32 & tertiaryMask;
 | |
|         } while(leftTertiary == 0);
 | |
| 
 | |
|         uint32_t rightLower32, rightTertiary;
 | |
|         do {
 | |
|             rightLower32 = (uint32_t)right.getCE(rightIndex++);
 | |
|             anyQuaternaries |= rightLower32;
 | |
|             U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
 | |
|                      (rightLower32 & 0xc0c0) == 0);
 | |
|             rightTertiary = rightLower32 & tertiaryMask;
 | |
|         } while(rightTertiary == 0);
 | |
| 
 | |
|         if(leftTertiary != rightTertiary) {
 | |
|             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
 | |
|                 // Pass through NO_CE and keep real tertiary weights larger than that.
 | |
|                 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
 | |
|                 // to keep tertiary CEs well-formed.
 | |
|                 // Their case+tertiary weights must be greater than those of
 | |
|                 // primary and secondary CEs.
 | |
|                 if(leftTertiary > Collation::NO_CE_WEIGHT16) {
 | |
|                     if(leftLower32 > 0xffff) {
 | |
|                         leftTertiary ^= 0xc000;
 | |
|                     } else {
 | |
|                         leftTertiary += 0x4000;
 | |
|                     }
 | |
|                 }
 | |
|                 if(rightTertiary > Collation::NO_CE_WEIGHT16) {
 | |
|                     if(rightLower32 > 0xffff) {
 | |
|                         rightTertiary ^= 0xc000;
 | |
|                     } else {
 | |
|                         rightTertiary += 0x4000;
 | |
|                     }
 | |
|                 }
 | |
|             }
 | |
|             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
 | |
|         }
 | |
|         if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }
 | |
|     }
 | |
|     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
 | |
| 
 | |
|     if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
 | |
|         // If there are no "variable" CEs and no non-zero quaternary weights,
 | |
|         // then there are no quaternary differences.
 | |
|         return UCOL_EQUAL;
 | |
|     }
 | |
| 
 | |
|     leftIndex = 0;
 | |
|     rightIndex = 0;
 | |
|     for(;;) {
 | |
|         uint32_t leftQuaternary;
 | |
|         do {
 | |
|             int64_t ce = left.getCE(leftIndex++);
 | |
|             leftQuaternary = (uint32_t)ce & 0xffff;
 | |
|             if(leftQuaternary <= Collation::NO_CE_WEIGHT16) {
 | |
|                 // Variable primary or completely ignorable or NO_CE.
 | |
|                 leftQuaternary = (uint32_t)(ce >> 32);
 | |
|             } else {
 | |
|                 // Regular CE, not tertiary ignorable.
 | |
|                 // Preserve the quaternary weight in bits 7..6.
 | |
|                 leftQuaternary |= 0xffffff3f;
 | |
|             }
 | |
|         } while(leftQuaternary == 0);
 | |
| 
 | |
|         uint32_t rightQuaternary;
 | |
|         do {
 | |
|             int64_t ce = right.getCE(rightIndex++);
 | |
|             rightQuaternary = (uint32_t)ce & 0xffff;
 | |
|             if(rightQuaternary <= Collation::NO_CE_WEIGHT16) {
 | |
|                 // Variable primary or completely ignorable or NO_CE.
 | |
|                 rightQuaternary = (uint32_t)(ce >> 32);
 | |
|             } else {
 | |
|                 // Regular CE, not tertiary ignorable.
 | |
|                 // Preserve the quaternary weight in bits 7..6.
 | |
|                 rightQuaternary |= 0xffffff3f;
 | |
|             }
 | |
|         } while(rightQuaternary == 0);
 | |
| 
 | |
|         if(leftQuaternary != rightQuaternary) {
 | |
|             // Return the difference, with script reordering.
 | |
|             if(settings.hasReordering()) {
 | |
|                 leftQuaternary = settings.reorder(leftQuaternary);
 | |
|                 rightQuaternary = settings.reorder(rightQuaternary);
 | |
|             }
 | |
|             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
 | |
|         }
 | |
|         if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; }
 | |
|     }
 | |
|     return UCOL_EQUAL;
 | |
| }
 | |
| 
 | |
| U_NAMESPACE_END
 | |
| 
 | |
| #endif  // !UCONFIG_NO_COLLATION
 |