175 lines
		
	
	
		
			5.2 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			175 lines
		
	
	
		
			5.2 KiB
		
	
	
	
		
			C++
		
	
	
	
| // © 2016 and later: Unicode, Inc. and others.
 | |
| // License & terms of use: http://www.unicode.org/copyright.html
 | |
| /*
 | |
| *******************************************************************************
 | |
| * Copyright (C) 2012-2016, International Business Machines
 | |
| * Corporation and others.  All Rights Reserved.
 | |
| *******************************************************************************
 | |
| * utf8collationiterator.h
 | |
| *
 | |
| * created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
 | |
| * created by: Markus W. Scherer
 | |
| */
 | |
| 
 | |
| #ifndef __UTF8COLLATIONITERATOR_H__
 | |
| #define __UTF8COLLATIONITERATOR_H__
 | |
| 
 | |
| #include "unicode/utypes.h"
 | |
| 
 | |
| #if !UCONFIG_NO_COLLATION
 | |
| 
 | |
| #include "cmemory.h"
 | |
| #include "collation.h"
 | |
| #include "collationdata.h"
 | |
| #include "collationiterator.h"
 | |
| #include "normalizer2impl.h"
 | |
| 
 | |
| U_NAMESPACE_BEGIN
 | |
| 
 | |
| /**
 | |
|  * UTF-8 collation element and character iterator.
 | |
|  * Handles normalized UTF-8 text inline, with length or NUL-terminated.
 | |
|  * Unnormalized text is handled by a subclass.
 | |
|  */
 | |
| class U_I18N_API UTF8CollationIterator : public CollationIterator {
 | |
| public:
 | |
|     UTF8CollationIterator(const CollationData *d, UBool numeric,
 | |
|                           const uint8_t *s, int32_t p, int32_t len)
 | |
|             : CollationIterator(d, numeric),
 | |
|               u8(s), pos(p), length(len) {}
 | |
| 
 | |
|     virtual ~UTF8CollationIterator();
 | |
| 
 | |
|     virtual void resetToOffset(int32_t newOffset) override;
 | |
| 
 | |
|     virtual int32_t getOffset() const override;
 | |
| 
 | |
|     virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
 | |
| 
 | |
|     virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
 | |
| 
 | |
| protected:
 | |
|     /**
 | |
|      * For byte sequences that are illegal in UTF-8, an error value may be returned
 | |
|      * together with a bogus code point. The caller will ignore that code point.
 | |
|      *
 | |
|      * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
 | |
|      * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns true.
 | |
|      *
 | |
|      * Valid lead surrogates are returned from inside a normalized text segment,
 | |
|      * where handleGetTrailSurrogate() will return the matching trail surrogate.
 | |
|      */
 | |
|     virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;
 | |
| 
 | |
|     virtual UBool foundNULTerminator() override;
 | |
| 
 | |
|     virtual UBool forbidSurrogateCodePoints() const override;
 | |
| 
 | |
|     virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
 | |
| 
 | |
|     virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
 | |
| 
 | |
|     const uint8_t *u8;
 | |
|     int32_t pos;
 | |
|     int32_t length;  // <0 for NUL-terminated strings
 | |
| };
 | |
| 
 | |
| /**
 | |
|  * Incrementally checks the input text for FCD and normalizes where necessary.
 | |
|  */
 | |
| class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
 | |
| public:
 | |
|     FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
 | |
|                              const uint8_t *s, int32_t p, int32_t len)
 | |
|             : UTF8CollationIterator(data, numeric, s, p, len),
 | |
|               state(CHECK_FWD), start(p),
 | |
|               nfcImpl(data->nfcImpl) {}
 | |
| 
 | |
|     virtual ~FCDUTF8CollationIterator();
 | |
| 
 | |
|     virtual void resetToOffset(int32_t newOffset) override;
 | |
| 
 | |
|     virtual int32_t getOffset() const override;
 | |
| 
 | |
|     virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
 | |
| 
 | |
|     virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
 | |
| 
 | |
| protected:
 | |
|     virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;
 | |
| 
 | |
|     virtual UChar handleGetTrailSurrogate() override;
 | |
| 
 | |
|     virtual UBool foundNULTerminator() override;
 | |
| 
 | |
|     virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
 | |
| 
 | |
|     virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
 | |
| 
 | |
| private:
 | |
|     UBool nextHasLccc() const;
 | |
|     UBool previousHasTccc() const;
 | |
| 
 | |
|     /**
 | |
|      * Switches to forward checking if possible.
 | |
|      */
 | |
|     void switchToForward();
 | |
| 
 | |
|     /**
 | |
|      * Extends the FCD text segment forward or normalizes around pos.
 | |
|      * @return true if success
 | |
|      */
 | |
|     UBool nextSegment(UErrorCode &errorCode);
 | |
| 
 | |
|     /**
 | |
|      * Switches to backward checking.
 | |
|      */
 | |
|     void switchToBackward();
 | |
| 
 | |
|     /**
 | |
|      * Extends the FCD text segment backward or normalizes around pos.
 | |
|      * @return true if success
 | |
|      */
 | |
|     UBool previousSegment(UErrorCode &errorCode);
 | |
| 
 | |
|     UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
 | |
| 
 | |
|     enum State {
 | |
|         /**
 | |
|          * The input text [start..pos[ passes the FCD check.
 | |
|          * Moving forward checks incrementally.
 | |
|          * limit is undefined.
 | |
|          */
 | |
|         CHECK_FWD,
 | |
|         /**
 | |
|          * The input text [pos..limit[ passes the FCD check.
 | |
|          * Moving backward checks incrementally.
 | |
|          * start is undefined.
 | |
|          */
 | |
|         CHECK_BWD,
 | |
|         /**
 | |
|          * The input text [start..limit[ passes the FCD check.
 | |
|          * pos tracks the current text index.
 | |
|          */
 | |
|         IN_FCD_SEGMENT,
 | |
|         /**
 | |
|          * The input text [start..limit[ failed the FCD check and was normalized.
 | |
|          * pos tracks the current index in the normalized string.
 | |
|          */
 | |
|         IN_NORMALIZED
 | |
|     };
 | |
| 
 | |
|     State state;
 | |
| 
 | |
|     int32_t start;
 | |
|     int32_t limit;
 | |
| 
 | |
|     const Normalizer2Impl &nfcImpl;
 | |
|     UnicodeString normalized;
 | |
| };
 | |
| 
 | |
| U_NAMESPACE_END
 | |
| 
 | |
| #endif  // !UCONFIG_NO_COLLATION
 | |
| #endif  // __UTF8COLLATIONITERATOR_H__
 |