249 lines
9.9 KiB
C++
249 lines
9.9 KiB
C++
/*
|
|
* Copyright (C) 2015 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
/**
|
|
* An implementation of Liang's hyphenation algorithm.
|
|
*/
|
|
|
|
#ifndef MINIKIN_HYPHENATOR_H
|
|
#define MINIKIN_HYPHENATOR_H
|
|
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include "minikin/Characters.h"
|
|
#include "minikin/U16StringPiece.h"
|
|
|
|
namespace minikin {
|
|
|
|
class Hyphenator;
|
|
|
|
// Registers the hyphenator.
|
|
// This doesn't take ownership of the hyphenator but we don't need to care about the ownership.
|
|
// In Android, the Hyphenator is allocated in Zygote and never gets released.
|
|
void addHyphenator(const std::string& localeStr, const Hyphenator* hyphenator);
|
|
void addHyphenatorAlias(const std::string& fromLocaleStr, const std::string& toLocaleStr);
|
|
|
|
enum class HyphenationType : uint8_t {
|
|
// Note: There are implicit assumptions scattered in the code that DONT_BREAK is 0.
|
|
|
|
// Do not break.
|
|
DONT_BREAK = 0,
|
|
// Break the line and insert a normal hyphen.
|
|
BREAK_AND_INSERT_HYPHEN = 1,
|
|
// Break the line and insert an Armenian hyphen (U+058A).
|
|
BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2,
|
|
// Break the line and insert a maqaf (Hebrew hyphen, U+05BE).
|
|
BREAK_AND_INSERT_MAQAF = 3,
|
|
// Break the line and insert a Canadian Syllabics hyphen (U+1400).
|
|
BREAK_AND_INSERT_UCAS_HYPHEN = 4,
|
|
// Break the line, but don't insert a hyphen. Used for cases when there is already a hyphen
|
|
// present or the script does not use a hyphen (e.g. in Malayalam).
|
|
BREAK_AND_DONT_INSERT_HYPHEN = 5,
|
|
// Break and replace the last code unit with hyphen. Used for Catalan "l·l" which hyphenates
|
|
// as "l-/l".
|
|
BREAK_AND_REPLACE_WITH_HYPHEN = 6,
|
|
// Break the line, and repeat the hyphen (which is the last character) at the beginning of the
|
|
// next line. Used in Polish (where "czerwono-niebieska" should hyphenate as
|
|
// "czerwono-/-niebieska") and Slovenian.
|
|
BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7,
|
|
// Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line.
|
|
// This is used in Arabic script, mostly for writing systems of Central Asia. It's our default
|
|
// behavior when a soft hyphen is used in Arabic script.
|
|
BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8
|
|
};
|
|
|
|
// The hyphen edit represents an edit to the string when a word is hyphenated.
|
|
// The most common hyphen edit is adding a "-" at the end of a syllable, but nonstandard hyphenation
|
|
// allows for more choices.
|
|
// One at the beginning of the string/line and one at the end.
|
|
enum class EndHyphenEdit : uint8_t {
|
|
// Note that everything inserting characters must have a value greater than or equal to
|
|
// INSERT_HYPHEN.
|
|
NO_EDIT = 0b000,
|
|
REPLACE_WITH_HYPHEN = 0b001,
|
|
|
|
INSERT_HYPHEN = 0b010,
|
|
INSERT_ARMENIAN_HYPHEN = 0b011,
|
|
INSERT_MAQAF = 0b100,
|
|
INSERT_UCAS_HYPHEN = 0b101,
|
|
INSERT_ZWJ_AND_HYPHEN = 0b110,
|
|
};
|
|
|
|
enum class StartHyphenEdit : uint8_t {
|
|
NO_EDIT = 0b00,
|
|
|
|
INSERT_HYPHEN = 0b01,
|
|
INSERT_ZWJ = 0b10,
|
|
};
|
|
|
|
typedef uint8_t HyphenEdit;
|
|
constexpr uint8_t START_BITS_SHIFT = 3;
|
|
// The following two masks must keep in sync with the definitions in the Java code at:
|
|
// frameworks/base/graphics/java/android/graphics/Paint.java
|
|
constexpr uint8_t MASK_END_OF_LINE = 0b00111;
|
|
constexpr uint8_t MASK_START_OF_LINE = 0b11000;
|
|
|
|
inline HyphenEdit packHyphenEdit(StartHyphenEdit start, EndHyphenEdit end) {
|
|
return static_cast<uint8_t>(start) << START_BITS_SHIFT | static_cast<uint8_t>(end);
|
|
}
|
|
|
|
inline EndHyphenEdit endHyphenEdit(HyphenEdit hyphenEdit) {
|
|
return static_cast<EndHyphenEdit>(hyphenEdit & MASK_END_OF_LINE);
|
|
}
|
|
|
|
inline StartHyphenEdit startHyphenEdit(HyphenEdit hyphenEdit) {
|
|
return static_cast<StartHyphenEdit>(hyphenEdit >> START_BITS_SHIFT);
|
|
}
|
|
|
|
inline bool isReplacement(EndHyphenEdit hyph) {
|
|
return hyph == EndHyphenEdit::REPLACE_WITH_HYPHEN;
|
|
}
|
|
|
|
inline bool isInsertion(StartHyphenEdit hyph) {
|
|
return hyph != StartHyphenEdit::NO_EDIT;
|
|
}
|
|
|
|
inline bool isInsertion(EndHyphenEdit hyph) {
|
|
return static_cast<uint8_t>(hyph) >= static_cast<uint8_t>(EndHyphenEdit::INSERT_HYPHEN);
|
|
}
|
|
|
|
template <typename T, size_t size>
|
|
constexpr size_t ARRAYSIZE(T const (&)[size]) {
|
|
return size;
|
|
}
|
|
constexpr uint16_t HYPHEN_STR_ZWJ[] = {CHAR_ZWJ};
|
|
constexpr uint16_t HYPHEN_STR_HYPHEN[] = {CHAR_HYPHEN};
|
|
constexpr uint16_t HYPHEN_STR_ARMENIAN_HYPHEN[] = {CHAR_ARMENIAN_HYPHEN};
|
|
constexpr uint16_t HYPHEN_STR_MAQAF[] = {CHAR_MAQAF};
|
|
constexpr uint16_t HYPHEN_STR_UCAS_HYPHEN[] = {CHAR_UCAS_HYPHEN};
|
|
constexpr uint16_t HYPHEN_STR_ZWJ_AND_HYPHEN[] = {CHAR_ZWJ, CHAR_HYPHEN};
|
|
constexpr std::pair<const uint16_t*, size_t> EMPTY_HYPHEN_STR(nullptr, 0);
|
|
#define MAKE_HYPHEN_STR(chars) std::make_pair((chars), ARRAYSIZE(chars))
|
|
|
|
inline std::pair<const uint16_t*, size_t> getHyphenString(StartHyphenEdit hyph) {
|
|
if (hyph == StartHyphenEdit::INSERT_ZWJ) {
|
|
return MAKE_HYPHEN_STR(HYPHEN_STR_ZWJ);
|
|
} else if (hyph == StartHyphenEdit::INSERT_HYPHEN) {
|
|
return MAKE_HYPHEN_STR(HYPHEN_STR_HYPHEN);
|
|
} else {
|
|
return EMPTY_HYPHEN_STR;
|
|
}
|
|
}
|
|
|
|
inline std::pair<const uint16_t*, size_t> getHyphenString(EndHyphenEdit hyph) {
|
|
switch (hyph) {
|
|
case EndHyphenEdit::REPLACE_WITH_HYPHEN: // fall through
|
|
case EndHyphenEdit::INSERT_HYPHEN:
|
|
return MAKE_HYPHEN_STR(HYPHEN_STR_HYPHEN);
|
|
case EndHyphenEdit::INSERT_ARMENIAN_HYPHEN:
|
|
return MAKE_HYPHEN_STR(HYPHEN_STR_ARMENIAN_HYPHEN);
|
|
case EndHyphenEdit::INSERT_MAQAF:
|
|
return MAKE_HYPHEN_STR(HYPHEN_STR_MAQAF);
|
|
case EndHyphenEdit::INSERT_UCAS_HYPHEN:
|
|
return MAKE_HYPHEN_STR(HYPHEN_STR_UCAS_HYPHEN);
|
|
case EndHyphenEdit::INSERT_ZWJ_AND_HYPHEN:
|
|
return MAKE_HYPHEN_STR(HYPHEN_STR_ZWJ_AND_HYPHEN);
|
|
case EndHyphenEdit::NO_EDIT:
|
|
default:
|
|
return EMPTY_HYPHEN_STR;
|
|
}
|
|
}
|
|
#undef MAKE_HYPHEN_STR
|
|
|
|
EndHyphenEdit editForThisLine(HyphenationType type);
|
|
StartHyphenEdit editForNextLine(HyphenationType type);
|
|
|
|
// hyb file header; implementation details are in the .cpp file
|
|
struct Header;
|
|
|
|
class Hyphenator {
|
|
public:
|
|
// Compute the hyphenation of a word, storing the hyphenation in result vector. Each entry in
|
|
// the vector is a "hyphenation type" for a potential hyphenation that can be applied at the
|
|
// corresponding code unit offset in the word.
|
|
//
|
|
// out must have at least the length of the word capacity.
|
|
//
|
|
// Example: word is "hyphen", result is the following, corresponding to "hy-phen":
|
|
// [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK, DONT_BREAK, DONT_BREAK]
|
|
void hyphenate(const U16StringPiece& word, HyphenationType* out) const;
|
|
|
|
// Compute the hyphenation of a word.
|
|
//
|
|
// out will be resized to word length.
|
|
void hyphenate(const U16StringPiece& word, std::vector<HyphenationType>* out) const {
|
|
out->resize(word.size());
|
|
return hyphenate(word, out->data());
|
|
}
|
|
|
|
// Returns true if the codepoint is like U+2010 HYPHEN in line breaking and usage: a character
|
|
// immediately after which line breaks are allowed, but words containing it should not be
|
|
// automatically hyphenated.
|
|
static bool isLineBreakingHyphen(uint32_t cp);
|
|
|
|
// pattern data is in binary format, as described in doc/hyb_file_format.md. Note:
|
|
// the caller is responsible for ensuring that the lifetime of the pattern data is
|
|
// at least as long as the Hyphenator object.
|
|
|
|
// This class doesn't copy or take ownership of patternData. Caller must keep the data valid
|
|
// until this instance is deleted.
|
|
// Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens.
|
|
static Hyphenator* loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
|
|
const std::string& locale);
|
|
|
|
private:
|
|
enum class HyphenationLocale : uint8_t {
|
|
OTHER = 0,
|
|
CATALAN = 1,
|
|
POLISH = 2,
|
|
SLOVENIAN = 3,
|
|
};
|
|
|
|
// Use Hyphenator::loadBinary instead.
|
|
Hyphenator(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
|
|
HyphenationLocale hyphenLocale);
|
|
|
|
// apply various hyphenation rules including hard and soft hyphens, ignoring patterns
|
|
void hyphenateWithNoPatterns(const U16StringPiece& word, HyphenationType* out) const;
|
|
|
|
// Try looking up word in alphabet table, return DONT_BREAK if any code units fail to map.
|
|
// Otherwise, returns BREAK_AND_INSERT_HYPHEN, BREAK_AND_INSERT_ARMENIAN_HYPHEN, or
|
|
// BREAK_AND_DONT_INSERT_HYPHEN based on the the script of the characters seen.
|
|
// Note that this method writes len+2 entries into alpha_codes (including start and stop)
|
|
HyphenationType alphabetLookup(uint16_t* alpha_codes, const U16StringPiece& word) const;
|
|
|
|
// calculate hyphenation from patterns, assuming alphabet lookup has already been done
|
|
void hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue,
|
|
HyphenationType* out) const;
|
|
|
|
// See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
|
|
// that temporary buffers can be stack-allocated without waste, which is a slightly
|
|
// different use case. It measures UTF-16 code units.
|
|
static const size_t MAX_HYPHENATED_SIZE = 64;
|
|
|
|
const uint8_t* mPatternData;
|
|
const size_t mMinPrefix, mMinSuffix;
|
|
const HyphenationLocale mHyphenationLocale;
|
|
|
|
// accessors for binary data
|
|
const Header* getHeader() const { return reinterpret_cast<const Header*>(mPatternData); }
|
|
};
|
|
|
|
} // namespace minikin
|
|
|
|
#endif // MINIKIN_HYPHENATOR_H
|