140 lines
4.3 KiB
C++
140 lines
4.3 KiB
C++
// Copyright (C) 2021 Google LLC
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "icing/tokenization/verbatim-tokenizer.h"
|
|
|
|
#include "icing/text_classifier/lib3/utils/base/statusor.h"
|
|
#include "icing/util/character-iterator.h"
|
|
#include "icing/util/status-macros.h"
|
|
|
|
namespace icing {
|
|
namespace lib {
|
|
|
|
class VerbatimTokenIterator : public Tokenizer::Iterator {
|
|
public:
|
|
explicit VerbatimTokenIterator(std::string_view text)
|
|
: term_(std::move(text)) {}
|
|
|
|
bool Advance() override {
|
|
if (term_.empty() || has_advanced_to_end_) {
|
|
return false;
|
|
}
|
|
|
|
has_advanced_to_end_ = true;
|
|
return true;
|
|
}
|
|
|
|
Token GetToken() const override {
|
|
if (term_.empty() || !has_advanced_to_end_) {
|
|
return Token(Token::Type::INVALID);
|
|
}
|
|
|
|
return Token(Token::Type::VERBATIM, term_);
|
|
}
|
|
|
|
libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
|
|
override {
|
|
if (term_.empty()) {
|
|
return absl_ports::AbortedError(
|
|
"Could not calculate start of empty token.");
|
|
}
|
|
|
|
return CharacterIterator(term_, 0, 0, 0);
|
|
}
|
|
|
|
libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()
|
|
override {
|
|
if (term_.empty()) {
|
|
return absl_ports::AbortedError(
|
|
"Could not calculate end of empty token.");
|
|
}
|
|
|
|
if (token_end_iterator_.utf8_index() >= 0) {
|
|
return token_end_iterator_;
|
|
}
|
|
|
|
bool moved_to_token_end = token_end_iterator_.MoveToUtf8(term_.length());
|
|
if (moved_to_token_end) {
|
|
return token_end_iterator_;
|
|
} else {
|
|
return absl_ports::AbortedError("Could not move to end of token.");
|
|
}
|
|
}
|
|
|
|
bool ResetToTokenStartingAfter(int32_t utf32_offset) override {
|
|
// We can only reset to the sole verbatim token, so we must have a negative
|
|
// offset for it to be considered the token after.
|
|
if (utf32_offset < 0) {
|
|
// Because we are now at the sole verbatim token, we should ensure we can
|
|
// no longer advance past it.
|
|
has_advanced_to_end_ = true;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool ResetToTokenEndingBefore(int32_t utf32_offset) override {
|
|
// We can only reset to the sole verbatim token, so we must have an offset
|
|
// after the end of the token for the reset to be valid. This means the
|
|
// provided utf-32 offset must be equal to or greater than the utf-32 length
|
|
// of the token.
|
|
if (token_end_iterator_.utf8_index() < 0) {
|
|
// Moves one index past the end of the term.
|
|
bool moved_to_token_end = token_end_iterator_.MoveToUtf8(term_.length());
|
|
if (!moved_to_token_end) {
|
|
// We're unable to reset as we failed to move to the end of the term.
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (utf32_offset >= token_end_iterator_.utf32_index()) {
|
|
// Because we are now at the sole verbatim token, we should ensure we can
|
|
// no longer advance past it.
|
|
has_advanced_to_end_ = true;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool ResetToStart() override {
|
|
has_advanced_to_end_ = true;
|
|
return true;
|
|
}
|
|
|
|
private:
|
|
std::string_view term_;
|
|
CharacterIterator token_end_iterator_ = CharacterIterator(term_, -1, -1, -1);
|
|
// Used to determine whether we have advanced on the sole verbatim token
|
|
bool has_advanced_to_end_ = false;
|
|
};
|
|
|
|
libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
|
|
VerbatimTokenizer::Tokenize(std::string_view text) const {
|
|
return std::make_unique<VerbatimTokenIterator>(text);
|
|
}
|
|
|
|
libtextclassifier3::StatusOr<std::vector<Token>> VerbatimTokenizer::TokenizeAll(
|
|
std::string_view text) const {
|
|
ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
|
|
Tokenize(text));
|
|
std::vector<Token> tokens;
|
|
while (iterator->Advance()) {
|
|
tokens.push_back(iterator->GetToken());
|
|
}
|
|
return tokens;
|
|
}
|
|
|
|
} // namespace lib
|
|
} // namespace icing
|