android13/external/icing/icing/tokenization/verbatim-tokenizer.cc

140 lines
4.3 KiB
C++

// Copyright (C) 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "icing/tokenization/verbatim-tokenizer.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/util/character-iterator.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
class VerbatimTokenIterator : public Tokenizer::Iterator {
public:
explicit VerbatimTokenIterator(std::string_view text)
: term_(std::move(text)) {}
bool Advance() override {
if (term_.empty() || has_advanced_to_end_) {
return false;
}
has_advanced_to_end_ = true;
return true;
}
Token GetToken() const override {
if (term_.empty() || !has_advanced_to_end_) {
return Token(Token::Type::INVALID);
}
return Token(Token::Type::VERBATIM, term_);
}
libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
override {
if (term_.empty()) {
return absl_ports::AbortedError(
"Could not calculate start of empty token.");
}
return CharacterIterator(term_, 0, 0, 0);
}
libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()
override {
if (term_.empty()) {
return absl_ports::AbortedError(
"Could not calculate end of empty token.");
}
if (token_end_iterator_.utf8_index() >= 0) {
return token_end_iterator_;
}
bool moved_to_token_end = token_end_iterator_.MoveToUtf8(term_.length());
if (moved_to_token_end) {
return token_end_iterator_;
} else {
return absl_ports::AbortedError("Could not move to end of token.");
}
}
bool ResetToTokenStartingAfter(int32_t utf32_offset) override {
// We can only reset to the sole verbatim token, so we must have a negative
// offset for it to be considered the token after.
if (utf32_offset < 0) {
// Because we are now at the sole verbatim token, we should ensure we can
// no longer advance past it.
has_advanced_to_end_ = true;
return true;
}
return false;
}
bool ResetToTokenEndingBefore(int32_t utf32_offset) override {
// We can only reset to the sole verbatim token, so we must have an offset
// after the end of the token for the reset to be valid. This means the
// provided utf-32 offset must be equal to or greater than the utf-32 length
// of the token.
if (token_end_iterator_.utf8_index() < 0) {
// Moves one index past the end of the term.
bool moved_to_token_end = token_end_iterator_.MoveToUtf8(term_.length());
if (!moved_to_token_end) {
// We're unable to reset as we failed to move to the end of the term.
return false;
}
}
if (utf32_offset >= token_end_iterator_.utf32_index()) {
// Because we are now at the sole verbatim token, we should ensure we can
// no longer advance past it.
has_advanced_to_end_ = true;
return true;
}
return false;
}
bool ResetToStart() override {
has_advanced_to_end_ = true;
return true;
}
private:
std::string_view term_;
CharacterIterator token_end_iterator_ = CharacterIterator(term_, -1, -1, -1);
// Used to determine whether we have advanced on the sole verbatim token
bool has_advanced_to_end_ = false;
};
// Creates an iterator that exposes `text` as a single verbatim token. The
// iterator is constructed directly from the string_view, so it refers to the
// caller's character data rather than owning a copy.
libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
VerbatimTokenizer::Tokenize(std::string_view text) const {
  std::unique_ptr<Tokenizer::Iterator> verbatim_iterator =
      std::make_unique<VerbatimTokenIterator>(text);
  return verbatim_iterator;
}
// Tokenizes `text` and collects every token the iterator yields, in order,
// into a vector. The iterator is obtained from Tokenize() through
// ICING_ASSIGN_OR_RETURN.
libtextclassifier3::StatusOr<std::vector<Token>> VerbatimTokenizer::TokenizeAll(
    std::string_view text) const {
  ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> token_itr,
                         Tokenize(text));
  std::vector<Token> collected;
  // Keep pulling tokens for as long as the iterator reports a successful
  // advance.
  for (bool has_token = token_itr->Advance(); has_token;
       has_token = token_itr->Advance()) {
    collected.push_back(token_itr->GetToken());
  }
  return collected;
}
} // namespace lib
} // namespace icing