android13/external/libtextclassifier/native/utils/normalization_test.cc

122 lines
4.4 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "utils/normalization.h"
#include <string>
#include "utils/base/integral_types.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace libtextclassifier3 {
namespace {
using testing::Eq;
// Fixture shared by the normalization tests. It holds the UniLib instance
// handed to the library and offers a std::string-in / std::string-out wrapper
// around the codepoint-wise normalization entry point.
class NormalizationTest : public testing::Test {
 protected:
  // NOTE(review): INIT_UNILIB_FOR_TESTING is defined outside this file; it is
  // presumably what sets up `unilib_` for the configured UniLib backend.
  NormalizationTest() : INIT_UNILIB_FOR_TESTING(unilib_) {}

  // Applies the ops encoded in `codepointwise_ops` to the UTF-8 string `text`
  // via libtextclassifier3::NormalizeTextCodepointWise and returns the result
  // re-encoded as a UTF-8 std::string.
  std::string NormalizeTextCodepointWise(const std::string& text,
                                         const int32 codepointwise_ops) {
    // do_copy=false: the converted text is requested without copying, so it
    // is only used while `text` is alive (i.e. within this call).
    const auto unicode_input = UTF8ToUnicodeText(text, /*do_copy=*/false);
    auto normalized = libtextclassifier3::NormalizeTextCodepointWise(
        unilib_, codepointwise_ops, unicode_input);
    return normalized.ToUTF8String();
  }

  UniLib unilib_;
};
// With the NONE op the text is expected to come back exactly as it went in.
TEST_F(NormalizationTest, ReturnsIdenticalStringWhenNoNormalization) {
  const std::string input = "Never gonna let you down.";

  const std::string normalized = NormalizeTextCodepointWise(
      input, NormalizationOptions_::CodepointwiseNormalizationOp_NONE);

  EXPECT_THAT(normalized, Eq("Never gonna let you down."));
}
#if !defined(TC3_UNILIB_DUMMY)
// DROP_WHITESPACE is expected to remove every whitespace codepoint. The same
// sentence is fed in with three different separators and must collapse to the
// same whitespace-free string each time.
TEST_F(NormalizationTest, DropsWhitespace) {
  const auto drop_whitespace =
      NormalizationOptions_::CodepointwiseNormalizationOp_DROP_WHITESPACE;
  const std::string expected = "Nevergonnaletyoudown.";

  // Plain ASCII spaces.
  const std::string from_spaces =
      NormalizeTextCodepointWise("Never gonna let you down.", drop_whitespace);
  EXPECT_THAT(from_spaces, Eq(expected));

  // Tabs, including a doubled one.
  const std::string from_tabs = NormalizeTextCodepointWise(
      "Never\tgonna\t\tlet\tyou\tdown.", drop_whitespace);
  EXPECT_THAT(from_tabs, Eq(expected));

  // U+2003 (EM SPACE), a non-ASCII whitespace codepoint.
  const std::string from_em_spaces = NormalizeTextCodepointWise(
      "Never\u2003gonna\u2003let\u2003you\u2003down.", drop_whitespace);
  EXPECT_THAT(from_em_spaces, Eq(expected));
}
// DROP_PUNCTUATION is expected to strip punctuation codepoints and leave the
// remaining characters (including whitespace) where they are.
TEST_F(NormalizationTest, DropsPunctuation) {
  const auto drop_punctuation =
      NormalizationOptions_::CodepointwiseNormalizationOp_DROP_PUNCTUATION;

  // ASCII sentence: only the trailing period disappears.
  const std::string english =
      NormalizeTextCodepointWise("Never gonna let you down.", drop_punctuation);
  EXPECT_THAT(english, Eq("Never gonna let you down"));

  // Greek sentence: the periods and the comma are removed.
  const std::string greek = NormalizeTextCodepointWise(
      "αʹ. Σημεῖόν ἐστιν, οὗ μέρος οὐθέν.", drop_punctuation);
  EXPECT_THAT(greek, Eq("αʹ Σημεῖόν ἐστιν οὗ μέρος οὐθέν"));

  // Digit groups joined by non-ASCII dashes: only the digits remain.
  const std::string digits =
      NormalizeTextCodepointWise("978—3—16—148410—0", drop_punctuation);
  EXPECT_THAT(digits, Eq("9783161484100"));
}
// LOWERCASE is expected to lowercase non-ASCII (here: Greek) text, both on its
// own and when OR-ed together with DROP_WHITESPACE.
TEST_F(NormalizationTest, LowercasesUnicodeText) {
  const std::string input = "αʹ. Σημεῖόν ἐστιν, οὗ μέρος οὐθέν.";

  // Lowercasing only: the capital sigma becomes lowercase, the rest is kept.
  const auto lowercase_only =
      NormalizationOptions_::CodepointwiseNormalizationOp_LOWERCASE;
  const std::string lowered = NormalizeTextCodepointWise(input, lowercase_only);
  EXPECT_THAT(lowered, Eq("αʹ. σημεῖόν ἐστιν, οὗ μέρος οὐθέν."));

  // Both ops combined: whitespace is removed and the text is lowercased.
  const auto lowercase_without_whitespace =
      NormalizationOptions_::CodepointwiseNormalizationOp_DROP_WHITESPACE |
      NormalizationOptions_::CodepointwiseNormalizationOp_LOWERCASE;
  const std::string lowered_and_stripped =
      NormalizeTextCodepointWise(input, lowercase_without_whitespace);
  EXPECT_THAT(lowered_and_stripped, Eq("αʹ.σημεῖόνἐστιν,οὗμέροςοὐθέν."));
}
// UPPERCASE is expected to uppercase non-ASCII (here: Greek) text, both on its
// own and when OR-ed together with DROP_WHITESPACE.
TEST_F(NormalizationTest, UppercasesUnicodeText) {
  const std::string input = "Κανένας άνθρωπος δεν ξέρει";

  // Uppercasing only: spaces between the words are preserved.
  const auto uppercase_only =
      NormalizationOptions_::CodepointwiseNormalizationOp_UPPERCASE;
  const std::string uppered = NormalizeTextCodepointWise(input, uppercase_only);
  EXPECT_THAT(uppered, Eq("ΚΑΝΈΝΑΣ ΆΝΘΡΩΠΟΣ ΔΕΝ ΞΈΡΕΙ"));

  // Both ops combined: whitespace is removed and the text is uppercased.
  const auto uppercase_without_whitespace =
      NormalizationOptions_::CodepointwiseNormalizationOp_DROP_WHITESPACE |
      NormalizationOptions_::CodepointwiseNormalizationOp_UPPERCASE;
  const std::string uppered_and_stripped =
      NormalizeTextCodepointWise(input, uppercase_without_whitespace);
  EXPECT_THAT(uppered_and_stripped, Eq("ΚΑΝΈΝΑΣΆΝΘΡΩΠΟΣΔΕΝΞΈΡΕΙ"));
}
#endif
} // namespace
} // namespace libtextclassifier3