343 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			343 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C++
		
	
	
	
| // Copyright 2017 The Chromium Authors. All rights reserved.
 | |
| // Use of this source code is governed by a BSD-style license that can be
 | |
| // found in the LICENSE file.
 | |
| 
 | |
| #include "components/zucchini/suffix_array.h"
 | |
| 
 | |
#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <initializer_list>
#include <numeric>
#include <string>
#include <vector>
 | |
| 
 | |
| #include "testing/gtest/include/gtest/gtest.h"
 | |
| 
 | |
| namespace zucchini {
 | |
| 
 | |
| namespace {
 | |
| 
 | |
| using SLType = InducedSuffixSort::SLType;
 | |
| 
 | |
| }  // namespace
 | |
| 
 | |
// String of unsigned char elements, used as the input type for the tests.
using ustring = std::basic_string<unsigned char>;

// Key bound passed to the suffix sort routines in this file (256).
constexpr uint16_t kNumChar = 256;

// Returns a ustring with the same length as |str| whose elements are the
// chars of |str| converted to unsigned char.
ustring MakeUnsignedString(const std::string& str) {
  return ustring(str.begin(), str.end());
}
 | |
| 
 | |
// Returns a std::vector<T> containing a copy of each element of |ilist|, in
// the same order.
template <class T>
std::vector<T> MakeVector(const std::initializer_list<T>& ilist) {
  std::vector<T> copied = {ilist.begin(), ilist.end()};
  return copied;
}
 | |
| 
 | |
| void TestSlPartition(std::initializer_list<SLType> expected_sl_partition,
 | |
|                      std::initializer_list<size_t> expected_lms_indices,
 | |
|                      std::string str) {
 | |
|   using SaisImpl = InducedSuffixSort::Implementation<size_t, uint16_t>;
 | |
| 
 | |
|   std::vector<SLType> sl_partition(str.size());
 | |
|   EXPECT_EQ(expected_lms_indices.size(),
 | |
|             SaisImpl::BuildSLPartition(str.begin(), str.size(), kNumChar,
 | |
|                                        sl_partition.rbegin()));
 | |
|   EXPECT_EQ(MakeVector(expected_sl_partition), sl_partition);
 | |
| 
 | |
|   std::vector<size_t> lms_indices(expected_lms_indices.size());
 | |
|   SaisImpl::FindLmsSuffixes(expected_sl_partition, lms_indices.begin());
 | |
|   EXPECT_EQ(MakeVector(expected_lms_indices), lms_indices);
 | |
| }
 | |
| 
 | |
| TEST(InducedSuffixSortTest, BuildSLPartition) {
 | |
|   TestSlPartition({}, {}, "");
 | |
|   TestSlPartition(
 | |
|       {
 | |
|           SLType::LType,
 | |
|       },
 | |
|       {}, "a");
 | |
|   TestSlPartition(
 | |
|       {
 | |
|           SLType::LType,
 | |
|           SLType::LType,
 | |
|       },
 | |
|       {}, "ba");
 | |
|   TestSlPartition(
 | |
|       {
 | |
|           SLType::SType,
 | |
|           SLType::LType,
 | |
|       },
 | |
|       {}, "ab");
 | |
|   TestSlPartition(
 | |
|       {
 | |
|           SLType::SType,
 | |
|           SLType::SType,
 | |
|           SLType::LType,
 | |
|       },
 | |
|       {}, "aab");
 | |
|   TestSlPartition(
 | |
|       {
 | |
|           SLType::LType,
 | |
|           SLType::LType,
 | |
|           SLType::LType,
 | |
|       },
 | |
|       {}, "bba");
 | |
|   TestSlPartition(
 | |
|       {
 | |
|           SLType::LType,
 | |
|           SLType::SType,
 | |
|           SLType::LType,
 | |
|       },
 | |
|       {1}, "bab");
 | |
|   TestSlPartition(
 | |
|       {
 | |
|           SLType::LType,
 | |
|           SLType::SType,
 | |
|           SLType::SType,
 | |
|           SLType::LType,
 | |
|       },
 | |
|       {1}, "baab");
 | |
| 
 | |
|   TestSlPartition(
 | |
|       {
 | |
|           SLType::LType,  // zucchini
 | |
|           SLType::LType,  // ucchini
 | |
|           SLType::SType,  // cchini
 | |
|           SLType::SType,  // chini
 | |
|           SLType::SType,  // hini
 | |
|           SLType::SType,  // ini
 | |
|           SLType::LType,  // ni
 | |
|           SLType::LType,  // i
 | |
|       },
 | |
|       {2}, "zucchini");
 | |
| }
 | |
| 
 | |
| std::vector<size_t> BucketCount(const std::initializer_list<unsigned char> str,
 | |
|                                 uint16_t max_key) {
 | |
|   using SaisImpl = InducedSuffixSort::Implementation<size_t, uint16_t>;
 | |
|   return SaisImpl::MakeBucketCount(str.begin(), str.size(), max_key);
 | |
| }
 | |
| 
 | |
| TEST(InducedSuffixSortTest, BucketCount) {
 | |
|   using vec = std::vector<size_t>;
 | |
| 
 | |
|   EXPECT_EQ(vec({0, 0, 0, 0}), BucketCount({}, 4));
 | |
|   EXPECT_EQ(vec({1, 0, 0, 0}), BucketCount({0}, 4));
 | |
|   EXPECT_EQ(vec({0, 2, 0, 1}), BucketCount({1, 1, 3}, 4));
 | |
| }
 | |
| 
 | |
| std::vector<size_t> InducedSortSubstring(ustring str) {
 | |
|   using SaisImpl = InducedSuffixSort::Implementation<size_t, uint16_t>;
 | |
|   std::vector<SLType> sl_partition(str.size());
 | |
|   size_t lms_count = SaisImpl::BuildSLPartition(
 | |
|       str.begin(), str.size(), kNumChar, sl_partition.rbegin());
 | |
|   std::vector<size_t> lms_indices(lms_count);
 | |
|   SaisImpl::FindLmsSuffixes(sl_partition, lms_indices.begin());
 | |
|   auto buckets = SaisImpl::MakeBucketCount(str.begin(), str.size(), kNumChar);
 | |
| 
 | |
|   std::vector<size_t> suffix_array(str.size());
 | |
|   SaisImpl::InducedSort(str, str.size(), sl_partition, lms_indices, buckets,
 | |
|                         suffix_array.begin());
 | |
| 
 | |
|   return suffix_array;
 | |
| }
 | |
| 
 | |
| TEST(InducedSuffixSortTest, InducedSortSubstring) {
 | |
|   using vec = std::vector<size_t>;
 | |
| 
 | |
|   auto us = MakeUnsignedString;
 | |
| 
 | |
|   // L; a$
 | |
|   EXPECT_EQ(vec({0}), InducedSortSubstring(us("a")));
 | |
| 
 | |
|   // SL; ab$, b$
 | |
|   EXPECT_EQ(vec({0, 1}), InducedSortSubstring(us("ab")));
 | |
| 
 | |
|   // LL; a$, ba$
 | |
|   EXPECT_EQ(vec({1, 0}), InducedSortSubstring(us("ba")));
 | |
| 
 | |
|   // SLL; a$, aba$, ba$
 | |
|   EXPECT_EQ(vec({2, 0, 1}), InducedSortSubstring(us("aba")));
 | |
| 
 | |
|   // LSL; ab$, b$, ba
 | |
|   EXPECT_EQ(vec({1, 2, 0}), InducedSortSubstring(us("bab")));
 | |
| 
 | |
|   // SSL; aab$, ab$, b$
 | |
|   EXPECT_EQ(vec({0, 1, 2}), InducedSortSubstring(us("aab")));
 | |
| 
 | |
|   // LSSL; aab$, ab$, b$, ba
 | |
|   EXPECT_EQ(vec({1, 2, 3, 0}), InducedSortSubstring(us("baab")));
 | |
| }
 | |
| 
 | |
| template <class Algorithm>
 | |
| void TestSuffixSort(ustring test_str) {
 | |
|   std::vector<size_t> suffix_array =
 | |
|       MakeSuffixArray<Algorithm>(test_str, kNumChar);
 | |
|   EXPECT_EQ(test_str.size(), suffix_array.size());
 | |
| 
 | |
|   // Expect that I[] is a permutation of [0, len].
 | |
|   std::vector<size_t> sorted_suffix(suffix_array.begin(), suffix_array.end());
 | |
|   std::sort(sorted_suffix.begin(), sorted_suffix.end());
 | |
|   for (size_t i = 0; i < test_str.size(); ++i)
 | |
|     EXPECT_EQ(i, sorted_suffix[i]);
 | |
| 
 | |
|   // Expect that all suffixes are strictly ordered.
 | |
|   auto end = test_str.end();
 | |
|   for (size_t i = 1; i < test_str.size(); ++i) {
 | |
|     auto suf1 = test_str.begin() + suffix_array[i - 1];
 | |
|     auto suf2 = test_str.begin() + suffix_array[i];
 | |
|     bool is_less = std::lexicographical_compare(suf1, end, suf2, end);
 | |
|     EXPECT_TRUE(is_less);
 | |
|   }
 | |
| }
 | |
| 
 | |
// Inputs shared by the NaiveSuffixSort, InducedSuffixSortSort and
// LowerBoundExact tests. Entries are NUL-terminated literals, so none of them
// contains an embedded '\0'.
constexpr const char* test_strs[] = {
    // Empty and very short strings.
    "",
    "a",
    "aa",
    "za",
    // Words and sentences.
    "CACAO",
    "aaaaa",
    "banana",
    "tobeornottobe",
    "The quick brown fox jumps over the lazy dog.",
    // Repetitive inputs.
    "elephantelephantelephantelephantelephant",
    "walawalawashington",
    "-------------------------",
    // Digit sequences.
    "011010011001011010010110011010010",
    "3141592653589793238462643383279502884197169399375105",
    // Bytes with the high bit set mixed with ASCII.
    "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD",
    // Palindromic and incrementally growing patterns.
    "abccbaabccbaabccbaabccbaabccbaabccbaabccbaabccba",
    "0123456789876543210",
    "9876543210123456789",
    "aababcabcdabcdeabcdefabcdefg",
    "asdhklgalksdjghalksdjghalksdjgh",
};
 | |
| 
 | |
| TEST(SuffixSortTest, NaiveSuffixSort) {
 | |
|   for (const std::string& test_str : test_strs) {
 | |
|     TestSuffixSort<NaiveSuffixSort>(MakeUnsignedString(test_str));
 | |
|   }
 | |
| }
 | |
| 
 | |
| TEST(SuffixSortTest, InducedSuffixSortSort) {
 | |
|   for (const std::string& test_str : test_strs) {
 | |
|     TestSuffixSort<InducedSuffixSort>(MakeUnsignedString(test_str));
 | |
|   }
 | |
| }
 | |
| 
 | |
| // Test with sequence that has every character.
 | |
| TEST(SuffixSortTest, AllChar) {
 | |
|   std::vector<unsigned char> all_char(kNumChar);
 | |
|   std::iota(all_char.begin(), all_char.end(), 0);
 | |
| 
 | |
|   {
 | |
|     std::vector<size_t> suffix_array =
 | |
|         MakeSuffixArray<InducedSuffixSort>(all_char, kNumChar);
 | |
|     for (size_t i = 0; i < kNumChar; ++i)
 | |
|       EXPECT_EQ(i, suffix_array[i]);
 | |
|   }
 | |
| 
 | |
|   std::vector<unsigned char> all_char_reverse(all_char.rbegin(),
 | |
|                                               all_char.rend());
 | |
|   {
 | |
|     std::vector<size_t> suffix_array =
 | |
|         MakeSuffixArray<InducedSuffixSort>(all_char_reverse, kNumChar);
 | |
|     for (size_t i = 0; i < kNumChar; ++i)
 | |
|       EXPECT_EQ(kNumChar - i - 1, suffix_array[i]);
 | |
|   }
 | |
| }
 | |
| 
 | |
| void TestSuffixLowerBound(ustring base_str, ustring search_str) {
 | |
|   std::vector<size_t> suffix_array =
 | |
|       MakeSuffixArray<NaiveSuffixSort>(base_str, kNumChar);
 | |
| 
 | |
|   auto pos = SuffixLowerBound(suffix_array, base_str.begin(),
 | |
|                               search_str.begin(), search_str.end());
 | |
| 
 | |
|   auto end = base_str.end();
 | |
|   if (pos != suffix_array.begin()) {
 | |
|     // Previous suffix is less than |search_str|.
 | |
|     auto suf = base_str.begin() + pos[-1];
 | |
|     bool is_less = std::lexicographical_compare(suf, end, search_str.begin(),
 | |
|                                                 search_str.end());
 | |
|     EXPECT_TRUE(is_less);
 | |
|   }
 | |
|   if (pos != suffix_array.end()) {
 | |
|     // Current suffix is greater of equal to |search_str|.
 | |
|     auto suf = base_str.begin() + *pos;
 | |
|     bool is_less = std::lexicographical_compare(suf, end, search_str.begin(),
 | |
|                                                 search_str.end());
 | |
|     EXPECT_FALSE(is_less);
 | |
|   }
 | |
| }
 | |
| 
 | |
| TEST(SuffixArrayTest, LowerBound) {
 | |
|   auto us = MakeUnsignedString;
 | |
| 
 | |
|   TestSuffixLowerBound(us(""), us(""));
 | |
|   TestSuffixLowerBound(us(""), us("a"));
 | |
|   TestSuffixLowerBound(us("b"), us(""));
 | |
|   TestSuffixLowerBound(us("b"), us("a"));
 | |
|   TestSuffixLowerBound(us("b"), us("c"));
 | |
|   TestSuffixLowerBound(us("b"), us("bc"));
 | |
|   TestSuffixLowerBound(us("aa"), us("a"));
 | |
|   TestSuffixLowerBound(us("aa"), us("aa"));
 | |
| 
 | |
|   ustring sentence = us("the quick brown fox jumps over the lazy dog.");
 | |
|   // Entire string: exact and unique.
 | |
|   TestSuffixLowerBound(sentence, sentence);
 | |
|   // Empty string: exact and non-unique.
 | |
|   TestSuffixLowerBound(sentence, us(""));
 | |
|   // Exact and unique suffix matches.
 | |
|   TestSuffixLowerBound(sentence, us("."));
 | |
|   TestSuffixLowerBound(sentence, us("the lazy dog."));
 | |
|   // Exact and unique non-suffix matches.
 | |
|   TestSuffixLowerBound(sentence, us("quick"));
 | |
|   TestSuffixLowerBound(sentence, us("the quick"));
 | |
|   // Partial and unique matches.
 | |
|   TestSuffixLowerBound(sentence, us("fox jumps with the hosps"));
 | |
|   TestSuffixLowerBound(sentence, us("xyz"));
 | |
|   // Exact and non-unique match: take lexicographical first.
 | |
|   TestSuffixLowerBound(sentence, us("the"));
 | |
|   TestSuffixLowerBound(sentence, us(" "));
 | |
|   // Partial and non-unique match.
 | |
|   // query      < "the l"... < "the q"...
 | |
|   TestSuffixLowerBound(sentence, us("the apple"));
 | |
|   // "the l"... < query      < "the q"...
 | |
|   TestSuffixLowerBound(sentence, us("the opera"));
 | |
|   // "the l"... < "the q"... < query
 | |
|   TestSuffixLowerBound(sentence, us("the zebra"));
 | |
|   // Prefix match dominates suffix match (unique).
 | |
|   TestSuffixLowerBound(sentence, us("over quick brown fox"));
 | |
|   // Empty matchs.
 | |
|   TestSuffixLowerBound(sentence, us(","));
 | |
|   TestSuffixLowerBound(sentence, us("1234"));
 | |
|   TestSuffixLowerBound(sentence, us("THE QUICK BROWN FOX"));
 | |
|   TestSuffixLowerBound(sentence, us("(the"));
 | |
| }
 | |
| 
 | |
| TEST(SuffixArrayTest, LowerBoundExact) {
 | |
|   for (const std::string& test_str : test_strs) {
 | |
|     ustring test_ustr = MakeUnsignedString(test_str);
 | |
| 
 | |
|     std::vector<size_t> suffix_array =
 | |
|         MakeSuffixArray<InducedSuffixSort>(test_ustr, kNumChar);
 | |
| 
 | |
|     for (size_t lo = 0; lo < test_str.size(); ++lo) {
 | |
|       for (size_t hi = lo + 1; hi <= test_str.size(); ++hi) {
 | |
|         ustring query(test_ustr.begin() + lo, test_ustr.begin() + hi);
 | |
|         ASSERT_EQ(query.size(), hi - lo);
 | |
|         auto pos = SuffixLowerBound(suffix_array, test_ustr.begin(),
 | |
|                                     query.begin(), query.end());
 | |
|         EXPECT_TRUE(
 | |
|             std::equal(query.begin(), query.end(), test_ustr.begin() + *pos));
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| }  // namespace zucchini
 |