476 lines
20 KiB
C++
476 lines
20 KiB
C++
// Copyright 2017 The Chromium Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file.
|
|
|
|
#ifndef COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
|
|
#define COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
|
|
|
|
#include <algorithm>
|
|
#include <iterator>
|
|
#include <numeric>
|
|
#include <vector>
|
|
|
|
#include "base/check.h"
|
|
|
|
namespace zucchini {
|
|
|
|
// A functor class that implements the naive suffix sorting algorithm that uses
|
|
// std::sort with lexicographical compare. This is only meant as reference of
|
|
// the interface.
|
|
class NaiveSuffixSort {
|
|
public:
|
|
// Type requirements:
|
|
// |InputRng| is an input random access range.
|
|
// |KeyType| is an unsigned integer type.
|
|
// |SAIt| is a random access iterator with mutable references.
|
|
template <class InputRng, class KeyType, class SAIt>
|
|
// |str| is the input string on which suffix sort is applied.
|
|
// Characters found in |str| must be in the range [0, |key_bound|)
|
|
// |suffix_array| is the beginning of the destination range, which is at least
|
|
// as large as |str|.
|
|
void operator()(const InputRng& str,
|
|
KeyType key_bound,
|
|
SAIt suffix_array) const {
|
|
using size_type = typename SAIt::value_type;
|
|
|
|
size_type n = static_cast<size_type>(std::end(str) - std::begin(str));
|
|
|
|
// |suffix_array| is first filled with ordered indices of |str|.
|
|
// Those indices are then sorted with lexicographical comparisons in |str|.
|
|
std::iota(suffix_array, suffix_array + n, 0);
|
|
std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) {
|
|
return std::lexicographical_compare(std::begin(str) + i, std::end(str),
|
|
std::begin(str) + j, std::end(str));
|
|
});
|
|
}
|
|
};
|
|
|
|
// A functor class that implements suffix array induced sorting (SA-IS)
|
|
// algorithm with linear time and memory complexity,
|
|
// see http://ieeexplore.ieee.org/abstract/document/5582081/
|
|
class InducedSuffixSort {
|
|
public:
|
|
// Type requirements:
|
|
// |InputRng| is an input random access range.
|
|
// |KeyType| is an unsigned integer type.
|
|
// |SAIt| is a random access iterator with mutable values.
|
|
template <class InputRng, class KeyType, class SAIt>
|
|
// |str| is the input string on which suffix sort is applied.
|
|
// Characters found in |str| must be in the range [0, |key_bound|)
|
|
// |suffix_array| is the beginning of the destination range, which is at least
|
|
// as large as |str|.
|
|
void operator()(const InputRng& str,
|
|
KeyType key_bound,
|
|
SAIt suffix_array) const {
|
|
using value_type = typename InputRng::value_type;
|
|
using size_type = typename SAIt::value_type;
|
|
|
|
static_assert(std::is_unsigned<value_type>::value,
|
|
"SA-IS only supports input string with unsigned values");
|
|
static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
|
|
|
|
size_type n = static_cast<size_type>(std::end(str) - std::begin(str));
|
|
|
|
Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n,
|
|
key_bound, suffix_array);
|
|
}
|
|
|
|
// Given string S of length n. We assume S is terminated by a unique sentinel
|
|
// $, which is considered as the smallest character. This sentinel does not
|
|
// exist in memory and is only treated implicitly, hence |n| does not count
|
|
// the sentinel in this implementation. We denote suf(S,i) the suffix formed
|
|
// by S[i..n).
|
|
|
|
// A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1)
|
|
// or suf(S,i) > suf(S,i+1), respectively.
|
|
enum SLType : bool { SType, LType };
|
|
|
|
// A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is
|
|
// S-type or L-type, respectively.
|
|
|
|
// A character S[i] is called LMS (leftmost S-type), if S[i] is S-type and
|
|
// S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS
|
|
// character.
|
|
|
|
// A substring S[i..j) is an LMS-substring if
|
|
// (1) S[i] is LMS, S[j] is LMS or the sentinel $, and S[i..j) has no other
|
|
// LMS characters, or
|
|
// (2) S[i..j) is the sentinel $.
|
|
|
|
template <class SizeType, class KeyType>
|
|
struct Implementation {
|
|
static_assert(std::is_unsigned<SizeType>::value,
|
|
"SizeType must be unsigned");
|
|
static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
|
|
using size_type = SizeType;
|
|
using key_type = KeyType;
|
|
|
|
using iterator = typename std::vector<size_type>::iterator;
|
|
using const_iterator = typename std::vector<size_type>::const_iterator;
|
|
|
|
// Partition every suffix based on SL-type. Returns the number of LMS
|
|
// suffixes.
|
|
template <class StrIt>
|
|
static size_type BuildSLPartition(
|
|
StrIt str,
|
|
size_type length,
|
|
key_type key_bound,
|
|
std::vector<SLType>::reverse_iterator sl_partition_it) {
|
|
// We will count LMS suffixes (S to L-type or last S-type).
|
|
size_type lms_count = 0;
|
|
|
|
// |previous_type| is initialized to L-type to avoid counting an extra
|
|
// LMS suffix at the end
|
|
SLType previous_type = LType;
|
|
|
|
// Initialized to dummy, impossible key.
|
|
key_type previous_key = key_bound;
|
|
|
|
// We're travelling backward to determine the partition,
|
|
// as if we prepend one character at a time to the string, ex:
|
|
// b$ is L-type because b > $.
|
|
// ab$ is S-type because a < b, implying ab$ < b$.
|
|
// bab$ is L-type because b > a, implying bab$ > ab$.
|
|
// bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$.
|
|
for (auto str_it = std::reverse_iterator<StrIt>(str + length);
|
|
str_it != std::reverse_iterator<StrIt>(str);
|
|
++str_it, ++sl_partition_it) {
|
|
key_type current_key = *str_it;
|
|
|
|
if (current_key > previous_key || previous_key == key_bound) {
|
|
// S[i] > S[i + 1] or S[i] is last character.
|
|
if (previous_type == SType)
|
|
// suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore,
|
|
// suf(S,i+1) was a LMS suffix.
|
|
++lms_count;
|
|
|
|
previous_type = LType; // For next round.
|
|
} else if (current_key < previous_key) {
|
|
// S[i] < S[i + 1]
|
|
previous_type = SType; // For next round.
|
|
}
|
|
// Else, S[i] == S[i + 1]:
|
|
// The next character that differs determines the SL-type,
|
|
// so we reuse the last seen type.
|
|
|
|
*sl_partition_it = previous_type;
|
|
previous_key = current_key; // For next round.
|
|
}
|
|
|
|
return lms_count;
|
|
}
|
|
|
|
// Find indices of LMS suffixes and write result to |lms_indices|.
|
|
static void FindLmsSuffixes(const std::vector<SLType>& sl_partition,
|
|
iterator lms_indices) {
|
|
// |previous_type| is initialized to S-type to avoid counting an extra
|
|
// LMS suffix at the beginning
|
|
SLType previous_type = SType;
|
|
for (size_type i = 0; i < sl_partition.size(); ++i) {
|
|
if (sl_partition[i] == SType && previous_type == LType)
|
|
*lms_indices++ = i;
|
|
previous_type = sl_partition[i];
|
|
}
|
|
}
|
|
|
|
template <class StrIt>
|
|
static std::vector<size_type> MakeBucketCount(StrIt str,
|
|
size_type length,
|
|
key_type key_bound) {
|
|
// Occurrence of every unique character is counted in |buckets|
|
|
std::vector<size_type> buckets(static_cast<size_type>(key_bound));
|
|
|
|
for (auto it = str; it != str + length; ++it)
|
|
++buckets[*it];
|
|
return buckets;
|
|
}
|
|
|
|
// Apply induced sort from |lms_indices| to |suffix_array| associated with
|
|
// the string |str|.
|
|
template <class StrIt, class SAIt>
|
|
static void InducedSort(StrIt str,
|
|
size_type length,
|
|
const std::vector<SLType>& sl_partition,
|
|
const std::vector<size_type>& lms_indices,
|
|
const std::vector<size_type>& buckets,
|
|
SAIt suffix_array) {
|
|
// All indices are first marked as unset with the illegal value |length|.
|
|
std::fill(suffix_array, suffix_array + length, length);
|
|
|
|
// Used to mark bucket boundaries (head or end) as indices in str.
|
|
DCHECK(!buckets.empty());
|
|
std::vector<size_type> bucket_bounds(buckets.size());
|
|
|
|
// Step 1: Assign indices for LMS suffixes, populating the end of
|
|
// respective buckets but keeping relative order.
|
|
|
|
// Find the end of each bucket and write it to |bucket_bounds|.
|
|
std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());
|
|
|
|
// Process each |lms_indices| backward, and assign them to the end of
|
|
// their respective buckets, so relative order is preserved.
|
|
for (auto it = lms_indices.crbegin(); it != lms_indices.crend(); ++it) {
|
|
key_type key = str[*it];
|
|
suffix_array[--bucket_bounds[key]] = *it;
|
|
}
|
|
|
|
// Step 2
|
|
// Scan forward |suffix_array|; for each modified suf(S,i) for which
|
|
// suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current
|
|
// head of the corresponding bucket and forward the bucket head to the
|
|
// right.
|
|
|
|
// Find the head of each bucket and write it to |bucket_bounds|. Since
|
|
// only LMS suffixes where inserted in |suffix_array| during Step 1,
|
|
// |bucket_bounds| does not contains the head of each bucket and needs to
|
|
// be updated.
|
|
bucket_bounds[0] = 0;
|
|
std::partial_sum(buckets.begin(), buckets.end() - 1,
|
|
bucket_bounds.begin() + 1);
|
|
|
|
// From Step 1, the sentinel $, which we treat implicitly, would have
|
|
// been placed at the beginning of |suffix_array|, since $ is always
|
|
// considered as the smallest character. We then have to deal with the
|
|
// previous (last) suffix.
|
|
if (sl_partition[length - 1] == LType) {
|
|
key_type key = str[length - 1];
|
|
suffix_array[bucket_bounds[key]++] = length - 1;
|
|
}
|
|
for (auto it = suffix_array; it != suffix_array + length; ++it) {
|
|
size_type suffix_index = *it;
|
|
|
|
// While the original algorithm marks unset suffixes with -1,
|
|
// we found that marking them with |length| is also possible and more
|
|
// convenient because we are working with unsigned integers.
|
|
if (suffix_index != length && suffix_index > 0 &&
|
|
sl_partition[--suffix_index] == LType) {
|
|
key_type key = str[suffix_index];
|
|
suffix_array[bucket_bounds[key]++] = suffix_index;
|
|
}
|
|
}
|
|
|
|
// Step 3
|
|
// Scan backward |suffix_array|; for each modified suf(S, i) for which
|
|
// suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current
|
|
// end of the corresponding bucket and forward the bucket head to the
|
|
// left.
|
|
|
|
// Find the end of each bucket and write it to |bucket_bounds|. Since
|
|
// only L-type suffixes where inserted in |suffix_array| during Step 2,
|
|
// |bucket_bounds| does not contain the end of each bucket and needs to
|
|
// be updated.
|
|
std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());
|
|
|
|
for (auto it = std::reverse_iterator<SAIt>(suffix_array + length);
|
|
it != std::reverse_iterator<SAIt>(suffix_array); ++it) {
|
|
size_type suffix_index = *it;
|
|
if (suffix_index != length && suffix_index > 0 &&
|
|
sl_partition[--suffix_index] == SType) {
|
|
key_type key = str[suffix_index];
|
|
suffix_array[--bucket_bounds[key]] = suffix_index;
|
|
}
|
|
}
|
|
// Deals with the last suffix, because of the sentinel.
|
|
if (sl_partition[length - 1] == SType) {
|
|
key_type key = str[length - 1];
|
|
suffix_array[--bucket_bounds[key]] = length - 1;
|
|
}
|
|
}
|
|
|
|
// Given a string S starting at |str| with length |length|, an array
|
|
// starting at |substring_array| containing lexicographically ordered LMS
|
|
// terminated substring indices of S and an SL-Type partition |sl_partition|
|
|
// of S, assigns a unique label to every unique LMS substring. The sorted
|
|
// labels for all LMS substrings are written to |lms_str|, while the indices
|
|
// of LMS suffixes are written to |lms_indices|. In addition, returns the
|
|
// total number of unique labels.
|
|
template <class StrIt, class SAIt>
|
|
static size_type LabelLmsSubstrings(StrIt str,
|
|
size_type length,
|
|
const std::vector<SLType>& sl_partition,
|
|
SAIt suffix_array,
|
|
iterator lms_indices,
|
|
iterator lms_str) {
|
|
// Labelling starts at 0.
|
|
size_type label = 0;
|
|
|
|
// |previous_lms| is initialized to 0 to indicate it is unset.
|
|
// Note that suf(S,0) is never a LMS suffix. Substrings will be visited in
|
|
// lexicographical order.
|
|
size_type previous_lms = 0;
|
|
for (auto it = suffix_array; it != suffix_array + length; ++it) {
|
|
if (*it > 0 && sl_partition[*it] == SType &&
|
|
sl_partition[*it - 1] == LType) {
|
|
// suf(S, *it) is a LMS suffix.
|
|
|
|
size_type current_lms = *it;
|
|
if (previous_lms != 0) {
|
|
// There was a previous LMS suffix. Check if the current LMS
|
|
// substring is equal to the previous one.
|
|
SLType current_lms_type = SType;
|
|
SLType previous_lms_type = SType;
|
|
for (size_type k = 0;; ++k) {
|
|
// |current_lms_end| and |previous_lms_end| denote whether we have
|
|
// reached the end of the current and previous LMS substring,
|
|
// respectively
|
|
bool current_lms_end = false;
|
|
bool previous_lms_end = false;
|
|
|
|
// Check for both previous and current substring ends.
|
|
// Note that it is more convenient to check if
|
|
// suf(S,current_lms + k) is an LMS suffix than to retrieve it
|
|
// from lms_indices.
|
|
if (current_lms + k >= length ||
|
|
(current_lms_type == LType &&
|
|
sl_partition[current_lms + k] == SType)) {
|
|
current_lms_end = true;
|
|
}
|
|
if (previous_lms + k >= length ||
|
|
(previous_lms_type == LType &&
|
|
sl_partition[previous_lms + k] == SType)) {
|
|
previous_lms_end = true;
|
|
}
|
|
|
|
if (current_lms_end && previous_lms_end) {
|
|
break; // Previous and current substrings are identical.
|
|
} else if (current_lms_end != previous_lms_end ||
|
|
str[current_lms + k] != str[previous_lms + k]) {
|
|
// Previous and current substrings differ, a new label is used.
|
|
++label;
|
|
break;
|
|
}
|
|
|
|
current_lms_type = sl_partition[current_lms + k];
|
|
previous_lms_type = sl_partition[previous_lms + k];
|
|
}
|
|
}
|
|
*lms_indices++ = *it;
|
|
*lms_str++ = label;
|
|
previous_lms = current_lms;
|
|
}
|
|
}
|
|
|
|
return label + 1;
|
|
}
|
|
|
|
// Implementation of the SA-IS algorithm. |str| must be a random access
|
|
// iterator pointing at the beginning of S with length |length|. The result
|
|
// is writtend in |suffix_array|, a random access iterator.
|
|
template <class StrIt, class SAIt>
|
|
static void SuffixSort(StrIt str,
|
|
size_type length,
|
|
key_type key_bound,
|
|
SAIt suffix_array) {
|
|
if (length == 1)
|
|
*suffix_array = 0;
|
|
if (length < 2)
|
|
return;
|
|
|
|
std::vector<SLType> sl_partition(length);
|
|
size_type lms_count =
|
|
BuildSLPartition(str, length, key_bound, sl_partition.rbegin());
|
|
std::vector<size_type> lms_indices(lms_count);
|
|
FindLmsSuffixes(sl_partition, lms_indices.begin());
|
|
std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound);
|
|
|
|
if (lms_indices.size() > 1) {
|
|
// Given |lms_indices| in the same order they appear in |str|, induce
|
|
// LMS substrings relative order and write result to |suffix_array|.
|
|
InducedSort(str, length, sl_partition, lms_indices, buckets,
|
|
suffix_array);
|
|
std::vector<size_type> lms_str(lms_indices.size());
|
|
|
|
// Given LMS substrings in relative order found in |suffix_array|,
|
|
// map LMS substrings to unique labels to form a new string, |lms_str|.
|
|
size_type label_count =
|
|
LabelLmsSubstrings(str, length, sl_partition, suffix_array,
|
|
lms_indices.begin(), lms_str.begin());
|
|
|
|
if (label_count < lms_str.size()) {
|
|
// Reorder |lms_str| to have LMS suffixes in the same order they
|
|
// appear in |str|.
|
|
for (size_type i = 0; i < lms_indices.size(); ++i)
|
|
suffix_array[lms_indices[i]] = lms_str[i];
|
|
|
|
SLType previous_type = SType;
|
|
for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) {
|
|
if (sl_partition[i] == SType && previous_type == LType) {
|
|
lms_str[j] = suffix_array[i];
|
|
lms_indices[j++] = i;
|
|
}
|
|
previous_type = sl_partition[i];
|
|
}
|
|
|
|
// Recursively apply SuffixSort on |lms_str|, which is formed from
|
|
// labeled LMS suffixes in the same order they appear in |str|.
|
|
// Note that |KeyType| will be size_type because |lms_str| contains
|
|
// indices. |lms_str| is at most half the length of |str|.
|
|
Implementation<size_type, size_type>::SuffixSort(
|
|
lms_str.begin(), static_cast<size_type>(lms_str.size()),
|
|
label_count, suffix_array);
|
|
|
|
// Map LMS labels back to indices in |str| and write result to
|
|
// |lms_indices|. We're using |suffix_array| as a temporary buffer.
|
|
for (size_type i = 0; i < lms_indices.size(); ++i)
|
|
suffix_array[i] = lms_indices[suffix_array[i]];
|
|
std::copy_n(suffix_array, lms_indices.size(), lms_indices.begin());
|
|
|
|
// At this point, |lms_indices| contains sorted LMS suffixes of |str|.
|
|
}
|
|
}
|
|
// Given |lms_indices| where LMS suffixes are sorted, induce the full
|
|
// order of suffixes in |str|.
|
|
InducedSort(str, length, sl_partition, lms_indices, buckets,
|
|
suffix_array);
|
|
}
|
|
|
|
Implementation() = delete;
|
|
Implementation(const Implementation&) = delete;
|
|
const Implementation& operator=(const Implementation&) = delete;
|
|
};
|
|
};
|
|
|
|
// Generates a sorted suffix array for the input string |str| using the functor
|
|
// |Algorithm| which provides an interface equivalent to NaiveSuffixSort.
|
|
/// Characters found in |str| are assumed to be in range [0, |key_bound|).
|
|
// Returns the suffix array as a vector.
|
|
// |StrRng| is an input random access range.
|
|
// |KeyType| is an unsigned integer type.
|
|
template <class Algorithm, class StrRng, class KeyType>
|
|
std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str,
|
|
KeyType key_bound) {
|
|
Algorithm sort;
|
|
std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin());
|
|
sort(str, key_bound, suffix_array.begin());
|
|
return suffix_array;
|
|
}
|
|
|
|
// Type requirements:
|
|
// |SARng| is an input random access range.
|
|
// |StrIt1| is a random access iterator.
|
|
// |StrIt2| is a forward iterator.
|
|
template <class SARng, class StrIt1, class StrIt2>
|
|
// Lexicographical lower bound using binary search for
|
|
// [|str2_first|, |str2_last|) in the suffix array |suffix_array| of a string
|
|
// starting at |str1_first|. This does not necessarily return the index of
|
|
// the longest matching substring.
|
|
auto SuffixLowerBound(const SARng& suffix_array,
|
|
StrIt1 str1_first,
|
|
StrIt2 str2_first,
|
|
StrIt2 str2_last) -> decltype(std::begin(suffix_array)) {
|
|
using size_type = typename SARng::value_type;
|
|
|
|
size_t n = std::end(suffix_array) - std::begin(suffix_array);
|
|
auto it = std::lower_bound(
|
|
std::begin(suffix_array), std::end(suffix_array), str2_first,
|
|
[str1_first, str2_last, n](size_type a, StrIt2 b) {
|
|
return std::lexicographical_compare(str1_first + a, str1_first + n, b,
|
|
str2_last);
|
|
});
|
|
return it;
|
|
}
|
|
|
|
} // namespace zucchini
|
|
|
|
#endif // COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
|