133 lines
5.2 KiB
C++
133 lines
5.2 KiB
C++
// Copyright 2017 The Chromium Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file.
|
|
|
|
#include "components/zucchini/binary_data_histogram.h"
|
|
|
|
#include <stddef.h>
|
|
|
|
#include <memory>
|
|
#include <vector>
|
|
|
|
#include "components/zucchini/buffer_view.h"
|
|
#include "testing/gtest/include/gtest/gtest.h"
|
|
|
|
namespace zucchini {
|
|
|
|
TEST(OutlierDetectorTest, Basic) {
|
|
auto make_detector = [](const std::vector<double>& values) {
|
|
auto detector = std::make_unique<OutlierDetector>();
|
|
for (double v : values)
|
|
detector->Add(v);
|
|
detector->Prepare();
|
|
return detector;
|
|
};
|
|
|
|
std::unique_ptr<OutlierDetector> detector;
|
|
// No data: Should at least not cause error.
|
|
detector = make_detector({});
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.0));
|
|
// Single point: Trivially inert.
|
|
detector = make_detector({0.5});
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.1));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.5));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.9));
|
|
// Two identical points: StdDev is 0, so falls back to built-in tolerance.
|
|
detector = make_detector({0.5, 0.5});
|
|
EXPECT_EQ(-1, detector->DecideOutlier(0.3));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.499));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.5));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.501));
|
|
EXPECT_EQ(1, detector->DecideOutlier(0.7));
|
|
// Two separate points: Outliner test is pretty lax.
|
|
detector = make_detector({0.4, 0.6});
|
|
EXPECT_EQ(-1, detector->DecideOutlier(0.2));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.3));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.5));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.7));
|
|
EXPECT_EQ(1, detector->DecideOutlier(0.8));
|
|
// Sharpen distribution by clustering toward norm: Now test is stricter.
|
|
detector = make_detector({0.4, 0.47, 0.48, 0.49, 0.50, 0.51, 0.52, 0.6});
|
|
EXPECT_EQ(-1, detector->DecideOutlier(0.3));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.4));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.5));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.6));
|
|
EXPECT_EQ(1, detector->DecideOutlier(0.7));
|
|
// Shift numbers around: Mean is 0.3, and data order scrambled.
|
|
detector = make_detector({0.28, 0.2, 0.31, 0.4, 0.29, 0.32, 0.27, 0.30});
|
|
EXPECT_EQ(-1, detector->DecideOutlier(0.0));
|
|
EXPECT_EQ(-1, detector->DecideOutlier(0.1));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.2));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.3));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.4));
|
|
EXPECT_EQ(1, detector->DecideOutlier(0.5));
|
|
EXPECT_EQ(1, detector->DecideOutlier(1.0));
|
|
// Typical usage: Potential outlier would be part of original input data!
|
|
detector = make_detector({0.3, 0.29, 0.31, 0.0, 0.3, 0.32, 0.3, 0.29, 0.6});
|
|
EXPECT_EQ(-1, detector->DecideOutlier(0.0));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.28));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.29));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.3));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.31));
|
|
EXPECT_EQ(0, detector->DecideOutlier(0.32));
|
|
EXPECT_EQ(1, detector->DecideOutlier(0.6));
|
|
}
|
|
|
|
TEST(BinaryDataHistogramTest, Basic) {
|
|
constexpr double kUninitScore = -1;
|
|
|
|
constexpr uint8_t kTestData[] = {2, 137, 42, 0, 0, 0, 7, 11, 1, 11, 255};
|
|
const size_t n = sizeof(kTestData);
|
|
ConstBufferView region(kTestData, n);
|
|
|
|
std::vector<BinaryDataHistogram> prefix_histograms(n + 1); // Short to long.
|
|
std::vector<BinaryDataHistogram> suffix_histograms(n + 1); // Long to short.
|
|
|
|
for (size_t i = 0; i <= n; ++i) {
|
|
ConstBufferView prefix(region.begin(), i);
|
|
ConstBufferView suffix(region.begin() + i, n - i);
|
|
// If regions are smaller than 2 bytes then it is invalid. Else valid.
|
|
EXPECT_EQ(prefix.size() >= 2, prefix_histograms[i].Compute(prefix));
|
|
EXPECT_EQ(suffix.size() >= 2, suffix_histograms[i].Compute(suffix));
|
|
// IsValid() returns the same results.
|
|
EXPECT_EQ(prefix.size() >= 2, prefix_histograms[i].IsValid());
|
|
EXPECT_EQ(suffix.size() >= 2, suffix_histograms[i].IsValid());
|
|
}
|
|
|
|
// Full-prefix = full-suffix = full data.
|
|
EXPECT_EQ(0.0, prefix_histograms[n].Distance(suffix_histograms[0]));
|
|
EXPECT_EQ(0.0, suffix_histograms[0].Distance(prefix_histograms[n]));
|
|
|
|
// Testing heuristics without overreliance on implementation details.
|
|
|
|
// Strict prefixes, in increasing size. Compare against full data.
|
|
double prev_prefix_score = kUninitScore;
|
|
for (size_t i = 2; i < n; ++i) {
|
|
double score = prefix_histograms[i].Distance(prefix_histograms[n]);
|
|
// Positivity.
|
|
EXPECT_GT(score, 0.0);
|
|
// Symmetry.
|
|
EXPECT_EQ(score, prefix_histograms[n].Distance(prefix_histograms[i]));
|
|
// Distance should decrease as prefix gets nearer to full data.
|
|
if (prev_prefix_score != kUninitScore)
|
|
EXPECT_LT(score, prev_prefix_score);
|
|
prev_prefix_score = score;
|
|
}
|
|
|
|
// Strict suffixes, in decreasing size. Compare against full data.
|
|
double prev_suffix_score = -1;
|
|
for (size_t i = 1; i <= n - 2; ++i) {
|
|
double score = suffix_histograms[i].Distance(suffix_histograms[0]);
|
|
// Positivity.
|
|
EXPECT_GT(score, 0.0);
|
|
// Symmetry.
|
|
EXPECT_EQ(score, suffix_histograms[0].Distance(suffix_histograms[i]));
|
|
// Distance should increase as suffix gets farther from full data.
|
|
if (prev_suffix_score != kUninitScore)
|
|
EXPECT_GT(score, prev_suffix_score);
|
|
prev_suffix_score = score;
|
|
}
|
|
}
|
|
|
|
} // namespace zucchini
|