370 lines
13 KiB
C++
370 lines
13 KiB
C++
|
// Copyright 2017 The Chromium Authors. All rights reserved.
|
||
|
// Use of this source code is governed by a BSD-style license that can be
|
||
|
// found in the LICENSE file.
|
||
|
|
||
|
#include "components/zucchini/heuristic_ensemble_matcher.h"
|
||
|
|
||
|
#include <algorithm>
|
||
|
#include <memory>
|
||
|
#include <string>
|
||
|
#include <utility>
|
||
|
#include <vector>
|
||
|
|
||
|
#include "base/bind.h"
|
||
|
#include "base/logging.h"
|
||
|
#include "base/numerics/safe_conversions.h"
|
||
|
#include "base/strings/stringprintf.h"
|
||
|
#include "components/zucchini/binary_data_histogram.h"
|
||
|
#include "components/zucchini/element_detection.h"
|
||
|
#include "components/zucchini/image_utils.h"
|
||
|
#include "components/zucchini/io_utils.h"
|
||
|
|
||
|
namespace zucchini {
|
||
|
|
||
|
namespace {
|
||
|
|
||
|
/******** Helper Functions ********/
|
||
|
|
||
|
// Uses |detector| to find embedded executables inside |image|, and returns the
|
||
|
// result on success, or absl::nullopt on failure, which occurs if too many (>
|
||
|
// |kElementLimit|) elements are found.
|
||
|
absl::optional<std::vector<Element>> FindEmbeddedElements(
|
||
|
ConstBufferView image,
|
||
|
const std::string& name,
|
||
|
ElementDetector&& detector) {
|
||
|
// Maximum number of Elements in a file. This is enforced because our matching
|
||
|
// algorithm is O(n^2), which suffices for regular archive files that should
|
||
|
// have up to 10's of executable files. An archive containing 100's of
|
||
|
// executables is likely pathological, and is rejected to prevent exploits.
|
||
|
static constexpr size_t kElementLimit = 256;
|
||
|
std::vector<Element> elements;
|
||
|
ElementFinder element_finder(image, std::move(detector));
|
||
|
for (auto element = element_finder.GetNext();
|
||
|
element.has_value() && elements.size() <= kElementLimit;
|
||
|
element = element_finder.GetNext()) {
|
||
|
elements.push_back(*element);
|
||
|
}
|
||
|
if (elements.size() >= kElementLimit) {
|
||
|
LOG(WARNING) << name << ": Found too many elements.";
|
||
|
return absl::nullopt;
|
||
|
}
|
||
|
LOG(INFO) << name << ": Found " << elements.size() << " elements.";
|
||
|
return elements;
|
||
|
}
|
||
|
|
||
|
// Determines whether a proposed comparison between Elements should be rejected
|
||
|
// early, to decrease the likelihood of creating false-positive matches, which
|
||
|
// may be costly for patching. Our heuristic simply prohibits big difference in
|
||
|
// size (relative and absolute) between matched elements.
|
||
|
bool UnsafeDifference(const Element& old_element, const Element& new_element) {
|
||
|
static constexpr double kMaxBloat = 2.0;
|
||
|
static constexpr size_t kMinWorrysomeDifference = 2 << 20; // 2MB
|
||
|
size_t lo_size = std::min(old_element.size, new_element.size);
|
||
|
size_t hi_size = std::max(old_element.size, new_element.size);
|
||
|
if (hi_size - lo_size < kMinWorrysomeDifference)
|
||
|
return false;
|
||
|
if (hi_size < lo_size * kMaxBloat)
|
||
|
return false;
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
std::ostream& operator<<(std::ostream& stream, const Element& elt) {
|
||
|
stream << "(" << CastExecutableTypeToString(elt.exe_type) << ", "
|
||
|
<< AsHex<8, size_t>(elt.offset) << " +" << AsHex<8, size_t>(elt.size)
|
||
|
<< ")";
|
||
|
return stream;
|
||
|
}
|
||
|
|
||
|
/******** MatchingInfoOut ********/
|
||
|
|
||
|
// A class to output detailed information during ensemble matching. Extracting
|
||
|
// the functionality to a separate class decouples formatting and printing logic
|
||
|
// from matching logic. The base class consists of stubs.
|
||
|
class MatchingInfoOut {
|
||
|
protected:
|
||
|
MatchingInfoOut() = default;
|
||
|
MatchingInfoOut(const MatchingInfoOut&) = delete;
|
||
|
const MatchingInfoOut& operator=(const MatchingInfoOut&) = delete;
|
||
|
|
||
|
public:
|
||
|
virtual ~MatchingInfoOut() = default;
|
||
|
virtual void InitSizes(size_t old_size, size_t new_size) {}
|
||
|
virtual void DeclareTypeMismatch(int iold, int inew) {}
|
||
|
virtual void DeclareUnsafeDistance(int iold, int inew) {}
|
||
|
virtual void DeclareCandidate(int iold, int inew) {}
|
||
|
virtual void DeclareMatch(int iold,
|
||
|
int inew,
|
||
|
double dist,
|
||
|
bool is_identical) {}
|
||
|
virtual void DeclareOutlier(int iold, int inew) {}
|
||
|
|
||
|
virtual void OutputCompare(const Element& old_element,
|
||
|
const Element& new_element,
|
||
|
double dist) {}
|
||
|
|
||
|
virtual void OutputMatch(const Element& best_old_element,
|
||
|
const Element& new_element,
|
||
|
bool is_identical,
|
||
|
double best_dist) {}
|
||
|
|
||
|
virtual void OutputScores(const std::string& stats) {}
|
||
|
|
||
|
virtual void OutputTextGrid() {}
|
||
|
};
|
||
|
|
||
|
/******** MatchingInfoTerse ********/
|
||
|
|
||
|
// A terse MatchingInfoOut that prints only basic information, using LOG().
|
||
|
class MatchingInfoOutTerse : public MatchingInfoOut {
|
||
|
public:
|
||
|
MatchingInfoOutTerse() = default;
|
||
|
MatchingInfoOutTerse(const MatchingInfoOutTerse&) = delete;
|
||
|
const MatchingInfoOutTerse& operator=(const MatchingInfoOutTerse&) = delete;
|
||
|
~MatchingInfoOutTerse() override = default;
|
||
|
|
||
|
void OutputScores(const std::string& stats) override {
|
||
|
LOG(INFO) << "Best dists: " << stats;
|
||
|
}
|
||
|
};
|
||
|
|
||
|
/******** MatchingInfoOutVerbose ********/
|
||
|
|
||
|
// A verbose MatchingInfoOut that prints detailed information using |out_|,
|
||
|
// including comparison pairs, scores, and a text grid representation of
|
||
|
// pairwise matching results.
|
||
|
class MatchingInfoOutVerbose : public MatchingInfoOut {
|
||
|
public:
|
||
|
explicit MatchingInfoOutVerbose(std::ostream& out) : out_(out) {}
|
||
|
MatchingInfoOutVerbose(const MatchingInfoOutVerbose&) = delete;
|
||
|
const MatchingInfoOutVerbose& operator=(const MatchingInfoOutVerbose&) =
|
||
|
delete;
|
||
|
~MatchingInfoOutVerbose() override = default;
|
||
|
|
||
|
// Outputs sizes and initializes |text_grid_|.
|
||
|
void InitSizes(size_t old_size, size_t new_size) override {
|
||
|
out_ << "Comparing old (" << old_size << " elements) and new (" << new_size
|
||
|
<< " elements)" << std::endl;
|
||
|
text_grid_.assign(new_size, std::string(old_size, '-'));
|
||
|
best_dist_.assign(new_size, -1.0);
|
||
|
}
|
||
|
|
||
|
// Functions to update match status in text grid representation.
|
||
|
|
||
|
void DeclareTypeMismatch(int iold, int inew) override {
|
||
|
text_grid_[inew][iold] = 'T';
|
||
|
}
|
||
|
void DeclareUnsafeDistance(int iold, int inew) override {
|
||
|
text_grid_[inew][iold] = 'U';
|
||
|
}
|
||
|
void DeclareCandidate(int iold, int inew) override {
|
||
|
text_grid_[inew][iold] = 'C'; // Provisional.
|
||
|
}
|
||
|
void DeclareMatch(int iold,
|
||
|
int inew,
|
||
|
double dist,
|
||
|
bool is_identical) override {
|
||
|
text_grid_[inew][iold] = is_identical ? 'I' : 'M';
|
||
|
best_dist_[inew] = dist;
|
||
|
}
|
||
|
void DeclareOutlier(int iold, int inew) override {
|
||
|
text_grid_[inew][iold] = 'O';
|
||
|
}
|
||
|
|
||
|
// Functions to print detailed information.
|
||
|
|
||
|
void OutputCompare(const Element& old_element,
|
||
|
const Element& new_element,
|
||
|
double dist) override {
|
||
|
out_ << "Compare old" << old_element << " to new" << new_element << " --> "
|
||
|
<< base::StringPrintf("%.5f", dist) << std::endl;
|
||
|
}
|
||
|
|
||
|
void OutputMatch(const Element& best_old_element,
|
||
|
const Element& new_element,
|
||
|
bool is_identical,
|
||
|
double best_dist) override {
|
||
|
if (is_identical) {
|
||
|
out_ << "Skipped old" << best_old_element << " - identical to new"
|
||
|
<< new_element;
|
||
|
} else {
|
||
|
out_ << "Matched old" << best_old_element << " to new" << new_element
|
||
|
<< " --> " << base::StringPrintf("%.5f", best_dist);
|
||
|
}
|
||
|
out_ << std::endl;
|
||
|
}
|
||
|
|
||
|
void OutputScores(const std::string& stats) override {
|
||
|
out_ << "Best dists: " << stats << std::endl;
|
||
|
}
|
||
|
|
||
|
void OutputTextGrid() override {
|
||
|
int new_size = static_cast<int>(text_grid_.size());
|
||
|
for (int inew = 0; inew < new_size; ++inew) {
|
||
|
const std::string& line = text_grid_[inew];
|
||
|
out_ << " ";
|
||
|
for (char ch : line) {
|
||
|
char prefix = (ch == 'I' || ch == 'M') ? '(' : ' ';
|
||
|
char suffix = (ch == 'I' || ch == 'M') ? ')' : ' ';
|
||
|
out_ << prefix << ch << suffix;
|
||
|
}
|
||
|
if (best_dist_[inew] >= 0)
|
||
|
out_ << " " << base::StringPrintf("%.5f", best_dist_[inew]);
|
||
|
out_ << std::endl;
|
||
|
}
|
||
|
if (!text_grid_.empty()) {
|
||
|
out_ << " Legend: I = identical, M = matched, T = type mismatch, "
|
||
|
"U = unsafe distance, C = candidate, O = outlier, - = skipped."
|
||
|
<< std::endl;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
std::ostream& out_;
|
||
|
|
||
|
// Text grid representation of matches. Rows correspond to "old" and columns
|
||
|
// correspond to "new".
|
||
|
std::vector<std::string> text_grid_;
|
||
|
|
||
|
// For each "new" element, distance of best match. -1 denotes no match.
|
||
|
std::vector<double> best_dist_;
|
||
|
};
|
||
|
|
||
|
} // namespace
|
||
|
|
||
|
/******** HeuristicEnsembleMatcher ********/
|
||
|
|
||
|
HeuristicEnsembleMatcher::HeuristicEnsembleMatcher(std::ostream* out)
|
||
|
: out_(out) {}
|
||
|
|
||
|
HeuristicEnsembleMatcher::~HeuristicEnsembleMatcher() = default;
|
||
|
|
||
|
bool HeuristicEnsembleMatcher::RunMatch(ConstBufferView old_image,
|
||
|
ConstBufferView new_image) {
|
||
|
DCHECK(matches_.empty());
|
||
|
LOG(INFO) << "Start matching.";
|
||
|
|
||
|
// Find all elements in "old" and "new".
|
||
|
absl::optional<std::vector<Element>> old_elements =
|
||
|
FindEmbeddedElements(old_image, "Old file",
|
||
|
base::BindRepeating(DetectElementFromDisassembler));
|
||
|
if (!old_elements.has_value())
|
||
|
return false;
|
||
|
absl::optional<std::vector<Element>> new_elements =
|
||
|
FindEmbeddedElements(new_image, "New file",
|
||
|
base::BindRepeating(DetectElementFromDisassembler));
|
||
|
if (!new_elements.has_value())
|
||
|
return false;
|
||
|
|
||
|
std::unique_ptr<MatchingInfoOut> info_out;
|
||
|
if (out_)
|
||
|
info_out = std::make_unique<MatchingInfoOutVerbose>(*out_);
|
||
|
else
|
||
|
info_out = std::make_unique<MatchingInfoOutTerse>();
|
||
|
|
||
|
const int num_new_elements = base::checked_cast<int>(new_elements->size());
|
||
|
const int num_old_elements = base::checked_cast<int>(old_elements->size());
|
||
|
info_out->InitSizes(num_old_elements, num_new_elements);
|
||
|
|
||
|
// For each "new" element, match it with the "old" element that's nearest to
|
||
|
// it, with distance determined by BinaryDataHistogram. The resulting
|
||
|
// "old"-"new" pairs are stored into |results|. Possibilities:
|
||
|
// - Type mismatch: No match.
|
||
|
// - UnsafeDifference() heuristics fail: No match.
|
||
|
// - Identical match: Skip "new" since this is a trivial case.
|
||
|
// - Non-identical match: Match "new" with "old" with min distance.
|
||
|
// - No match: Skip "new".
|
||
|
struct Results {
|
||
|
int iold;
|
||
|
int inew;
|
||
|
double dist;
|
||
|
};
|
||
|
std::vector<Results> results;
|
||
|
|
||
|
// Precompute histograms for "old" since they get reused.
|
||
|
std::vector<BinaryDataHistogram> old_his(num_old_elements);
|
||
|
for (int iold = 0; iold < num_old_elements; ++iold) {
|
||
|
ConstBufferView sub_image(old_image[(*old_elements)[iold]]);
|
||
|
old_his[iold].Compute(sub_image);
|
||
|
// ProgramDetector should have imposed minimal size limit to |sub_image|.
|
||
|
// Therefore resulting histogram are expected to be valid.
|
||
|
CHECK(old_his[iold].IsValid());
|
||
|
}
|
||
|
|
||
|
const int kUninitIold = num_old_elements;
|
||
|
for (int inew = 0; inew < num_new_elements; ++inew) {
|
||
|
const Element& cur_new_element = (*new_elements)[inew];
|
||
|
ConstBufferView cur_new_sub_image(new_image[cur_new_element.region()]);
|
||
|
BinaryDataHistogram new_his;
|
||
|
new_his.Compute(cur_new_sub_image);
|
||
|
CHECK(new_his.IsValid());
|
||
|
|
||
|
double best_dist = HUGE_VAL;
|
||
|
int best_iold = kUninitIold;
|
||
|
bool is_identical = false;
|
||
|
|
||
|
for (int iold = 0; iold < num_old_elements; ++iold) {
|
||
|
const Element& cur_old_element = (*old_elements)[iold];
|
||
|
if (cur_old_element.exe_type != cur_new_element.exe_type) {
|
||
|
info_out->DeclareTypeMismatch(iold, inew);
|
||
|
continue;
|
||
|
}
|
||
|
if (UnsafeDifference(cur_old_element, cur_new_element)) {
|
||
|
info_out->DeclareUnsafeDistance(iold, inew);
|
||
|
continue;
|
||
|
}
|
||
|
double dist = old_his[iold].Distance(new_his);
|
||
|
info_out->DeclareCandidate(iold, inew);
|
||
|
info_out->OutputCompare(cur_old_element, cur_new_element, dist);
|
||
|
if (best_dist > dist) { // Tie resolution: First-one, first-serve.
|
||
|
best_iold = iold;
|
||
|
best_dist = dist;
|
||
|
if (best_dist == 0) {
|
||
|
ConstBufferView sub_image(old_image[cur_old_element.region()]);
|
||
|
if (sub_image.equals(cur_new_sub_image)) {
|
||
|
is_identical = true;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (best_iold != kUninitIold) {
|
||
|
const Element& best_old_element = (*old_elements)[best_iold];
|
||
|
info_out->DeclareMatch(best_iold, inew, best_dist, is_identical);
|
||
|
if (is_identical) // Skip "new" if identical match is found.
|
||
|
++num_identical_;
|
||
|
else
|
||
|
results.push_back({best_iold, inew, best_dist});
|
||
|
info_out->OutputMatch(best_old_element, cur_new_element, is_identical,
|
||
|
best_dist);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Populate |matches_| from |result|. To reduce that chance of false-positive
|
||
|
// matches, statistics on dists are computed. If a match's |dist| is an
|
||
|
// outlier then it is rejected.
|
||
|
if (results.size() > 0) {
|
||
|
OutlierDetector detector;
|
||
|
for (const auto& result : results) {
|
||
|
if (result.dist > 0)
|
||
|
detector.Add(result.dist);
|
||
|
}
|
||
|
detector.Prepare();
|
||
|
info_out->OutputScores(detector.RenderStats());
|
||
|
for (const Results& result : results) {
|
||
|
if (detector.DecideOutlier(result.dist) > 0) {
|
||
|
info_out->DeclareOutlier(result.iold, result.inew);
|
||
|
} else {
|
||
|
matches_.push_back(
|
||
|
{(*old_elements)[result.iold], (*new_elements)[result.inew]});
|
||
|
}
|
||
|
}
|
||
|
info_out->OutputTextGrid();
|
||
|
}
|
||
|
|
||
|
Trim();
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
} // namespace zucchini
|