99 lines
3.9 KiB
C++
99 lines
3.9 KiB
C++
/*
|
|
* Copyright (C) 2018 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_CHAR_NGRAM_FEATURE_H_
|
|
#define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_CHAR_NGRAM_FEATURE_H_
|
|
|
|
#include <mutex>  // NOLINT: see comments for state_mutex_
#include <string>
#include <vector>

#include "lang_id/common/fel/feature-extractor.h"
#include "lang_id/common/fel/task-context.h"
#include "lang_id/common/fel/workspace.h"
#include "lang_id/features/light-sentence-features.h"
#include "lang_id/light-sentence.h"
|
|
|
|
// TODO(abakalov): Add a test.
|
|
namespace libtextclassifier3 {
|
|
namespace mobile {
|
|
namespace lang_id {
|
|
|
|
// Class for computing continuous char ngram features.
|
|
//
|
|
// Feature function descriptor parameters:
|
|
// include_terminators(bool, false):
|
|
// If 'true', then splits the text based on spaces to get tokens, adds "^"
|
|
// to the beginning of each token, and adds "$" to the end of each token.
|
|
// NOTE: currently, we support only include_terminators=true.
|
|
// include_spaces(bool, false):
|
|
// If 'true', then includes char ngrams containing spaces.
|
|
// NOTE: currently, we support only include_spaces=false.
|
|
// use_equal_weight(bool, false):
|
|
// If 'true', then weighs each unique ngram by 1.0 / (number of unique
|
|
// ngrams in the input). Otherwise, weighs each unique ngram by (ngram
|
|
// count) / (total number of ngrams).
|
|
// NOTE: currently, we support only use_equal_weight=false.
|
|
// id_dim(int, 10000):
|
|
// The integer id of each char ngram is computed as follows:
|
|
// Hash32WithDefault(char ngram) % id_dim.
|
|
// size(int, 3):
|
|
// Only ngrams of this size will be extracted.
|
|
//
|
|
// NOTE: this class is not thread-safe. TODO(salcianu): make it thread-safe.
|
|
class ContinuousBagOfNgramsFunction : public LightSentenceFeature {
|
|
public:
|
|
bool Setup(TaskContext *context) override;
|
|
bool Init(TaskContext *context) override;
|
|
|
|
// Appends the features computed from the sentence to the feature vector.
|
|
void Evaluate(const WorkspaceSet &workspaces, const LightSentence &sentence,
|
|
FeatureVector *result) const override;
|
|
|
|
SAFTM_DEFINE_REGISTRATION_METHOD("continuous-bag-of-ngrams",
|
|
ContinuousBagOfNgramsFunction);
|
|
|
|
private:
|
|
// Auxiliary for Evaluate(). Fills counts_ and non_zero_count_indices_ (see
|
|
// below), and returns the total ngram count.
|
|
int ComputeNgramCounts(const LightSentence &sentence) const;
|
|
|
|
// Guards counts_ and non_zero_count_indices_. NOTE: we use std::* constructs
|
|
// (instead of absl::Mutex & co) to simplify porting to Android and to avoid
|
|
// pulling in absl (which increases our code size).
|
|
mutable std::mutex state_mutex_;
|
|
|
|
// counts_[i] is the count of all ngrams with id i. Work data for Evaluate().
|
|
// NOTE: we declare this vector as a field, such that its underlying capacity
|
|
// stays allocated in between calls to Evaluate().
|
|
mutable std::vector<int> counts_;
|
|
|
|
// Indices of non-zero elements of counts_. See comments for counts_.
|
|
mutable std::vector<int> non_zero_count_indices_;
|
|
|
|
// The integer id of each char ngram is computed as follows:
|
|
// Hash32WithDefaultSeed(char_ngram) % ngram_id_dimension_.
|
|
int ngram_id_dimension_;
|
|
|
|
// Only ngrams of size ngram_size_ will be extracted.
|
|
int ngram_size_;
|
|
};
|
|
|
|
} // namespace lang_id
|
|
} // namespace mobile
|
|
}  // namespace libtextclassifier3
|
|
|
|
#endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_CHAR_NGRAM_FEATURE_H_
|