/** * Copyright (c) 2016-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include "args.h" #include "real.h" namespace fasttext { typedef int32_t id_type; enum class entry_type : int8_t { word = 0, label = 1 }; struct entry { std::string word; int64_t count; entry_type type; std::vector subwords; }; class Dictionary { protected: static const int32_t MAX_VOCAB_SIZE = 30000000; static const int32_t MAX_LINE_SIZE = 1024; int32_t find(const std::string&) const; int32_t find(const std::string&, uint32_t h) const; void initTableDiscard(); void initNgrams(); void reset(std::istream&) const; void pushHash(std::vector&, int32_t) const; void addSubwords(std::vector&, const std::string&, int32_t) const; std::shared_ptr args_; std::vector word2int_; std::vector words_; std::vector pdiscard_; int32_t size_; int32_t nwords_; int32_t nlabels_; int64_t ntokens_; int64_t pruneidx_size_; std::unordered_map pruneidx_; void addWordNgrams( std::vector& line, const std::vector& hashes, int32_t n) const; public: static const std::string EOS; static const std::string BOW; static const std::string EOW; explicit Dictionary(std::shared_ptr); explicit Dictionary(std::shared_ptr, std::istream&); int32_t nwords() const; int32_t nlabels() const; int64_t ntokens() const; int32_t getId(const std::string&) const; int32_t getId(const std::string&, uint32_t h) const; entry_type getType(int32_t) const; entry_type getType(const std::string&) const; bool discard(int32_t, real) const; std::string getWord(int32_t) const; const std::vector& getSubwords(int32_t) const; const std::vector getSubwords(const std::string&) const; void getSubwords( const std::string&, std::vector&, std::vector&) const; void computeSubwords( const std::string&, std::vector&, std::vector* substrings = nullptr) const; uint32_t hash(const std::string& str) const; void add(const std::string&); bool readWord(std::istream&, std::string&) const; void readFromFile(std::istream&); std::string getLabel(int32_t) const; void save(std::ostream&) const; void load(std::istream&); std::vector getCounts(entry_type) const; int32_t getLine(std::istream&, std::vector&, std::vector&) const; int32_t getLine(std::istream&, std::vector&, std::minstd_rand&) const; void threshold(int64_t, int64_t); void prune(std::vector&); bool isPruned() { return pruneidx_size_ >= 0; } void dump(std::ostream&) const; void init(); }; } // namespace fasttext