vendor/tomotopy/src/Labeling/Phraser.hpp in tomoto-0.1.4 vs vendor/tomotopy/src/Labeling/Phraser.hpp in tomoto-0.2.0

- old
+ new

@@ -1,16 +1,39 @@ #pragma once #include <vector> +#include <map> #include <unordered_map> #include "Labeler.h" #include "../Utils/Trie.hpp" +#ifdef TMT_USE_BTREE +#include "btree/map.h" +#else +#endif + namespace tomoto { namespace phraser { +#ifdef TMT_USE_BTREE + template<typename K, typename V> using map = btree::map<K, V>; +#else + template<typename K, typename V> using map = std::map<K, V>; +#endif + + namespace detail + { + struct vvhash + { + size_t operator()(const std::pair<Vid, Vid>& k) const + { + return std::hash<Vid>{}(k.first) ^ std::hash<Vid>{}(k.second); + } + }; + } + template<typename _DocIter> void countUnigrams(std::vector<size_t>& unigramCf, std::vector<size_t>& unigramDf, _DocIter docBegin, _DocIter docEnd ) { @@ -28,21 +51,21 @@ for (auto w : uniqs) unigramDf[w]++; } } - template<typename _DocIter, typename _VvHash, typename _Freqs> - void countBigrams(std::unordered_map<std::pair<Vid, Vid>, size_t, _VvHash>& bigramCf, - std::unordered_map<std::pair<Vid, Vid>, size_t, _VvHash>& bigramDf, + template<typename _DocIter, typename _Freqs> + void countBigrams(map<std::pair<Vid, Vid>, size_t>& bigramCf, + map<std::pair<Vid, Vid>, size_t>& bigramDf, _DocIter docBegin, _DocIter docEnd, _Freqs&& vocabFreqs, _Freqs&& vocabDf, size_t candMinCnt, size_t candMinDf ) { for (auto docIt = docBegin; docIt != docEnd; ++docIt) { - std::unordered_set<std::pair<Vid, Vid>, _VvHash> uniqBigram; + std::unordered_set<std::pair<Vid, Vid>, detail::vvhash> uniqBigram; auto doc = *docIt; if (!doc.size()) continue; Vid prevWord = doc[0]; for (size_t j = 1; j < doc.size(); ++j) { @@ -200,36 +223,25 @@ } } return std::move(data[0]); } - namespace detail - { - struct vvhash - { - size_t operator()(const std::pair<Vid, Vid>& k) const - { - return std::hash<Vid>{}(k.first) ^ std::hash<Vid>{}(k.second); - } - }; - } - template<typename _DocIter, typename _Freqs> std::vector<label::Candidate> extractPMINgrams(_DocIter docBegin, _DocIter docEnd, _Freqs&& vocabFreqs, _Freqs&& vocabDf, size_t candMinCnt, size_t candMinDf, size_t minNgrams, size_t maxNgrams, size_t maxCandidates, float minScore, bool normalized = false, ThreadPool* pool = nullptr) { // counting unigrams & bigrams - std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash> bigramCnt, bigramDf; + map<std::pair<Vid, Vid>, size_t> bigramCnt, bigramDf; if (pool && pool->getNumWorkers() > 1) { using LocalCfDf = std::pair< - std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash>, - std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash> + decltype(bigramCnt), + decltype(bigramDf) >; std::vector<LocalCfDf> localdata(pool->getNumWorkers()); std::vector<std::future<void>> futures; const size_t stride = pool->getNumWorkers() * 8; auto docIt = docBegin; @@ -361,16 +373,16 @@ size_t candMinCnt, size_t candMinDf, size_t minNgrams, size_t maxNgrams, size_t maxCandidates, float minNPMI = 0, float minNBE = 0, ThreadPool* pool = nullptr) { // counting unigrams & bigrams - std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash> bigramCnt, bigramDf; + map<std::pair<Vid, Vid>, size_t> bigramCnt, bigramDf; if (pool && pool->getNumWorkers() > 1) { using LocalCfDf = std::pair< - std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash>, - std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash> + decltype(bigramCnt), + decltype(bigramDf) >; std::vector<LocalCfDf> localdata(pool->getNumWorkers()); std::vector<std::future<void>> futures; const size_t stride = pool->getNumWorkers() * 8; auto docIt = docBegin;