vendor/tomotopy/src/Labeling/Phraser.hpp in tomoto-0.1.4 vs vendor/tomotopy/src/Labeling/Phraser.hpp in tomoto-0.2.0
- old
+ new
@@ -1,16 +1,39 @@
#pragma once
#include <vector>
+#include <map>
#include <unordered_map>
#include "Labeler.h"
#include "../Utils/Trie.hpp"
+#ifdef TMT_USE_BTREE
+#include "btree/map.h"
+#else
+#endif
+
namespace tomoto
{
namespace phraser
{
+#ifdef TMT_USE_BTREE // when defined, phraser::map<K, V> aliases btree::map<K, V>
+ template<typename K, typename V> using map = btree::map<K, V>;
+#else // otherwise phraser::map<K, V> aliases std::map<K, V>
+ template<typename K, typename V> using map = std::map<K, V>;
+#endif
+
+ namespace detail
+ {
+     // Hash functor for (Vid, Vid) pair keys, usable as the hasher of
+     // std::unordered_set / std::unordered_map.
+     // The two member hashes are combined in an order-sensitive way: a plain
+     // XOR maps (a, b) and (b, a) to the same value and every (a, a) to 0,
+     // which produces systematic collisions for pair keys.
+     struct vvhash
+     {
+         // Returns a hash of k that depends on both members and on their order.
+         size_t operator()(const std::pair<Vid, Vid>& k) const
+         {
+             size_t seed = std::hash<Vid>{}(k.first);
+             // Shift/add/xor mixing step; the shifted copies of the running
+             // value are what break the symmetry between first and second.
+             seed ^= std::hash<Vid>{}(k.second) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+             return seed;
+         }
+     };
+ }
+
template<typename _DocIter>
void countUnigrams(std::vector<size_t>& unigramCf, std::vector<size_t>& unigramDf,
_DocIter docBegin, _DocIter docEnd
)
{
@@ -28,21 +51,21 @@
for (auto w : uniqs) unigramDf[w]++;
}
}
- template<typename _DocIter, typename _VvHash, typename _Freqs>
- void countBigrams(std::unordered_map<std::pair<Vid, Vid>, size_t, _VvHash>& bigramCf,
- std::unordered_map<std::pair<Vid, Vid>, size_t, _VvHash>& bigramDf,
+ template<typename _DocIter, typename _Freqs>
+ void countBigrams(map<std::pair<Vid, Vid>, size_t>& bigramCf,
+ map<std::pair<Vid, Vid>, size_t>& bigramDf,
_DocIter docBegin, _DocIter docEnd,
_Freqs&& vocabFreqs, _Freqs&& vocabDf,
size_t candMinCnt, size_t candMinDf
)
{
for (auto docIt = docBegin; docIt != docEnd; ++docIt)
{
- std::unordered_set<std::pair<Vid, Vid>, _VvHash> uniqBigram;
+ std::unordered_set<std::pair<Vid, Vid>, detail::vvhash> uniqBigram;
auto doc = *docIt;
if (!doc.size()) continue;
Vid prevWord = doc[0];
for (size_t j = 1; j < doc.size(); ++j)
{
@@ -200,36 +223,25 @@
}
}
return std::move(data[0]);
}
- namespace detail
- {
- struct vvhash
- {
- size_t operator()(const std::pair<Vid, Vid>& k) const
- {
- return std::hash<Vid>{}(k.first) ^ std::hash<Vid>{}(k.second);
- }
- };
- }
-
template<typename _DocIter, typename _Freqs>
std::vector<label::Candidate> extractPMINgrams(_DocIter docBegin, _DocIter docEnd,
_Freqs&& vocabFreqs, _Freqs&& vocabDf,
size_t candMinCnt, size_t candMinDf, size_t minNgrams, size_t maxNgrams, size_t maxCandidates,
float minScore, bool normalized = false,
ThreadPool* pool = nullptr)
{
// counting unigrams & bigrams
- std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash> bigramCnt, bigramDf;
+ map<std::pair<Vid, Vid>, size_t> bigramCnt, bigramDf;
if (pool && pool->getNumWorkers() > 1)
{
using LocalCfDf = std::pair<
- std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash>,
- std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash>
+ decltype(bigramCnt),
+ decltype(bigramDf)
>;
std::vector<LocalCfDf> localdata(pool->getNumWorkers());
std::vector<std::future<void>> futures;
const size_t stride = pool->getNumWorkers() * 8;
auto docIt = docBegin;
@@ -361,16 +373,16 @@
size_t candMinCnt, size_t candMinDf, size_t minNgrams, size_t maxNgrams, size_t maxCandidates,
float minNPMI = 0, float minNBE = 0,
ThreadPool* pool = nullptr)
{
// counting unigrams & bigrams
- std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash> bigramCnt, bigramDf;
+ map<std::pair<Vid, Vid>, size_t> bigramCnt, bigramDf;
if (pool && pool->getNumWorkers() > 1)
{
using LocalCfDf = std::pair<
- std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash>,
- std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash>
+ decltype(bigramCnt),
+ decltype(bigramDf)
>;
std::vector<LocalCfDf> localdata(pool->getNumWorkers());
std::vector<std::future<void>> futures;
const size_t stride = pool->getNumWorkers() * 8;
auto docIt = docBegin;