--- vendor/tomotopy/src/Labeling/FoRelevance.h (tomoto-0.1.2)
+++ vendor/tomotopy/src/Labeling/FoRelevance.h (tomoto-0.1.3)
@@ -14,16 +14,167 @@
namespace tomoto
{
namespace label
{
+ template<typename _DocIter, typename _Freqs>
+ std::vector<Candidate> extractPMINgrams(_DocIter docBegin, _DocIter docEnd,
+ _Freqs&& vocabFreqs, _Freqs&& vocabDf,
+ size_t candMinCnt, size_t candMinDf, size_t minNgrams, size_t maxNgrams, size_t maxCandidates, float minScore)
+ {
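+ // Scores word n-grams as label candidates by pointwise mutual information (PMI):
+ // one pass counts bigrams, an optional trie pass counts n-grams up to maxNgrams,
+ // and candidates surviving the candMinCnt / candMinDf thresholds are ranked by score.
+ // Note: doc[0] is read unconditionally, so documents are assumed to be non-empty.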
+ struct vvhash
+ {
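+ // XOR of the two element hashes is symmetric ((a,b) and (b,a) collide),
+ // which only affects bucket distribution, not correctness.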
+ size_t operator()(const std::pair<Vid, Vid>& k) const
+ {
+ return std::hash<Vid>{}(k.first) ^ std::hash<Vid>{}(k.second);
+ }
+ };
+
+ // count bigram frequencies and bigram document frequencies
+ // (unigram statistics are supplied by the caller via vocabFreqs / vocabDf)
+ std::unordered_map<std::pair<Vid, Vid>, size_t, vvhash> bigramCnt, bigramDf;
+
+ for(auto docIt = docBegin; docIt != docEnd; ++docIt)
+ {
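+ // record each bigram at most once per document, so bigramDf below
+ // counts document frequency rather than raw occurrences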
+ std::unordered_set<std::pair<Vid, Vid>, vvhash> uniqBigram;
+ auto doc = *docIt;
+ Vid prevWord = doc[0];
+ for (size_t j = 1; j < doc.size(); ++j)
+ {
+ Vid curWord = doc[j];
+ if (curWord != non_vocab_id && vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
+ {
+ if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
+ {
+ bigramCnt[std::make_pair(prevWord, curWord)]++;
+ uniqBigram.emplace(prevWord, curWord);
+ }
+ }
+ prevWord = curWord;
+ }
+
+ for (auto& p : uniqBigram) bigramDf[p]++;
+ }
+
+
+ // count longer n-grams (up to maxNgrams) with a trie whose fail links
+ // point at the longest proper suffix, Aho-Corasick style
+ std::vector<TrieEx<Vid, size_t>> trieNodes;
+
+ if (maxNgrams > 2)
+ {
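+ // only bigrams that already meet the count threshold may extend into longer n-grams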
+ std::unordered_set<std::pair<Vid, Vid>, vvhash> validPair;
+ for (auto& p : bigramCnt)
+ {
+ if (p.second >= candMinCnt) validPair.emplace(p.first);
+ }
+
+ trieNodes.resize(1);
+ auto allocNode = [&]() { return trieNodes.emplace_back(), &trieNodes.back(); };
+
+ for (auto docIt = docBegin; docIt != docEnd; ++docIt)
+ {
+ auto doc = *docIt;
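+ // reserve worst-case capacity up front: raw node pointers are held across
+ // allocNode calls below, so emplace_back must never trigger a reallocation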
+ if (trieNodes.capacity() < trieNodes.size() + doc.size() * maxNgrams)
+ {
+ trieNodes.reserve(std::max(trieNodes.size() + doc.size() * maxNgrams, trieNodes.capacity() * 2));
+ }
+
+ Vid prevWord = doc[0];
+ size_t labelLen = 0;
+ auto node = &trieNodes[0];
+ if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt)
+ {
+ node = trieNodes[0].makeNext(prevWord, allocNode);
+ node->val++;
+ labelLen = 1;
+ }
+
+ for (size_t j = 1; j < doc.size(); ++j)
+ {
+ Vid curWord = doc[j];
+
+ if (curWord != non_vocab_id && vocabFreqs[curWord] < candMinCnt)
+ {
+ node = &trieNodes[0];
+ labelLen = 0;
+ }
+ else
+ {
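+ // already at maxNgrams: follow the fail link (longest proper suffix)
+ // to slide the n-gram window forward by one word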
+ if (labelLen >= maxNgrams)
+ {
+ node = node->getFail();
+ labelLen--;
+ }
+
+ if (validPair.count(std::make_pair(prevWord, curWord)))
+ {
+ auto nnode = node->makeNext(curWord, allocNode);
+ node = nnode;
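+ // count this n-gram and every suffix of it by walking the fail chain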
+ do
+ {
+ nnode->val++;
+ } while ((nnode = nnode->getFail()) != nullptr);
+ labelLen++;
+ }
+ else
+ {
+ node = trieNodes[0].makeNext(curWord, allocNode);
+ node->val++;
+ labelLen = 1;
+ }
+ }
+ prevWord = curWord;
+ }
+ }
+ }
+
+ float totN = std::accumulate(vocabFreqs.begin(), vocabFreqs.end(), (size_t)0);
+
+ // calculating PMIs
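+ // PMI(x, y) = log( c(x,y) * N / (c(x) * c(y)) ), where N = totN is the corpus token count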
+ std::vector<Candidate> candidates;
+ for (auto& p : bigramCnt)
+ {
+ auto& bigram = p.first;
+ if (p.second < candMinCnt) continue;
+ if (bigramDf[bigram] < candMinDf) continue;
+ auto pmi = std::log(p.second * totN
+ / vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
+ if (pmi <= 0) continue;
+ candidates.emplace_back(pmi, bigram.first, bigram.second);
+ }
+
+ if (maxNgrams > 2)
+ {
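+ // generalized PMI for w1..wn: log( c(w1..wn) * N^(n-1) / (c(w1) * ... * c(wn)) );
+ // nodes of depth <= 2 are skipped: bigrams were scored above and single words are not candidates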
+ std::vector<Vid> rkeys;
+ trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
+ {
+ if (rkeys.size() <= 2 || rkeys.size() < minNgrams || node->val < candMinCnt) return;
+ auto pmi = node->val / totN;
+ for (auto k : rkeys)
+ {
+ pmi *= totN / vocabFreqs[k];
+ }
+ pmi = std::log(pmi);
+ if (pmi < minScore) return;
+ candidates.emplace_back(pmi, rkeys);
+ }, rkeys);
+ }
+
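+ // sort by PMI descending and keep only the top maxCandidates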
+ std::sort(candidates.begin(), candidates.end(), [](const Candidate& a, const Candidate& b)
+ {
+ return a.score > b.score;
+ });
+ if (candidates.size() > maxCandidates) candidates.erase(candidates.begin() + maxCandidates, candidates.end());
+ return candidates;
+ }
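+
+ // Illustrative usage sketch (not part of the library; `docs` and the argument
+ // values below are hypothetical):
+ //   std::vector<std::vector<Vid>> docs = /* tokenized corpus */;
+ //   auto cands = extractPMINgrams(docs.begin(), docs.end(), vocabFreqs, vocabDf,
+ //       /*candMinCnt*/ 10, /*candMinDf*/ 2, /*minNgrams*/ 1, /*maxNgrams*/ 5,
+ //       /*maxCandidates*/ 1000, /*minScore*/ 0.0f);
+ //   // cands comes back sorted, highest PMI first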
+
+
class PMIExtractor : public IExtractor
{
- size_t candMinCnt, candMinDf, maxLabelLen, maxCandidates;
+ size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
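+ // minLabelLen is new in 0.1.3: the shortest candidate length to keep
+ // (presumably forwarded as minNgrams by PMIExtractor::extract, defined out of line)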
public:
- PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000)
- : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
+ PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2, size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000)
+ : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen}, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
{
}
std::vector<Candidate> extract(const ITopicModel* tm) const override;
};
@@ -31,11 +182,11 @@
class FoRelevance : public ILabeler
{
struct CandidateEx : public Candidate
{
std::unordered_map<std::string, size_t> names;
- std::vector<size_t> docIds;
+ std::set<size_t> docIds;
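+ // std::set (previously std::vector) keeps docIds deduplicated and ordered;
+ // presumably repeated insertion of the same document id is now harmless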
Eigen::Array<Float, -1, 1> scores;
CandidateEx()
{
}
@@ -47,10 +198,11 @@
};
const ITopicModel* tm;
size_t candMinDf;
float smoothing, lambda, mu;
+ size_t windowSize;
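+ // co-occurrence window width used when scoring; the (size_t)-1 default below
+ // appears to mean "use the whole document" (assumption; the logic lives out of line)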
std::unique_ptr<ThreadPool> pool;
std::unique_ptr<std::mutex[]> mtx;
std::vector<CandidateEx> candidates;
template<bool _lock>
@@ -61,12 +213,13 @@
public:
template<typename _Iter>
FoRelevance(const ITopicModel* _tm,
_Iter candFirst, _Iter candEnd,
size_t _candMinDf = 2, float _smoothing = 0.1f, float _lambda = 0.1f, float _mu = 0.1f,
+ size_t _windowSize = (size_t)-1,
size_t numWorkers = 0)
: tm{ _tm }, candMinDf{ _candMinDf },
- smoothing{ _smoothing }, lambda{ _lambda }, mu{ _mu }
+ smoothing{ _smoothing }, lambda{ _lambda }, mu{ _mu }, windowSize{ _windowSize }
{
if (!numWorkers) numWorkers = std::thread::hardware_concurrency();
if (numWorkers > 1)
{
pool = make_unique<ThreadPool>(numWorkers);