vendor/tomotopy/src/Labeling/FoRelevance.h in tomoto-0.1.3 vs vendor/tomotopy/src/Labeling/FoRelevance.h in tomoto-0.1.4

- old
+ new

@@ -2,10 +2,11 @@
 #include <set>
 #include "Labeler.h"
 #include "../Utils/EigenAddonOps.hpp"
 #include "../Utils/Trie.hpp"
+#include "../Utils/ThreadPool.hpp"
 
 /*
 Implementation of First-order Relevance for topic labeling by bab2min
 
 * Mei, Q., Shen, X., & Zhai, C. (2007, August). Automatic labeling of multinomial topic models. In Proceedings of the 13th ACM SIGKDD international conference on Knowledge discovery and data mining (pp. 490-499).
 
@@ -14,170 +15,39 @@
 namespace tomoto
 {
 	namespace label
 	{
-		template<typename _DocIter, typename _Freqs>
-		std::vector<Candidate> extractPMINgrams(_DocIter docBegin, _DocIter docEnd,
-			_Freqs&& vocabFreqs, _Freqs&& vocabDf,
-			size_t candMinCnt, size_t candMinDf, size_t minNgrams, size_t maxNgrams, size_t maxCandidates, float minScore)
+		class PMIExtractor : public IExtractor
 		{
+			size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
+			bool normalized;
+		public:
+			PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2,
+				size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000,
+				bool _normalized = false
+			)
+				: candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf },
+				minLabelLen{ _minLabelLen }, maxLabelLen{ _maxLabelLen },
+				maxCandidates{ _maxCandidates }, normalized{ _normalized }
 			{
-			struct vvhash
-			{
-				size_t operator()(const std::pair<Vid, Vid>& k) const
-				{
-					return std::hash<Vid>{}(k.first) ^ std::hash<Vid>{}(k.second);
-				}
-			};
-
-			// counting unigrams & bigrams
-			std::unordered_map<std::pair<Vid, Vid>, size_t, vvhash> bigramCnt, bigramDf;
-
-			for(auto docIt = docBegin; docIt != docEnd; ++docIt)
-			{
-				std::unordered_set<std::pair<Vid, Vid>, vvhash> uniqBigram;
-				auto doc = *docIt;
-				Vid prevWord = doc[0];
-				for (size_t j = 1; j < doc.size(); ++j)
-				{
-					Vid curWord = doc[j];
-					if (curWord != non_vocab_id && vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
-					{
-						if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
-						{
-							bigramCnt[std::make_pair(prevWord, curWord)]++;
-							uniqBigram.emplace(prevWord, curWord);
-						}
-					}
-					prevWord = curWord;
-				}
-
-				for (auto& p : uniqBigram) bigramDf[p]++;
 			}
+
+			std::vector<Candidate> extract(const ITopicModel* tm) const override;
+		};
 
-			// counting ngrams
-			std::vector<TrieEx<Vid, size_t>> trieNodes;
-
-			if (maxNgrams > 2)
-			{
-				std::unordered_set<std::pair<Vid, Vid>, vvhash> validPair;
-				for (auto& p : bigramCnt)
-				{
-					if (p.second >= candMinCnt) validPair.emplace(p.first);
-				}
-
-				trieNodes.resize(1);
-				auto allocNode = [&]() { return trieNodes.emplace_back(), & trieNodes.back(); };
-
-				for (auto docIt = docBegin; docIt != docEnd; ++docIt)
-				{
-					auto doc = *docIt;
-					if (trieNodes.capacity() < trieNodes.size() + doc.size() * maxNgrams)
-					{
-						trieNodes.reserve(std::max(trieNodes.size() + doc.size() * maxNgrams, trieNodes.capacity() * 2));
-					}
-
-					Vid prevWord = doc[0];
-					size_t labelLen = 0;
-					auto node = &trieNodes[0];
-					if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt)
-					{
-						node = trieNodes[0].makeNext(prevWord, allocNode);
-						node->val++;
-						labelLen = 1;
-					}
-
-					for (size_t j = 1; j < doc.size(); ++j)
-					{
-						Vid curWord = doc[j];
-
-						if (curWord != non_vocab_id && vocabFreqs[curWord] < candMinCnt)
-						{
-							node = &trieNodes[0];
-							labelLen = 0;
-						}
-						else
-						{
-							if (labelLen >= maxNgrams)
-							{
-								node = node->getFail();
-								labelLen--;
-							}
-
-							if (validPair.count(std::make_pair(prevWord, curWord)))
-							{
-								auto nnode = node->makeNext(curWord, allocNode);
-								node = nnode;
-								do
-								{
-									nnode->val++;
-								} while (nnode = nnode->getFail());
-								labelLen++;
-							}
-							else
-							{
-								node = trieNodes[0].makeNext(curWord, allocNode);
-								node->val++;
-								labelLen = 1;
-							}
-						}
-						prevWord = curWord;
-					}
-				}
-			}
-
-			float totN = std::accumulate(vocabFreqs.begin(), vocabFreqs.end(), (size_t)0);
-
-			// calculating PMIs
-			std::vector<Candidate> candidates;
-			for (auto& p : bigramCnt)
-			{
-				auto& bigram = p.first;
-				if (p.second < candMinCnt) continue;
-				if (bigramDf[bigram] < candMinDf) continue;
-				auto pmi = std::log(p.second * totN
-					/ vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
-				if (pmi <= 0) continue;
-				candidates.emplace_back(pmi, bigram.first, bigram.second);
-			}
-
-			if (maxNgrams > 2)
-			{
-				std::vector<Vid> rkeys;
-				trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
-				{
-					if (rkeys.size() <= 2 || rkeys.size() < minNgrams || node->val < candMinCnt) return;
-					auto pmi = node->val / totN;
-					for (auto k : rkeys)
-					{
-						pmi *= totN / vocabFreqs[k];
-					}
-					pmi = std::log(pmi);
-					if (pmi < minScore) return;
-					candidates.emplace_back(pmi, rkeys);
-				}, rkeys);
-			}
-
-			std::sort(candidates.begin(), candidates.end(), [](const Candidate& a, const Candidate& b)
-			{
-				return a.score > b.score;
-			});
-			if (candidates.size() > maxCandidates) candidates.erase(candidates.begin() + maxCandidates, candidates.end());
-			return candidates;
-		}
-
-
-		class PMIExtractor : public IExtractor
+		class PMIBEExtractor : public IExtractor
 		{
 			size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
 		public:
-			PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2, size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000)
-				: candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen}, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
+			PMIBEExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2,
+				size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000
+			)
+				: candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen }, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
 			{
 			}
-			
+
 			std::vector<Candidate> extract(const ITopicModel* tm) const override;
 		};
 
 		class FoRelevance : public ILabeler
 		{
@@ -210,10 +80,10 @@
 			void estimateContexts();
 
 		public:
 			template<typename _Iter>
-			FoRelevance(const ITopicModel* _tm, 
+			FoRelevance(const ITopicModel* _tm,
 				_Iter candFirst, _Iter candEnd,
 				size_t _candMinDf = 2, float _smoothing = 0.1f, float _lambda = 0.1f, float _mu = 0.1f,
 				size_t _windowSize = (size_t)-1, size_t numWorkers = 0)
 				: tm{ _tm }, candMinDf{ _candMinDf },
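
Note on the change above: 0.1.4 retires the free function extractPMINgrams(), which mined n-gram candidates from a caller-supplied document range, in favor of the IExtractor implementations PMIExtractor (now carrying a normalized flag) and the new PMIBEExtractor; extract(const ITopicModel* tm) reads the corpus from the model itself. For reference, the removed code scored a bigram (w1, w2) as log(c(w1,w2) * N / (c(w1) * c(w2))) and an n-gram g = w1..wn as log(c(g) * N^(n-1) / (c(w1) * ... * c(wn))), i.e. pointwise mutual information over raw counts c and total token count N. Below is a minimal usage sketch of the 0.1.4 interface; it relies only on signatures visible in this diff, and the names model and buildLabeler as well as the parameter values are illustrative assumptions, not part of the source.

	#include <vector>
	#include "Labeling/FoRelevance.h"

	// Sketch only: assumes `model` points to an already trained topic model
	// implementing tomoto::ITopicModel.
	void buildLabeler(const tomoto::ITopicModel* model)
	{
		using namespace tomoto::label;

		// The knobs that extractPMINgrams() took per call are now constructor
		// arguments; the values below mirror the declared defaults.
		PMIExtractor extractor{
			10,    // candMinCnt: keep n-grams seen at least 10 times
			2,     // candMinDf: ... and occurring in at least 2 documents
			1,     // minLabelLen, in words
			5,     // maxLabelLen, in words
			1000,  // maxCandidates retained after scoring
			false  // normalized: new 0.1.4 flag, presumably a normalized PMI variant
		};
		std::vector<Candidate> cands = extractor.extract(model);

		// The candidates feed FoRelevance, whose constructor is unchanged in
		// this diff apart from trailing whitespace; the remaining parameters
		// (smoothing, lambda, mu, windowSize, numWorkers) keep their defaults.
		FoRelevance labeler{ model, cands.begin(), cands.end() };
		(void)labeler;
	}

Keeping the parameters on the extractor object leaves extract() with a uniform signature across IExtractor implementations, which is presumably what lets PMIExtractor and PMIBEExtractor be swapped without touching calling code.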