vendor/tomotopy/src/TopicModel/MGLDAModel.hpp in tomoto-0.1.2 vs vendor/tomotopy/src/TopicModel/MGLDAModel.hpp in tomoto-0.1.3
- old
+ new
@@ -383,57 +383,16 @@
if (_T == 0 || _T >= 0x80000000) THROW_ERROR_WITH_INFO(std::runtime_error, text::format("wrong T value (T = %zd)", _T));
if (_alphaL <= 0) THROW_ERROR_WITH_INFO(std::runtime_error, text::format("wrong alphaL value (alphaL = %f)", _alphaL));
if (_etaL <= 0) THROW_ERROR_WITH_INFO(std::runtime_error, text::format("wrong etaL value (etaL = %f)", _etaL));
}
-
- template<bool _const = false>
- _DocType _makeDoc(const std::vector<std::string>& words, const std::string& delimiter)
- {
- _DocType doc{ 1.f };
- size_t numSent = 0;
- for (auto& w : words)
- {
- if (w == delimiter)
- {
- ++numSent;
- continue;
- }
-
- Vid id;
- if (_const)
- {
- id = this->dict.toWid(w);
- if (id == (Vid)-1) continue;
- }
- else
- {
- id = this->dict.add(w);
- }
- doc.words.emplace_back(id);
- doc.sents.emplace_back(numSent);
- }
- doc.numBySent.resize(doc.sents.empty() ? 0 : (doc.sents.back() + 1));
- return doc;
- }
-
- size_t addDoc(const std::vector<std::string>& words, const std::string& delimiter) override
- {
- return this->_addDoc(_makeDoc(words, delimiter));
- }
-
- std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::string& delimiter) const override
- {
- return make_unique<_DocType>(as_mutable(this)->template _makeDoc<true>(words, delimiter));
- }
-
template<bool _const, typename _FnTokenizer>
- _DocType _makeRawDoc(const std::string& rawStr, _FnTokenizer&& tokenizer, const std::string& delimiter)
+ _DocType _makeFromRawDoc(const RawDoc& rawDoc, _FnTokenizer&& tokenizer, const std::string& delimiter)
{
- _DocType doc{ 1.f };
+ _DocType doc;
size_t numSent = 0;
- doc.rawStr = rawStr;
+ doc.rawStr = rawDoc.rawStr;
for (auto& p : tokenizer(doc.rawStr))
{
if (std::get<0>(p) == delimiter)
{
++numSent;
@@ -459,60 +418,88 @@
}
doc.numBySent.resize(doc.sents.empty() ? 0 : (doc.sents.back() + 1));
return doc;
}
- size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
- const std::string& delimiter)
+ size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer)
{
- return this->_addDoc(_makeRawDoc<false>(rawStr, tokenizer, delimiter));
+ return this->_addDoc(_makeFromRawDoc<false>(rawDoc, tokenizer, rawDoc.template getMisc<std::string>("delimiter")));
}
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
- const std::string& delimiter) const
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const
{
- return make_unique<_DocType>(as_mutable(this)->template _makeRawDoc<true>(rawStr, tokenizer, delimiter));
+ return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer, rawDoc.template getMisc<std::string>("delimiter")));
}
- _DocType _makeRawDoc(const std::string& rawStr, const std::vector<Vid>& words,
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len, const std::string& delimiter) const
+ template<bool _const = false>
+ _DocType _makeFromRawDoc(const RawDoc& rawDoc)
{
- _DocType doc{ 1.f };
- doc.rawStr = rawStr;
+ _DocType doc;
+ doc.rawStr = rawDoc.rawStr;
+ auto delimiter = rawDoc.template getMisc<std::string>("delimiter");
size_t numSent = 0;
Vid delimiterId = this->dict.toWid(delimiter);
- for (size_t i = 0; i < words.size(); ++i)
+ if (!rawDoc.rawWords.empty())
{
- auto& w = words[i];
- if (w == delimiterId)
+ for (size_t i = 0; i < rawDoc.rawWords.size(); ++i)
{
- ++numSent;
- continue;
+ auto& w = rawDoc.rawWords[i];
+ if (w == delimiter)
+ {
+ ++numSent;
+ continue;
+ }
+
+ Vid id;
+ if (_const)
+ {
+ id = this->dict.toWid(w);
+ if (id == (Vid)-1) continue;
+ }
+ else
+ {
+ id = this->dict.add(w);
+ }
+ doc.words.emplace_back(id);
+ doc.sents.emplace_back(numSent);
+ if (rawDoc.rawWords.size() == rawDoc.origWordPos.size())
+ {
+ doc.origWordPos.emplace_back(rawDoc.origWordPos[i]);
+ doc.origWordLen.emplace_back(rawDoc.origWordLen[i]);
+ }
}
- doc.words.emplace_back(w);
- doc.sents.emplace_back(numSent);
- if (words.size() == pos.size())
+ }
+ else if (!rawDoc.words.empty())
+ {
+ for (size_t i = 0; i < rawDoc.words.size(); ++i)
{
- doc.origWordPos.emplace_back(pos[i]);
- doc.origWordLen.emplace_back(len[i]);
+ auto& w = rawDoc.words[i];
+ if (w == delimiterId)
+ {
+ ++numSent;
+ continue;
+ }
+ doc.words.emplace_back(w);
+ doc.sents.emplace_back(numSent);
+ if (rawDoc.words.size() == rawDoc.origWordPos.size())
+ {
+ doc.origWordPos.emplace_back(rawDoc.origWordPos[i]);
+ doc.origWordLen.emplace_back(rawDoc.origWordLen[i]);
+ }
}
}
doc.numBySent.resize(doc.sents.empty() ? 0 : (doc.sents.back() + 1));
return doc;
}
- size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
- const std::string& delimiter)
+ size_t addDoc(const RawDoc& rawDoc)
{
- return this->_addDoc(_makeRawDoc(rawStr, words, pos, len, delimiter));
+ return this->_addDoc(_makeFromRawDoc(rawDoc));
}
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
- const std::string& delimiter) const
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const
{
- return make_unique<_DocType>(_makeRawDoc(rawStr, words, pos, len, delimiter));
+ return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc));
}
void setWordPrior(const std::string& word, const std::vector<Float>& priors) override
{
if (priors.size() != this->K + KL) THROW_ERROR_WITH_INFO(exception::InvalidArgument, "priors.size() must be equal to K.");