ext/cppjieba/include/cppjieba/Unicode.hpp in cppjieba_rb-0.3.0 vs ext/cppjieba/include/cppjieba/Unicode.hpp in cppjieba_rb-0.3.1

- old
+ new

@@ -16,28 +16,38 @@ typedef uint32_t Rune; struct Word { string word; uint32_t offset; + uint32_t unicode_offset; + uint32_t unicode_length; Word(const string& w, uint32_t o) : word(w), offset(o) { } + Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length) + : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) { + } }; // struct Word inline std::ostream& operator << (std::ostream& os, const Word& w) { return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}"; } struct RuneStr { Rune rune; uint32_t offset; uint32_t len; + uint32_t unicode_offset; + uint32_t unicode_length; RuneStr(): rune(0), offset(0), len(0) { } RuneStr(Rune r, uint32_t o, uint32_t l) : rune(r), offset(o), len(l) { } + RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length) + : rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) { + } }; // struct RuneStr inline std::ostream& operator << (std::ostream& os, const RuneStr& r) { return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}"; } @@ -130,19 +140,20 @@ } inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) { runes.clear(); runes.reserve(len / 2); - for (size_t i = 0; i < len;) { + for (uint32_t i = 0, j = 0; i < len;) { RuneStrLite rp = DecodeRuneInString(s + i, len - i); if (rp.len == 0) { runes.clear(); return false; } - RuneStr x(rp.rune, i, rp.len); + RuneStr x(rp.rune, i, rp.len, j, 1); runes.push_back(x); i += rp.len; + ++j; } return true; } inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) { @@ -180,10 +191,11 @@ // [left, right] inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { assert(right->offset >= left->offset); uint32_t len = right->offset - left->offset + right->len; - return Word(s.substr(left->offset, len), left->offset); + uint32_t unicode_length = right->unicode_offset - left->unicode_offset + right->unicode_length; + return Word(s.substr(left->offset, len), left->offset, left->unicode_offset, unicode_length); } inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { assert(right->offset >= left->offset); uint32_t len = right->offset - left->offset + right->len;