ext/cppjieba/include/cppjieba/Unicode.hpp in cppjieba_rb-0.3.0 vs ext/cppjieba/include/cppjieba/Unicode.hpp in cppjieba_rb-0.3.1
- old
+ new
@@ -16,28 +16,38 @@
// Rune: alias for a 32-bit unsigned integer; it is the type of RuneStr::rune.
typedef uint32_t Rune;
// A piece of text cut out of a source string, plus where it was found.
struct Word {
  string word;              // the text of the word itself
  uint32_t offset;          // position of the word in the source string, counted in chars
  uint32_t unicode_offset;  // position of the word counted in runes instead of chars
  uint32_t unicode_length;  // number of runes the word covers

  // Builds a word for which only the char offset is known.
  // The rune-based fields are set to 0 so they never hold indeterminate
  // values that a later reader could pick up by accident.
  Word(const string& w, uint32_t o)
   : word(w), offset(o), unicode_offset(0), unicode_length(0) {
  }

  // Builds a word with both its char position and its rune position/length.
  Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
   : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
  }
}; // struct Word
// Streams a Word as a JSON-like object with its "word" and "offset" members.
// The word text is written as-is (no escaping is applied).
inline std::ostream& operator << (std::ostream& os, const Word& w) {
  os << "{\"word\": \"" << w.word;
  os << "\", \"offset\": " << w.offset;
  os << "}";
  return os;
}
// One decoded rune together with its position in the source string.
struct RuneStr {
  Rune rune;                // the decoded rune value
  uint32_t offset;          // position of the rune in the source string, counted in chars
  uint32_t len;             // number of chars the rune occupies in the source string
  uint32_t unicode_offset;  // position of the rune counted in runes instead of chars
  uint32_t unicode_length;  // number of runes this entry stands for

  // Empty entry: every field is zero.
  RuneStr(): rune(0), offset(0), len(0), unicode_offset(0), unicode_length(0) {
  }

  // Entry for which only the char position is known. The rune-based fields
  // are zeroed so that code reading them never sees indeterminate values.
  RuneStr(Rune r, uint32_t o, uint32_t l)
   : rune(r), offset(o), len(l), unicode_offset(0), unicode_length(0) {
  }

  // Entry with both char position/length and rune position/length.
  RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
   : rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {
  }
}; // struct RuneStr
// Streams a RuneStr as a JSON-like object with its "rune", "offset" and
// "len" members. The rune is written as its numeric value inside quotes.
inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
  os << "{\"rune\": \"" << r.rune;
  os << "\", \"offset\": " << r.offset;
  os << ", \"len\": " << r.len;
  os << "}";
  return os;
}
@@ -130,19 +140,20 @@
}
inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
runes.clear();
runes.reserve(len / 2);
- for (size_t i = 0; i < len;) {
+ for (uint32_t i = 0, j = 0; i < len;) {
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
if (rp.len == 0) {
runes.clear();
return false;
}
- RuneStr x(rp.rune, i, rp.len);
+ RuneStr x(rp.rune, i, rp.len, j, 1);
runes.push_back(x);
i += rp.len;
+ ++j;
}
return true;
}
inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
@@ -180,10 +191,11 @@
// Builds the Word covered by the closed rune range [left, right] of `s`.
// Both iterators must point at valid entries (right is included, not
// one-past-the-end), and right must not start before left.
inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
  assert(right->offset >= left->offset);
  // Extent in chars: from the start of `left` to the end of `right`.
  const uint32_t char_begin = left->offset;
  const uint32_t char_count = right->offset - char_begin + right->len;
  // Same extent, measured in runes.
  const uint32_t rune_begin = left->unicode_offset;
  const uint32_t rune_count = right->unicode_offset - rune_begin + right->unicode_length;
  return Word(s.substr(char_begin, char_count), char_begin, rune_begin, rune_count);
}
inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
assert(right->offset >= left->offset);
uint32_t len = right->offset - left->offset + right->len;