// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_ #define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_ #include "encodings/compact_lang_det/letterscript_enum.h" #include "encodings/compact_lang_det/compact_lang_det_impl.h" namespace getone { static const int kMaxScriptBuffer = 4096; static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2; static const int kMaxScriptBytes = kMaxScriptBuffer- 8; // Leave some room static const int kMaxAnswerBuffer = 256; typedef enum UnicodeLScript ULScript; typedef struct { char* text; // Pointer to the span, somewhere int text_bytes; // Number of bytes of text in the span int offset; // Offset of start of span in original input buffer ULScript script; // Script of all the letters in this span Language lang; // Language identified for this span bool truncated; // true if buffer filled up before a // different script or EOF was found } LangSpan; static inline bool IsContinuationByte(char c) { return static_cast(c) < -64; } // Gets lscript number for letters; always returns // 0 (common script) for non-letters int GetUTF8LetterScriptNum(const char* src); // Update src pointer to point to next quadgram, +2..+5 // Looks at src[0..4] const char* AdvanceQuad(const char* src); } // end namespace getone class ScriptScanner { public: ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text); ~ScriptScanner(); // Copy next run of same-script non-tag letters to buffer [NUL terminated] bool GetOneScriptSpan(getone::LangSpan* span); // Force Latin and Cyrillic scripts to be lowercase void LowerScriptSpan(getone::LangSpan* span); // Copy next run of same-script non-tag letters to buffer [NUL terminated] // Force Latin and Cyrillic scripts to be lowercase bool GetOneScriptSpanLower(getone::LangSpan* span); private: int SkipToFrontOfSpan(const char* src, int len, int* script); const char* start_byte_; const char* next_byte_; const char* next_byte_limit_; int byte_length_; bool is_plain_text_; char* script_buffer_; // Holds text with expanded entities char* script_buffer_lower_; // Holds lowercased text }; class LangScanner { public: LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj, getone::LangSpan* spn, int smoothwidth, int smoothcandidates, int maxlangs, int minlangspan); ~LangScanner(); int script() {return script_;} // Use new text // Keep smoothing state if same script, otherwise reinit smoothing void NewText(getone::LangSpan* spn); bool GetOneShortLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping bool GetOneLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping // The real ones bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj, getone::LangSpan* span); bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj, getone::LangSpan* span); // Increases language bias by delta void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj, Language key, int delta); // For debugging output int next_answer_; char answer_buffer_[getone::kMaxAnswerBuffer]; char answer_buffer2_[getone::kMaxAnswerBuffer]; char answer_buffer3_[getone::kMaxAnswerBuffer]; char answer_buffer4_[getone::kMaxAnswerBuffer]; private: const char* start_byte_; const char* next_byte_limit_; const char* next_byte_; const char* onelangspan_begin_; int byte_length_; int script_; Language spanlang_; int smoothwidth_; int smoothwidth_2_; int smoothcandidates_; int maxlangs_; int minlangspan_; int rb_size_; int next_rb_; int rb_mask_; uint32* rb_; int* offset_rb_; }; #endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_