// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "encodings/compact_lang_det/win/cld_unicodetext.h" #include #include // to compile bar/common/component.h #include "encodings/compact_lang_det/compact_lang_det.h" #include "base/string_util.h" #include "unicode/normlzr.h" #include "unicode/unistr.h" #include "unicode/ustring.h" /* std::string NormalizeText(const UChar* text) { // To avoid a copy, use the read-only aliasing ctor. icu::UnicodeString source(1, text, -1); icu::UnicodeString normalized; UErrorCode status = U_ZERO_ERROR; icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status); if (U_FAILURE(status)) return std::string(); normalized.toLower(); std::string utf8; // Internally, toUTF8String uses a 1kB stack buffer (which is not large enough // for most web pages) and does pre-flighting followed by malloc for larger // strings. We have to switch to obtaining the buffer with the maximum size // (UTF-16 length * 3) without pre-flighting if necessary. return normalized.toUTF8String(utf8); } */ // Detects a language of the UTF-16 encoded zero-terminated text. // Returns: Language enum. Language DetectLanguageOfUnicodeText( const CompactLangDet::DetectionTables* detection_tables, const UChar* text, bool is_plain_text, bool* is_reliable, int* num_languages, int* error_code, int* text_bytes) { if (!text || !num_languages) return NUM_LANGUAGES; // Normalize text to NFC, lowercase and convert to UTF-8. std::string utf8_encoded = NormalizeText(text); if (utf8_encoded.empty()) return NUM_LANGUAGES; // Engage core CLD library language detection. Language language3[3] = { UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE }; int percent3[3] = { 0, 0, 0 }; int text_bytes_tmp = 0; // We ignore return value here due to the problem described in bug 1800161. // For example, translate.google.com was detected as Indonesian. It happened // due to the heuristic in CLD, which ignores English as a top language // in the presence of another reliably detected language. // See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function. // language3 array is always set according to the detection results and // is not affected by this heuristic. CompactLangDet::DetectLanguageSummary(detection_tables, utf8_encoded.c_str(), utf8_encoded.length(), is_plain_text, language3, percent3, &text_bytes_tmp, is_reliable); // Calcualte a number of languages detected in more than 20% of the text. const int kMinTextPercentToCountLanguage = 20; *num_languages = 0; if (text_bytes) *text_bytes = text_bytes_tmp; COMPILE_ASSERT(arraysize(language3) == arraysize(percent3), language3_and_percent3_should_be_of_the_same_size); for (int i = 0; i < arraysize(language3); ++i) { if (IsValidLanguage(language3[i]) && !IS_LANGUAGE_UNKNOWN(language3[i]) && percent3[i] >= kMinTextPercentToCountLanguage) { ++*num_languages; } } return language3[0]; }