// Copyright 2013 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Author: dsites@google.com (Dick Sites) // #include "debug.h" #include #include #include "cldutil.h" #include "getonescriptspan.h" #include "lang_script.h" using namespace std; namespace CLD2 { // Debug output string of one unigram string GetUniAt(const char* text) { string retval; retval.clear(); int uni_len = UniLen(text); retval.append(text, uni_len); return retval; } // Debug output string of one bigram string GetBiAt(const char* text) { string retval; retval.clear(); int bi_len = BiLen(text); retval.append(text, bi_len); return retval; } // Debug output string of one quadgram, including underscores string GetQuadAt(const char* text) { string retval; retval.clear(); if (text[-1] == ' ') {retval.append("_");} int quad_len = QuadLen(text); retval.append(text, quad_len); if (text[quad_len] == ' ') {retval.append("_");} return retval; } // Debug output string of one octagram, including underscores string GetOctaAt(const char* text) { string retval; retval.clear(); if (text[-1] == ' ') {retval.append("_");} int octa_len = OctaLen(text); retval.append(text, octa_len); if (text[octa_len] == ' ') {retval.append("_");} return retval; } // Debug output string of two octagrams, including underscores string GetOcta2At(const char* text) { string retval; retval.clear(); if (text[-1] == ' ') {retval.append("_");} int octa_len = OctaLen(text); retval.append(text, octa_len); if (text[octa_len] == ' ') {retval.append("_");} text += (octa_len + 1); int octa2_len = OctaLen(text); retval.append(text, octa2_len); if (text[octa2_len] == ' ') {retval.append("_");} return retval; } // Debug output string of one formatted pslang,qprob pair string FmtLP(ULScript ulscript, uint8 pslang, uint8 qprob) { string retval; retval.clear(); Language lang = FromPerScriptNumber(ulscript, pslang); char temp[16]; sprintf(temp, "%s.%d", LanguageCode(lang), qprob); retval.append(temp); return retval; } // Debug output string of one formatted langprob // Returns "en.24 fr.10 es.4" string GetLangProbTxt(const ScoringContext* scoringcontext, uint32 langprob) { /*const uint16* pslangtolang = scoringcontext->pslangtolang;*/ string retval; retval.clear(); uint8 prob123 = (langprob >> 0) & 0xff; const uint8* prob123_entry = LgProb2TblEntry(prob123); uint8 top1 = (langprob >> 8) & 0xff; if (top1 > 0) { retval.append(FmtLP(scoringcontext->ulscript, top1, LgProb3(prob123_entry, 0))); } uint8 top2 = (langprob >> 16) & 0xff; if (top2 > 0) { if (!retval.empty()) {retval.append("~");} retval.append(FmtLP(scoringcontext->ulscript, top2, LgProb3(prob123_entry, 1))); } uint8 top3 = (langprob >> 24) & 0xff; if (top3 > 0) { if (!retval.empty()) {retval.append("~");} retval.append(FmtLP(scoringcontext->ulscript, top3, LgProb3(prob123_entry, 2))); } return retval; } // Debug output string of one or two formatted quadgram langprobs string GetScoreTxt(const ScoringContext* scoringcontext, const CLD2TableSummary* base_obj, int indirect) { string retval; retval.clear(); if (indirect < static_cast(base_obj->kCLDTableSizeOne)) { // Up to three languages at indirect uint32 langprob = base_obj->kCLDTableInd[indirect]; retval.append(GetLangProbTxt(scoringcontext, langprob)); } else { // Up to six languages at start + 2 * (indirect - start) indirect += (indirect - base_obj->kCLDTableSizeOne); uint32 langprob = base_obj->kCLDTableInd[indirect]; uint32 langprob2 = base_obj->kCLDTableInd[indirect + 1]; retval.append(GetLangProbTxt(scoringcontext, langprob)); if (!retval.empty()) {retval.append("~");} retval.append(GetLangProbTxt(scoringcontext, langprob2)); } return retval; } // 16 background colors, perhaps from the low 4 bits of the language number static const int kLangBackground[16] = { 0xffd8d8, 0xf8ffd8, 0xd8ffe7, 0xd8f3ff, 0xefd8ff, 0xffd8eb, 0xfff7d8, 0xe3ffd8, 0xd8ffff, 0xe3d8ff, 0xffd8f7, 0xffebd8, 0xefffd8, 0xd8fff3, 0xd8e7ff, 0xf8d8ff, }; // 16 text colors, perhaps from the high 4 bits of the language number // 00..7f static const int kLangColor[16] = { 0x000000, 0x7f2f00, 0x7f5f00, 0x6f7f00, // first 16 lang: black text 0x3f7f00, 0x0f7f00, 0x007f1f, 0x007f4f, 0x007f7f, 0x004f7f, 0x001f7f, 0x0f007f, 0x3f007f, 0x6f007f, 0x7f005f, 0x7f002f, }; static const int kUnscoredText = 0xb0b0b0; // medium-light gray static const int kUnscoredBackground = 0xffffff; // white static const int kIgnoremeText = 0x8090a0; // medium-light green-gray static const int kIgnoremeBackground = 0xffeecc; // light orange static const int kEnglishBackground = 0xfffff4; // very light yellow static int GetBackColor(Language lang, bool lighten) { int retval; if (lang == ENGLISH) { retval = kEnglishBackground; } else if (lang == UNKNOWN_LANGUAGE) { retval = kUnscoredBackground; } else if (lang == TG_UNKNOWN_LANGUAGE) { retval = kIgnoremeBackground; } else if (lang < 0) { retval = kUnscoredBackground; } else { retval = kLangBackground[lang & 0x0f]; } if (lighten) { // Make 1/2 as far away from white retval = (retval >> 1) | 0x808080; } return retval; } static int GetTextColor(Language lang, bool lighten) { int retval; if (lang == UNKNOWN_LANGUAGE) { retval = kUnscoredText; } else if (lang == TG_UNKNOWN_LANGUAGE) { retval = kIgnoremeText; } else if (lang < 0) { retval = kUnscoredText; } else { retval = kLangColor[(lang >> 4) & 0x0f]; } if (lighten) { // Make 1/2 as far away from white retval = (retval >> 1) | 0x808080; } return retval; } string GetPlainEscapedText(const string& txt) { string retval; retval.clear(); for (int i = 0; i < static_cast(txt.size()); ++i) { char c = txt[i]; if (c == '\n') { retval.append(" "); } else if (c == '\r') { retval.append(" "); } else { retval.append(1, c); } } return retval; } string GetHtmlEscapedText(const string& txt) { string retval; retval.clear(); for (int i = 0; i < static_cast(txt.size()); ++i) { char c = txt[i]; if (c == '<') { retval.append("<"); } else if (c == '>') { retval.append(">"); } else if (c == '&') { retval.append("&"); } else if (c == '\'') { retval.append("'"); } else if (c == '"') { retval.append("""); } else if (c == '\n') { retval.append(" "); } else if (c == '\r') { retval.append(" "); } else { retval.append(1, c); } } return retval; } string GetColorHtmlEscapedText(Language lang, const string& txt) { char temp[64]; sprintf(temp, " \n", GetBackColor(lang, false), GetTextColor(lang, false)); string esc_txt = string(temp); esc_txt.append(GetHtmlEscapedText(txt)); esc_txt.append(""); return esc_txt; } string GetLangColorHtmlEscapedText(Language lang, const string& txt) { char temp[64]; sprintf(temp, "[%s]", LanguageCode(lang)); string esc_txt = string(temp); esc_txt.append(GetColorHtmlEscapedText(lang, txt)); return esc_txt; } // For showing one chunk // Print debug output for one scored chunk // Optionally print out per-chunk scoring information // In degenerate cases, hitbuffer and cspan can be NULL void CLD2_Debug(const char* text, int lo_offset, int hi_offset, bool more_to_come, bool score_cjk, const ScoringHitBuffer* hitbuffer, const ScoringContext* scoringcontext, const ChunkSpan* cspan, const ChunkSummary* chunksummary) { FILE* df = scoringcontext->debug_file; if (df == NULL) {return;} if (scoringcontext->flags_cld2_verbose && (hitbuffer != NULL) && (cspan != NULL) && (hitbuffer->next_linear > 0)) { int base_limit = cspan->chunk_base + cspan->base_len; for (int i = cspan->chunk_base; i < base_limit; ++i) { int ngram_start = hitbuffer->linear[i].offset; uint32 langprob = hitbuffer->linear[i].langprob; string ngram_text; switch (hitbuffer->linear[i].type) { case UNIHIT: ngram_text = GetUniAt(&text[ngram_start]); break; case QUADHIT: ngram_text = GetQuadAt(&text[ngram_start]); break; case DELTAHIT: case DISTINCTHIT: if (score_cjk) { ngram_text = GetBiAt(&text[ngram_start]); } else { // TODO: figure out how to display optional two words ngram_text = GetOctaAt(&text[ngram_start]); } break; } string score_text = GetLangProbTxt(scoringcontext, langprob); fprintf(df, "%c:%s=%s   ", "UQLD"[hitbuffer->linear[i].type], ngram_text.c_str(), score_text.c_str()); } fprintf(df, "
\n"); // Score boosts for langprior and distinct tokens // Get boosts for current script const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn; const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn; const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn; if (scoringcontext->ulscript != ULScript_Latin) { langprior_boost = &scoringcontext->langprior_boost.othr; langprior_whack = &scoringcontext->langprior_whack.othr; distinct_boost = &scoringcontext->distinct_boost.othr; } fprintf(df, "LangPrior_boost: "); for (int k = 0; k < kMaxBoosts; ++k) { uint32 langprob = langprior_boost->langprob[k]; if (langprob > 0) { fprintf(df, "%s   ", GetLangProbTxt(scoringcontext, langprob).c_str()); } } fprintf(df, "LangPrior_whack: "); for (int k = 0; k < kMaxBoosts; ++k) { uint32 langprob = langprior_whack->langprob[k]; if (langprob > 0) { fprintf(df, "%s   ", GetLangProbTxt(scoringcontext, langprob).c_str()); } } fprintf(df, "Distinct_boost: "); for (int k = 0; k < kMaxBoosts; ++k) { uint32 langprob = distinct_boost->langprob[k]; if (langprob > 0) { fprintf(df, "%s   ", GetLangProbTxt(scoringcontext, langprob).c_str()); } } fprintf(df, "
\n"); // Print chunksummary fprintf(df, "%s.%d %s.%d %dB %d# %s %dRd %dRs
\n", LanguageCode(static_cast(chunksummary->lang1)), chunksummary->score1, LanguageCode(static_cast(chunksummary->lang2)), chunksummary->score2, chunksummary->bytes, chunksummary->grams, ULScriptCode(static_cast(chunksummary->ulscript)), chunksummary->reliability_delta, chunksummary->reliability_score); } // End flags_cld2_verbose linear // Print annotated colored text of this chunk bool is_reliable = true; bool match_prior = false; int reliable = CLD2::minint(chunksummary->reliability_delta, chunksummary->reliability_score); is_reliable = (reliable >= 75); match_prior = (chunksummary->lang1 == scoringcontext->prior_chunk_lang); if (!is_reliable) {match_prior = false;} if (match_prior) { fprintf(df, "[]"); } else if (is_reliable) { fprintf(df, "[%s]", LanguageCode(static_cast(chunksummary->lang1))); } else { fprintf(df, "[%s*.%d/%s.%d]", LanguageCode(static_cast(chunksummary->lang1)), chunksummary->score1, LanguageCode(static_cast(chunksummary->lang2)), chunksummary->score2); } int chunktext_len = hi_offset - lo_offset; if (chunktext_len < 0) { chunktext_len = 0; fprintf(df, " LEN_ERR hi %d lo %d
\n", hi_offset, lo_offset); } string chunk_text(&text[lo_offset], chunktext_len); Language lang = static_cast(chunksummary->lang1); fprintf(df, " \n", GetBackColor(lang, false), GetTextColor(lang, false)); fprintf(df, "%s", chunk_text.c_str()); if (scoringcontext->flags_cld2_cr) { fprintf(df, "
\n"); } else { fprintf(df, " \n"); } } // For showing all chunks void CLD2_Debug2(const char* text, bool more_to_come, bool score_cjk, const ScoringHitBuffer* hitbuffer, const ScoringContext* scoringcontext, const SummaryBuffer* summarybuffer) { FILE* df = scoringcontext->debug_file; if (df == NULL) {return;} uint16 prior_chunk_lang = static_cast(UNKNOWN_LANGUAGE); for (int i = 0; i < summarybuffer->n; ++i) { fprintf(df, "Debug2[%d] ", i); const ChunkSummary* chunksummary = &summarybuffer->chunksummary[i]; // Print annotated colored text of this chunk bool is_reliable = true; bool match_prior = false; int reliable = CLD2::minint(chunksummary->reliability_delta, chunksummary->reliability_score); is_reliable = (reliable >= 75); match_prior = (chunksummary->lang1 == prior_chunk_lang); if (!is_reliable) {match_prior = false;} if (match_prior) { fprintf(df, "[]"); } else if (is_reliable) { fprintf(df, "[%s]", LanguageCode(static_cast(chunksummary->lang1))); } else { fprintf(df, "[%s*.%d/%s.%d]", LanguageCode(static_cast(chunksummary->lang1)), chunksummary->score1, LanguageCode(static_cast(chunksummary->lang2)), chunksummary->score2); } int lo_offset = chunksummary->offset; int chunktext_len = chunksummary->bytes; string chunk_text(&text[lo_offset], chunktext_len); Language lang = static_cast(chunksummary->lang1); fprintf(df, " \n", GetBackColor(lang, false), GetTextColor(lang, false)); fprintf(df, "%s", chunk_text.c_str()); if (scoringcontext->flags_cld2_cr) { fprintf(df, "
\n"); } else { fprintf(df, " \n"); } prior_chunk_lang = chunksummary->lang1; } } void DumpResultChunkVector(FILE* f, const char* src, ResultChunkVector* resultchunkvector) { fprintf(f, "DumpResultChunkVector[%ld]
\n", resultchunkvector->size()); for (int i = 0; i < static_cast(resultchunkvector->size()); ++i) { ResultChunk* rc = &(*resultchunkvector)[i]; Language lang1 = static_cast(rc->lang1); string this_chunk = string(src, rc->offset, rc->bytes); fprintf(f, "[%d]{%d %d %s} ", i, rc->offset, rc->bytes, LanguageCode(lang1)); fprintf(f, "%s
\n", GetColorHtmlEscapedText(lang1, this_chunk).c_str()); } fprintf(f, "
\n"); } } // End namespace CLD2