// Copyright 2013 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Little program to read lines of sample text, calculate score per 1024 bytes // per language-script4 combination // Possible input file /export/hda3/cld/pre2010/b0_samp_prune_20100722.utf8 #include #include #include #include "compact_lang_det_impl.h" #include "lang_script.h" using namespace std; using namespace CLD2; double bytes[NUM_LANGUAGES][4]; double scores[NUM_LANGUAGES][4]; // Return score per 1024 bytes for top language Language ScoreOneLine(const char* buffer, int buffer_length, int* bytes, double* score_per_1kb) { bool is_plain_text = true; const CLDHints* cld_hints = NULL; bool allow_extended_lang = true; int flags = 0; Language plus_one = UNKNOWN_LANGUAGE; Language language3[3]; int percent3[3]; double normalized_score3[3]; ResultChunkVector* resultchunkvector = NULL; int text_bytes; bool is_reliable; Language summary_lang; summary_lang = DetectLanguageSummaryV2( buffer, buffer_length, is_plain_text, cld_hints, allow_extended_lang, flags, plus_one, language3, percent3, normalized_score3, resultchunkvector, &text_bytes, &is_reliable); *bytes = text_bytes; *score_per_1kb = normalized_score3[0]; return language3[0]; } #define LF 0x0a #define CR 0x0d const int kMaxBuffer = 5 * 1024; bool ReadLine(FILE* infile, char* buffer, size_t maxlen) { char* p = fgets(buffer, maxlen, infile); if (p == NULL) { return false; } int len = strlen(buffer); // trim CR LF if (buffer[len-1] == LF) {buffer[--len] = '\0';} if (buffer[len-1] == CR) {buffer[--len] = '\0';} return true; } bool IsComment(const char* buffer) { int len = strlen(buffer); if (len == 0) {return true;} if (buffer[0] == '#') {return true;} if (buffer[0] == ' ') {return true;} // Any leading space is comment return false; } // Skips over xxxxx_ where _ is one or more spaces/tabs // Returns string::npos if no more fields int SkipOneField(const string& src, int pos) { if (pos == string::npos) {return pos;} int lpos = pos; lpos = src.find_first_of(" \t", lpos); if (lpos == string::npos) {return lpos;} lpos = src.find_first_not_of(" \t", lpos); if (lpos == string::npos) {return lpos;} return lpos; } // Return language and script from parsed line or defaults void GetLangScript(const string& src, Language default_lang, ULScript default_lscript, Language* target_lang, ULScript* target_lscript, string* tld) { *target_lang = default_lang; *target_lscript = default_lscript; *tld = ""; int pos = 0; int pos2 = 0; if (src.substr(0,7) == "SAMPLE ") { // SAMPLE ll-Ssss pos = SkipOneField(src, pos); } else if (src.substr(0,5) == "SAMP ") { // SAMP ll-Ssss /tld2.tld/ pos = SkipOneField(src, pos); pos2 = SkipOneField(src, pos); } else if (src.substr(0,5) == "Samp ") { // Samp ll-Ssss /tld2.tld/ pos = SkipOneField(src, pos); pos2 = SkipOneField(src, pos); } if (pos == 0) {return;} if (pos == string::npos) {return;} // Pos is at the first letter of language-script combination int end = src.find_first_of(" \t", pos); // find end of lang-script if (end == string::npos) {return;} *target_lang = GetLanguageFromName(src.substr(pos, end - pos).c_str()); *target_lscript = GetULScriptFromName(src.substr(pos, end - pos).c_str()); // Pos2 is 0 or at the first letter of the tld string if (pos2 == 0) {return;} if (pos2 == string::npos) {return;} end = src.find_first_of(" \t", pos2); if (end == string::npos) {return;} *tld = src.substr(pos2, end - pos2); } // Return position of start of text int GetTextBeginPos(const string& src) { int pos = 0; if (src.size() < 8) {return pos;} if (src.substr(0,7) == "SAMPLE ") { // Skip SAMPLE ll-Ssss pos = SkipOneField(src, pos); pos = SkipOneField(src, pos); } else if (src.substr(0,5) == "SAMP ") { // Skip SAMP ll-Ssss /tld2.tld/ pos = SkipOneField(src, pos); pos = SkipOneField(src, pos); pos = SkipOneField(src, pos); } else if (src.substr(0,5) == "Samp ") { // Skip Samp ll-Ssss /tld2.tld/ pos = SkipOneField(src, pos); pos = SkipOneField(src, pos); pos = SkipOneField(src, pos); } return pos; } // Avoid zdiv inline double Divisor(double x) { return (x > 0.0 ? x : 1.0); } void Flush(Language cur_lang, ULScript ulscript, double total_score_cur_lang, double total_bytes_cur_lang, double total_bad_bytes_cur_lang) { if (cur_lang == UNKNOWN_LANGUAGE) {return;} bytes[cur_lang][LScript4(ulscript)] += total_bytes_cur_lang; scores[cur_lang][LScript4(ulscript)] += total_score_cur_lang; double score = total_score_cur_lang * 1024.0 / Divisor(total_bytes_cur_lang); double percent_bad = 100.0 * total_bad_bytes_cur_lang / Divisor(total_bytes_cur_lang + total_bad_bytes_cur_lang); fprintf(stdout, "%s-%s %7.0f %6.1f, %2.0f%% bad SUMMARY\n\n", LanguageCode(cur_lang), ULScriptCode(ulscript), total_bytes_cur_lang, score, percent_bad); } int BytesPer1KB(int i, int j) { int bytes_per_1kb = ((scores[i][j] * 1024.0) / Divisor(bytes[i][j])) + 0.5; return bytes_per_1kb; } int main(int argc, char *argv[]) { Language cur_lang = UNKNOWN_LANGUAGE; ULScript cur_ulscript = ULScript_Common; double total_score_cur_lang = 0.0; double total_bytes_cur_lang = 0.0; double total_bad_bytes_cur_lang = 0.0; memset(bytes, 0, sizeof(bytes)); memset(scores, 0, sizeof(bytes)); char buffer[kMaxBuffer]; int buffer_length; const char* filename = NULL; FILE* infile = stdin; for (int i = 1; i < argc; ++i) { if (argv[i][0] != '-') { filename = argv[i]; } } if (filename != NULL) { infile = fopen(filename, "r"); if (infile == NULL) { fprintf(stderr, "%s did not open\n", filename); return 0; } } while (ReadLine(infile, buffer, kMaxBuffer)) { if (IsComment(buffer)) {continue;} buffer_length = strlen(buffer); int bytes; double score_per_1kb; Language toplang; Language target_lang; ULScript target_ulscript; string src(buffer, buffer_length); string tld(""); int pos = GetTextBeginPos(src); GetLangScript(src, UNKNOWN_LANGUAGE, ULScript_Common, &target_lang, &target_ulscript, &tld); if ((cur_lang != target_lang) || (cur_ulscript != target_ulscript)) { Flush(cur_lang, cur_ulscript, total_score_cur_lang, total_bytes_cur_lang, total_bad_bytes_cur_lang); cur_lang = target_lang; cur_ulscript = target_ulscript; total_score_cur_lang = 0.0; total_bytes_cur_lang = 0.0; total_bad_bytes_cur_lang = 0.0; } toplang = ScoreOneLine(&src[pos], src.size() - pos, &bytes, &score_per_1kb); fprintf(stdout, "%s%c %d %4.1f %s\n", LanguageCode(toplang), (toplang == target_lang) ? ' ' : '*', bytes, score_per_1kb, buffer); // Only count when detected lang matches the claimed target lang if (toplang == target_lang) { total_bytes_cur_lang += bytes; total_score_cur_lang += (score_per_1kb * bytes) / 1024.0; } else { total_bad_bytes_cur_lang += bytes; } } Flush(cur_lang, cur_ulscript, total_score_cur_lang, total_bytes_cur_lang, total_bad_bytes_cur_lang); for (int i = 0; i < NUM_LANGUAGES; ++i) { Language ilang = static_cast(i); fprintf(stdout, " {%4d, %4d, %4d, %4d}, // %d %s %s\n", BytesPer1KB(i, 0), BytesPer1KB(i, 1), BytesPer1KB(i, 2), BytesPer1KB(i, 3), i, LanguageName(ilang), LanguageCode(ilang)); } if (infile != stdin) { fclose(infile); } }