// Copyright 2013 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Author: dsites@google.com (Dick Sites) // // Test: Do encoding detection on input file // --line treat each line as a separate detection problem #include // for sqrt #include // for exit #include #include #include // for gettimeofday #include #include "cld2tablesummary.h" #include "compact_lang_det_impl.h" #include "debug.h" #include "integral_types.h" #include "lang_script.h" #include "utf8statetable.h" namespace CLD2 { using namespace std; // Scaffolding typedef int32 Encoding; static const Encoding UNKNOWN_ENCODING = 0; #ifndef CLD2_DYNAMIC_MODE // Linker supplies the right tables; see ScoringTables compact_lang_det_impl.cc // These are here JUST for printing versions extern const UTF8PropObj cld_generated_CjkUni_obj; extern const CLD2TableSummary kCjkDeltaBi_obj; extern const CLD2TableSummary kDistinctBiTable_obj; extern const CLD2TableSummary kQuad_obj; extern const CLD2TableSummary kDeltaOcta_obj; extern const CLD2TableSummary kDistinctOcta_obj; extern const CLD2TableSummary kOcta2_obj; extern const short kAvgDeltaOctaScore[]; #endif bool FLAGS_cld_version = false; bool FLAGS_cld_html = true; int32 FLAGS_repeat = 1; bool FLAGS_plain = false; bool FLAGS_dbgscore = true; // Convert GetTimeOfDay output to 64-bit usec static inline uint64 Microseconds(const struct timeval& t) { // Convert to (uint64) microseconds, not (double) seconds. return t.tv_sec * 1000000ULL + t.tv_usec; } #define LF 0x0a #define CR 0x0d bool Readline(FILE* infile, char* buffer) { char* p = fgets(buffer, 64 * 1024, infile); if (p == NULL) { return false; } int len = strlen(buffer); // trim CR LF if (buffer[len-1] == LF) {buffer[--len] = '\0';} if (buffer[len-1] == CR) {buffer[--len] = '\0';} return true; } bool IsComment(char* buffer) { int len = strlen(buffer); if (len == 0) {return true;} if (buffer[0] == '#') {return true;} if (buffer[0] == ' ') {return true;} // Any leading space is comment return false; } void DumpExtLang(int flags, Language summary_lang, Language* language3, int* percent3, double* normalized_score3, int text_bytes, bool is_reliable, int in_size) { char temp[160]; char* tp = temp; int tp_left = sizeof(temp); snprintf(tp, tp_left, "ExtLanguage"); if (language3[0] != UNKNOWN_LANGUAGE) { tp = temp + strlen(temp); tp_left = sizeof(temp) - strlen(temp); snprintf(tp, tp_left, " %s(%d%% %3.0fp)", LanguageName(language3[0]), percent3[0], normalized_score3[0]); } if (language3[1] != UNKNOWN_LANGUAGE) { tp = temp + strlen(temp); tp_left = sizeof(temp) - strlen(temp); snprintf(tp, tp_left, ", %s(%d%% %3.0fp)", LanguageName(language3[1]), percent3[1], normalized_score3[1]); } if (language3[2] != UNKNOWN_LANGUAGE) { tp = temp + strlen(temp); tp_left = sizeof(temp) - strlen(temp); snprintf(tp, tp_left, ", %s(%d%% %3.0fp)", LanguageName(language3[2]), percent3[2], normalized_score3[2]); } if (text_bytes > 9999) { tp = temp + strlen(temp); tp_left = sizeof(temp) - strlen(temp); snprintf(tp, tp_left, ", %d/%d KB of non-tag letters", text_bytes >> 10, in_size >> 10); } else { tp = temp + strlen(temp); tp_left = sizeof(temp) - strlen(temp); snprintf(tp, tp_left, ", %d/%d bytes of non-tag letters", text_bytes, in_size); } tp = temp + strlen(temp); tp_left = sizeof(temp) - strlen(temp); snprintf(tp, tp_left, ", Summary: %s%s", LanguageName(summary_lang), is_reliable ? "" : "*"); printf("%s\n", temp); // Also put into optional HTML output if ((flags & kCLDFlagHtml) != 0) { fprintf(stderr, "%s\n", temp); } } void DumpLanguages(Language summary_lang, Language* language3, int* percent3, int text_bytes, bool is_reliable, int in_size) { // fprintf(stderr, "\n\n"); int total_percent = 0; if (language3[0] != UNKNOWN_LANGUAGE) { fprintf(stderr, "\n
Languages %s(%d%%)", LanguageName(language3[0]), percent3[0]); total_percent += percent3[0]; } else { fprintf(stderr, "\n
Languages "); } if (language3[1] != UNKNOWN_LANGUAGE) { fprintf(stderr, ", %s(%d%%)", LanguageName(language3[1]), percent3[1]); total_percent += percent3[1]; } if (language3[2] != UNKNOWN_LANGUAGE) { fprintf(stderr, ", %s(%d%%)", LanguageName(language3[2]), percent3[2]); total_percent += percent3[2]; } fprintf(stderr, ", other(%d%%)", 100 - total_percent); if (text_bytes > 9999) { fprintf(stderr, ", %d/%d KB of non-tag letters", text_bytes >> 10, in_size >> 10); } else { fprintf(stderr, ", %d/%d bytes of non-tag letters", text_bytes, in_size); } fprintf(stderr, ", Summary: %s%s ", LanguageName(summary_lang), is_reliable ? "" : "*"); fprintf(stderr, "
\n"); } int main(int argc, char** argv) { if (FLAGS_cld_version) { #ifndef CLD2_DYNAMIC_MODE printf("%s %4dKB uni build date, bytes\n", "........", cld_generated_CjkUni_obj.total_size >> 10); printf("%d %4ldKB delta_bi build date, bytes\n", kCjkDeltaBi_obj.kCLDTableBuildDate, (kCjkDeltaBi_obj.kCLDTableSize * sizeof(IndirectProbBucket4)) >> 10); printf("%d %4ldKB quad build date, bytes\n", kQuad_obj.kCLDTableBuildDate, (kQuad_obj.kCLDTableSize * sizeof(IndirectProbBucket4)) >> 10); printf("%d %4ldKB delta_octa build date, bytes\n", kDeltaOcta_obj.kCLDTableBuildDate, (kDeltaOcta_obj.kCLDTableSize * sizeof(IndirectProbBucket4)) >> 10); #else printf("FLAGS_cld_version doesn't work with dynamic data mode\n"); #endif exit(0); } // End FLAGS_cld_version int flags = 0; bool get_vector = false; const char* data_file = NULL; const char* fname = NULL; for (int i = 1; i < argc; ++i) { if (argv[i][0] != '-') {fname = argv[i];} if (strcmp(argv[i], "--scoreasquads") == 0) {flags |= kCLDFlagScoreAsQuads;} if (strcmp(argv[i], "--html") == 0) {flags |= kCLDFlagHtml;} if (strcmp(argv[i], "--cr") == 0) {flags |= kCLDFlagCr;} if (strcmp(argv[i], "--verbose") == 0) {flags |= kCLDFlagVerbose;} if (strcmp(argv[i], "--echo") == 0) {flags |= kCLDFlagEcho;} if (strcmp(argv[i], "--vector") == 0) {get_vector = true;} if (strcmp(argv[i], "--data-file") == 0) { data_file = argv[++i];} } #ifdef CLD2_DYNAMIC_MODE if (data_file == NULL) { fprintf(stderr, "When running in dynamic mode, you must specify --data-file [FILE]\n"); return -1; } fprintf(stdout, "Loading data from: %s\n", data_file); CLD2::loadDataFromFile(data_file); fprintf(stdout, "Data loaded, test commencing\n"); #endif FILE* fin; if (fname == NULL) { fin = stdin; } else { fin = fopen(fname, "rb"); if (fin == NULL) { fprintf(stderr, "%s did not open\n", fname); exit(0); } } const char* tldhint = ""; Encoding enchint = UNKNOWN_ENCODING; Language langhint = UNKNOWN_LANGUAGE; int bytes_consumed; int bytes_filled; int error_char_count; bool is_reliable; int usec; char* buffer = new char[10000000]; // Max 10MB of input for this test program struct timeval news, newe; if ((flags & kCLDFlagHtml) != 0) { // Begin HTML file fprintf(stderr, "\n"); fprintf(stderr, "\n"); fprintf(stderr, "\n"); } if ((flags & kCLDFlagHtml) != 0) { //// fprintf(stderr, "\n"); //// fprintf(stderr, "
\n");
    fprintf(stderr, "file = %s
\n", fname ? fname : "stdin"); } // Full-blown flag-bit and hints interface bool allow_extended_lang = true; Language plus_one = UNKNOWN_LANGUAGE; int n = fread(buffer, 1, 10000000, fin); bool ignore_7bit = false; // Detect language Language summary_lang = UNKNOWN_LANGUAGE; Language language3[3]; int percent3[3]; double normalized_score3[3]; ResultChunkVector resultchunkvector; bool is_plain_text = FLAGS_plain; int text_bytes; CLDHints cldhints = {NULL, tldhint, enchint, langhint}; gettimeofday(&news, NULL); for (int i = 0; i < FLAGS_repeat; ++i) { summary_lang = CLD2::DetectLanguageSummaryV2( buffer, n, is_plain_text, &cldhints, allow_extended_lang, flags, plus_one, language3, percent3, normalized_score3, get_vector ? &resultchunkvector : NULL, &text_bytes, &is_reliable); } gettimeofday(&newe, NULL); if (get_vector) { DumpResultChunkVector(stderr, buffer, &resultchunkvector); } DumpExtLang(flags, summary_lang, language3, percent3, normalized_score3, text_bytes, is_reliable, n); if ((flags & kCLDFlagHtml) != 0) { DumpLanguages(summary_lang, language3, percent3, text_bytes, is_reliable, n); } usec = static_cast(Microseconds(newe) - Microseconds(news)); if (usec == 0) {usec = 1;} printf(" SummaryLanguage %s%s at %u of %d %uus (%d MB/sec), %s\n", LanguageName(summary_lang), is_reliable ? "" : "(un-reliable)", bytes_consumed, n, usec, n / usec, argv[1]); if ((flags & kCLDFlagHtml) != 0) { fprintf(stderr, "\n
"); } fclose(fin); delete[] buffer; return 0; } } // End namespace CLD2 int main(int argc, char *argv[]) { return CLD2::main(argc, argv); }