// Copyright 2014 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// NOTE(review): the header name after the first "#include" below is missing in
// this copy of the file (presumably one or more angle-bracket system includes
// were lost). The code in this file uses FILE/fwrite/ftell, strlen and
// cout/endl, so confirm the required standard headers against the original.
#include #include "cld2_dynamic_data.h"
#include "cld2_dynamic_data_extractor.h"
#include "cld2_dynamic_data_loader.h" // for verifying the written data
#include "integral_types.h"
#include "cld2tablesummary.h"
#include "utf8statetable.h"

using namespace std;

namespace CLD2DynamicDataExtractor {
// When non-zero, the functions in this file print progress to cout.
static int DEBUG=0;

// Sets the file-level DEBUG flag; any non-zero value turns diagnostics on.
void setDebug(int debug) {
  DEBUG=debug;
}

// Writes zero bytes to `f` until its position (as reported by ftell) reaches
// `position`. Nothing is written when the stream is already at or past it.
//
// NOTE(review): the value returned is `pad` AFTER the post-decrementing loop,
// i.e. -1 whenever padding was written (or exactly none was needed), not the
// number of bytes added. writeChunk ignores the result; confirm no other
// caller relies on it. The ftell() result is also not checked for -1.
int advance(FILE* f, CLD2::uint32 position) {
  const char ZERO = 0;
  int pad = position - ftell(f);
  if (DEBUG) cout << " Adding " << pad << " bytes of padding" << endl;
  while (pad-- > 0) {
    fwrite(&ZERO,1,1,f);
  }
  return pad;
}

// Zero-pads `f` up to offset `startAt` (see advance) and then writes `length`
// bytes from `data`. The fwrite result is not checked.
void writeChunk(FILE *f, const void* data, CLD2::uint32 startAt, CLD2::uint32 length) {
  if (DEBUG) cout << "Write chunk @" << startAt << ", len=" << length << endl;
  advance(f, startAt);
  if (DEBUG) cout << " Writing " << length << " bytes of data" << endl;
  fwrite(data, 1, length, f);
}

// Serializes the scoring tables in `data` to the file named `fileName`.
// Visible steps: collect the seven table summaries, fill in the file and table
// headers (initUtf8Headers, initDeltaHeaders, initTableHeaders), compute
// aligned section offsets (alignAll), then write each data section at its
// computed offset with writeChunk.
void writeDataFile(const CLD2::ScoringTables* data, const CLD2DynamicData::Supplement* supplement, const char* fileName) {
  // The order here is hardcoded and MUST NOT BE CHANGED, else you will de-sync
  // with the reading code.
  const char ZERO = 0;
  const int NUM_TABLES = 7;
  const CLD2::CLD2TableSummary* tableSummaries[NUM_TABLES];
  tableSummaries[0] = data->unigram_compat_obj;
  tableSummaries[1] = data->deltabi_obj;
  tableSummaries[2] = data->distinctbi_obj;
  tableSummaries[3] = data->quadgram_obj;
  tableSummaries[4] = data->quadgram_obj2;
  tableSummaries[5] = data->deltaocta_obj;
  tableSummaries[6] = data->distinctocta_obj;

  CLD2DynamicData::TableHeader tableHeaders[NUM_TABLES];
  CLD2DynamicData::FileHeader fileHeader;
  fileHeader.numTablesEncoded = NUM_TABLES;
  fileHeader.tableHeaders = tableHeaders;
  initUtf8Headers(&fileHeader, data->unigram_obj);
  initDeltaHeaders(&fileHeader, supplement->lengthOf_kAvgDeltaOctaScore);
  initTableHeaders(tableSummaries, NUM_TABLES, supplement, tableHeaders);
  alignAll(&fileHeader, 16); // Align all sections to 128-bit boundaries

  // We are ready to rock.
  // NOTE(review): the source text is truncated in this copy between
  // "for (int x=0; x" and "unigram_obj->state_table". Whatever stood there
  // (presumably the loop itself, the opening of `outFile`, the header
  // write-out and the start of the first writeChunk call) is not visible.
  // The fragment is kept verbatim; restore it from the original file.
  for (int x=0; xunigram_obj->state_table,
    fileHeader.startOf_utf8PropObj_state_table,
    fileHeader.lengthOf_utf8PropObj_state_table);
  // FIXME: Unsafe to rely on this since RemapEntry is not a bit-packed structure
  writeChunk(outFile,
    data->unigram_obj->remap_base,
    fileHeader.startOf_utf8PropObj_remap_base,
    fileHeader.lengthOf_utf8PropObj_remap_base);
  // The string lengths in the header include the terminator, so the payload
  // is written one byte short and the terminating zero is appended by hand.
  writeChunk(outFile,
    data->unigram_obj->remap_string,
    fileHeader.startOf_utf8PropObj_remap_string,
    fileHeader.lengthOf_utf8PropObj_remap_string - 1);
  fwrite(&ZERO,1,1,outFile); // null terminator
  // alignAll stores a start offset of 0 when there is no fast_state data.
  if (fileHeader.startOf_utf8PropObj_fast_state > 0) {
    writeChunk(outFile,
      data->unigram_obj->fast_state,
      fileHeader.startOf_utf8PropObj_fast_state,
      fileHeader.lengthOf_utf8PropObj_fast_state - 1);
    fwrite(&ZERO,1,1,outFile); // null terminator
  }

  // 2. kAvgDeltaOctaScore array
  writeChunk(outFile,
    data->kExpectedScore,
    fileHeader.startOf_kAvgDeltaOctaScore,
    fileHeader.lengthOf_kAvgDeltaOctaScore);

  // 3. Each table
  // NOTE(review): truncated in this copy between "for (int x=0; x" and
  // "kCLDTable"; the loop condition and the declarations of `summary` and
  // `tHeader` are not visible. Kept verbatim; restore from the original file.
  for (int x=0; xkCLDTable,
      tHeader.startOf_kCLDTable,
      tHeader.lengthOf_kCLDTable);
    writeChunk(outFile,
      summary->kCLDTableInd,
      tHeader.startOf_kCLDTableInd,
      tHeader.lengthOf_kCLDTableInd);
    writeChunk(outFile,
      summary->kRecognizedLangScripts,
      tHeader.startOf_kRecognizedLangScripts,
      tHeader.lengthOf_kRecognizedLangScripts - 1);
    fwrite(&ZERO,1,1,outFile); // null terminator
  }
  fclose(outFile);
}

// Fills one TableHeader per table summary: copies the table's scalar
// properties and records the byte length of each of its three data sections.
// Start offsets are not assigned here.
void initTableHeaders(const CLD2::CLD2TableSummary** summaries,
  const int numSummaries,
  const CLD2DynamicData::Supplement* supplement,
  CLD2DynamicData::TableHeader* tableHeaders) {
  // Important: As documented in the .h, we assume that the Supplement data
  // structure contains exactly one entry in indirectTableSizes for each
  // CLD2TableSummary, in the same order.
  // NOTE(review): truncated in this copy between "for (int x=0; x" and
  // "kCLDTableSizeOne"; the loop condition and the declarations of `summary`
  // and `tableHeader` are not visible. Kept verbatim; restore from the
  // original file.
  for (int x=0; xkCLDTableSizeOne;
    tableHeader.kCLDTableSize = summary->kCLDTableSize;
    tableHeader.kCLDTableKeyMask = summary->kCLDTableKeyMask;
    tableHeader.kCLDTableBuildDate = summary->kCLDTableBuildDate;

    // Calculate size information
    CLD2::uint32 bytesPerBucket = sizeof(CLD2::IndirectProbBucket4);
    CLD2::uint32 numBuckets = summary->kCLDTableSize;
    CLD2::uint32 tableSizeBytes = bytesPerBucket * numBuckets;
    CLD2::uint32 indirectTableSizeBytes = supplement->indirectTableSizes[x] * sizeof(CLD2::uint32);
    CLD2::uint32 recognizedScriptsSizeBytes = strlen(summary->kRecognizedLangScripts) + 1; // note null terminator

    // Place size information into header. We'll align on byte boundaries later.
    tableHeader.lengthOf_kCLDTable = tableSizeBytes;
    tableHeader.lengthOf_kCLDTableInd = indirectTableSizeBytes;
    tableHeader.lengthOf_kRecognizedLangScripts = recognizedScriptsSizeBytes; // null terminator counted above
  }
}

// Assuming that all length fields have been set in the specified header,
// assign the starting position of every data chunk so that each chunk begins
// on a multiple of `alignment` bytes (writeDataFile passes 16, i.e. 128-bit
// boundaries).
void alignAll(CLD2DynamicData::FileHeader* header, const int alignment) { CLD2::uint32 totalPadding = 0; if (DEBUG) { std::cout << "Align for " << (alignment*8) << " bits." << std::endl; } CLD2::uint32 headerSize = CLD2DynamicData::calculateHeaderSize( header->numTablesEncoded); CLD2::uint32 offset = headerSize; { // scoping block int stateTablePad = alignment - (offset % alignment); if (stateTablePad == alignment) stateTablePad = 0; totalPadding += stateTablePad; if (DEBUG) { std::cout << "Alignment for stateTable adjusted by " << stateTablePad << std::endl; } offset += stateTablePad; header->startOf_utf8PropObj_state_table = offset; offset += header->lengthOf_utf8PropObj_state_table; } { // scoping block int remapPad = alignment - (offset % alignment); if (remapPad == alignment) remapPad = 0; totalPadding += remapPad; if (DEBUG) { std::cout << "Alignment for remap adjusted by " << remapPad << std::endl; } offset += remapPad; header->startOf_utf8PropObj_remap_base = offset; offset += header->lengthOf_utf8PropObj_remap_base; } { // scoping block int remapStringPad = alignment - (offset % alignment); if (remapStringPad == alignment) remapStringPad = 0; totalPadding += remapStringPad; if (DEBUG) { std::cout << "Alignment for remapString adjusted by " << remapStringPad << std::endl; } offset += remapStringPad; header->startOf_utf8PropObj_remap_string = offset; offset += header->lengthOf_utf8PropObj_remap_string; // null terminator already counted in initUtf8Headers } { // scoping block int fastStatePad = alignment - (offset % alignment); if (fastStatePad == alignment) fastStatePad = 0; totalPadding += fastStatePad; if (DEBUG) { std::cout << "Alignment for fastState adjusted by " << fastStatePad << std::endl; } offset += fastStatePad; if (header->lengthOf_utf8PropObj_fast_state > 0) { header->startOf_utf8PropObj_fast_state = offset; offset += header->lengthOf_utf8PropObj_fast_state; // null terminator already counted in initUtf8Headers } else { 
header->startOf_utf8PropObj_fast_state = 0; } } { // scoping block int deltaOctaPad = alignment - (offset % alignment); if (deltaOctaPad == alignment) deltaOctaPad = 0; totalPadding += deltaOctaPad; if (DEBUG) { std::cout << "Alignment for deltaOctaScore adjusted by " << deltaOctaPad << std::endl; } offset += deltaOctaPad; header->startOf_kAvgDeltaOctaScore = offset; offset += header->lengthOf_kAvgDeltaOctaScore; } for (int x=0; xnumTablesEncoded; x++) { CLD2DynamicData::TableHeader& tableHeader = header->tableHeaders[x]; int tablePad = alignment - (offset % alignment); if (tablePad == alignment) tablePad = 0; totalPadding += tablePad; if (DEBUG) { std::cout << "Alignment for table " << x << " adjusted by " << tablePad << std::endl; } offset += tablePad; tableHeader.startOf_kCLDTable = offset; offset += tableHeader.lengthOf_kCLDTable; int indirectPad = alignment - (offset % alignment); if (indirectPad == alignment) indirectPad = 0; totalPadding += indirectPad; if (DEBUG) { std::cout << "Alignment for tableInd " << x << " adjusted by " << indirectPad << std::endl; } offset += indirectPad; tableHeader.startOf_kCLDTableInd = offset; offset += tableHeader.lengthOf_kCLDTableInd; int scriptsPad = alignment - (offset % alignment); if (scriptsPad == alignment) scriptsPad = 0; totalPadding += scriptsPad; if (DEBUG) { std::cout << "Alignment for scriptsPad " << x << " adjusted by " << scriptsPad << std::endl; } offset += scriptsPad; tableHeader.startOf_kRecognizedLangScripts = offset; offset += tableHeader.lengthOf_kRecognizedLangScripts; // null terminator already counted in initTableHeaders } // Now that we know exactly how much data we have written, store it in the // header as a sanity check header->totalFileSizeBytes = offset; if (DEBUG) { std::cout << "Data aligned." 
<< std::endl; std::cout << "Header size: " << headerSize << " bytes " << std::endl; std::cout << "Data size: " << (offset - totalPadding) << " bytes" << std::endl; std::cout << "Padding size: " << totalPadding << " bytes" << std::endl; std::cout << " cld_generated_CjkUni_obj: " << ( header->lengthOf_utf8PropObj_state_table + header->lengthOf_utf8PropObj_remap_string + header->lengthOf_utf8PropObj_fast_state) << " bytes " << std::endl; std::cout << " kAvgDeltaOctaScore: " << header->lengthOf_kAvgDeltaOctaScore << " bytes " << std::endl; std::cout << " kCjkCompat_obj: " << ( header->tableHeaders[0].lengthOf_kCLDTable + header->tableHeaders[0].lengthOf_kCLDTableInd + header->tableHeaders[0].lengthOf_kRecognizedLangScripts + 1) << " bytes " << std::endl; std::cout << " kCjkDeltaBi_obj: " << ( header->tableHeaders[1].lengthOf_kCLDTable + header->tableHeaders[1].lengthOf_kCLDTableInd + header->tableHeaders[1].lengthOf_kRecognizedLangScripts + 1) << " bytes " << std::endl; std::cout << " kDistinctBiTable_obj: " << ( header->tableHeaders[2].lengthOf_kCLDTable + header->tableHeaders[2].lengthOf_kCLDTableInd + header->tableHeaders[2].lengthOf_kRecognizedLangScripts + 1) << " bytes " << std::endl; std::cout << " kQuad_obj: " << ( header->tableHeaders[3].lengthOf_kCLDTable + header->tableHeaders[3].lengthOf_kCLDTableInd + header->tableHeaders[3].lengthOf_kRecognizedLangScripts + 1) << " bytes " << std::endl; std::cout << " kQuad_obj2: " << ( header->tableHeaders[4].lengthOf_kCLDTable + header->tableHeaders[4].lengthOf_kCLDTableInd + header->tableHeaders[4].lengthOf_kRecognizedLangScripts + 1) << " bytes " << std::endl; std::cout << " kDeltaOcta_obj: " << ( header->tableHeaders[5].lengthOf_kCLDTable + header->tableHeaders[5].lengthOf_kCLDTableInd + header->tableHeaders[5].lengthOf_kRecognizedLangScripts + 1) << " bytes " << std::endl; std::cout << " kDistinctOcta_obj: " << ( header->tableHeaders[6].lengthOf_kCLDTable + header->tableHeaders[6].lengthOf_kCLDTableInd + 
header->tableHeaders[6].lengthOf_kRecognizedLangScripts + 1) << " bytes " << std::endl; } } void initDeltaHeaders(CLD2DynamicData::FileHeader* header, const CLD2::uint32 deltaLength) { header->startOf_kAvgDeltaOctaScore = 0; header->lengthOf_kAvgDeltaOctaScore = deltaLength; } void initUtf8Headers(CLD2DynamicData::FileHeader* header, const CLD2::UTF8PropObj* utf8Object) { header->utf8PropObj_state0 = utf8Object->state0; header->utf8PropObj_state0_size = utf8Object->state0_size; header->utf8PropObj_total_size = utf8Object->total_size; header->utf8PropObj_max_expand = utf8Object->max_expand; header->utf8PropObj_entry_shift = utf8Object->entry_shift; header->utf8PropObj_bytes_per_entry = utf8Object->bytes_per_entry; header->utf8PropObj_losub = utf8Object->losub; header->utf8PropObj_hiadd = utf8Object->hiadd; header->lengthOf_utf8PropObj_state_table = utf8Object->total_size; header->lengthOf_utf8PropObj_remap_base = sizeof(CLD2::RemapEntry); // TODO: Can this ever have more than one entry? header->lengthOf_utf8PropObj_remap_string = strlen( reinterpret_cast(utf8Object->remap_string)) + 1; // note null terminator if (utf8Object->fast_state == NULL) { header->lengthOf_utf8PropObj_fast_state = 0; // not applicable } else { header->lengthOf_utf8PropObj_fast_state = strlen( reinterpret_cast(utf8Object->fast_state)) + 1; // note null terminator } } } // End namespace CLD2DynamicDataExtractor