// Copyright 2014 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include #include #include #include #include #include #include #include #include "cld2_dynamic_data.h" #include "cld2_dynamic_data_loader.h" #include "integral_types.h" #include "cld2tablesummary.h" #include "utf8statetable.h" #include "scoreonescriptspan.h" namespace CLD2DynamicDataLoader { static int DEBUG=0; CLD2DynamicData::FileHeader* loadHeaderFromFile(const char* fileName) { FILE* inFile = fopen(fileName, "r"); if (inFile == NULL) { return NULL; } return loadInternal(inFile, NULL, -1); } CLD2DynamicData::FileHeader* loadHeaderFromRaw(const void* basePointer, const uint32_t length) { return loadInternal(NULL, basePointer, length); } #define CLD2_READINT(field) \ if (sourceIsFile) {\ bytesRead += 4 * fread(&(header->field), 4, 1, inFile);\ } else {\ memcpy(&(header->field), (((char*)(basePointer)) + bytesRead), 4);\ bytesRead += 4;\ } CLD2DynamicData::FileHeader* loadInternal(FILE* inFile, const void* basePointer, const uint32_t length) { const bool sourceIsFile = (inFile != NULL); int bytesRead = 0; CLD2DynamicData::FileHeader* header = new CLD2DynamicData::FileHeader; // TODO: force null-terminate char* strings for safety if (sourceIsFile) { bytesRead += fread(header->sanityString, 1, CLD2DynamicData::DATA_FILE_MARKER_LENGTH, inFile); } else { memcpy(header->sanityString, basePointer, CLD2DynamicData::DATA_FILE_MARKER_LENGTH); bytesRead += CLD2DynamicData::DATA_FILE_MARKER_LENGTH; } if (!CLD2DynamicData::mem_compare( header->sanityString, CLD2DynamicData::DATA_FILE_MARKER, CLD2DynamicData::DATA_FILE_MARKER_LENGTH)) { std::cerr << "Malformed header: bad file marker!" << std::endl; delete header; return NULL; } CLD2_READINT(totalFileSizeBytes); CLD2_READINT(utf8PropObj_state0); CLD2_READINT(utf8PropObj_state0_size); CLD2_READINT(utf8PropObj_total_size); CLD2_READINT(utf8PropObj_max_expand); CLD2_READINT(utf8PropObj_entry_shift); CLD2_READINT(utf8PropObj_bytes_per_entry); CLD2_READINT(utf8PropObj_losub); CLD2_READINT(utf8PropObj_hiadd); CLD2_READINT(startOf_utf8PropObj_state_table); CLD2_READINT(lengthOf_utf8PropObj_state_table); CLD2_READINT(startOf_utf8PropObj_remap_base); CLD2_READINT(lengthOf_utf8PropObj_remap_base); CLD2_READINT(startOf_utf8PropObj_remap_string); CLD2_READINT(lengthOf_utf8PropObj_remap_string); CLD2_READINT(startOf_utf8PropObj_fast_state); CLD2_READINT(lengthOf_utf8PropObj_fast_state); CLD2_READINT(startOf_kAvgDeltaOctaScore); CLD2_READINT(lengthOf_kAvgDeltaOctaScore); CLD2_READINT(numTablesEncoded); CLD2DynamicData::TableHeader* tableHeaders = new CLD2DynamicData::TableHeader[header->numTablesEncoded]; header->tableHeaders = tableHeaders; for (int x=0; xnumTablesEncoded; x++) { CLD2DynamicData::TableHeader *header = &(tableHeaders[x]); CLD2_READINT(kCLDTableSizeOne); CLD2_READINT(kCLDTableSize); CLD2_READINT(kCLDTableKeyMask); CLD2_READINT(kCLDTableBuildDate); CLD2_READINT(startOf_kCLDTable); CLD2_READINT(lengthOf_kCLDTable); CLD2_READINT(startOf_kCLDTableInd); CLD2_READINT(lengthOf_kCLDTableInd); CLD2_READINT(startOf_kRecognizedLangScripts); CLD2_READINT(lengthOf_kRecognizedLangScripts); } // Confirm header size is correct. int expectedHeaderSize = CLD2DynamicData::calculateHeaderSize(header->numTablesEncoded); if (expectedHeaderSize != bytesRead) { std::cerr << "Header size mismatch! Expected " << expectedHeaderSize << ", but read " << bytesRead << std::endl; delete header; delete tableHeaders; return NULL; } int actualSize = 0; if (sourceIsFile) { // Confirm file size is correct. fseek(inFile, 0, SEEK_END); actualSize = ftell(inFile); fclose(inFile); } else { actualSize = length; } if (actualSize != header->totalFileSizeBytes) { std::cerr << "File size mismatch! Expected " << header->totalFileSizeBytes << ", but found " << actualSize << std::endl; delete header; delete tableHeaders; return NULL; } return header; } void unloadDataFile(CLD2::ScoringTables** scoringTables, void** mmapAddress, uint32_t* mmapLength) { CLD2DynamicDataLoader::unloadDataRaw(scoringTables); munmap(*mmapAddress, *mmapLength); *mmapAddress = NULL; *mmapLength = 0; } void unloadDataRaw(CLD2::ScoringTables** scoringTables) { free(const_cast((*scoringTables)->unigram_obj)); (*scoringTables)->unigram_obj = NULL; delete((*scoringTables)->unigram_compat_obj); // tableSummaries[0] from loadDataFile (*scoringTables)->unigram_compat_obj = NULL; delete(*scoringTables); *scoringTables = NULL; } CLD2::ScoringTables* loadDataFile(const char* fileName, void** mmapAddressOut, uint32_t* mmapLengthOut) { CLD2DynamicData::FileHeader* header = loadHeaderFromFile(fileName); if (header == NULL) { return NULL; } // Initialize the memory map int inFileHandle = open(fileName, O_RDONLY); void* mapped = mmap(NULL, header->totalFileSizeBytes, PROT_READ, MAP_PRIVATE, inFileHandle, 0); // Record the map address. This allows callers to unmap *mmapAddressOut=mapped; *mmapLengthOut=header->totalFileSizeBytes; close(inFileHandle); return loadDataInternal(header, mapped, header->totalFileSizeBytes); } CLD2::ScoringTables* loadDataRaw(const void* basePointer, const uint32_t length) { CLD2DynamicData::FileHeader* header = loadHeaderFromRaw(basePointer, length); return loadDataInternal(header, basePointer, length); } CLD2::ScoringTables* loadDataInternal(CLD2DynamicData::FileHeader* header, const void* basePointer, const uint32_t length) { // 1. UTF8 Object const CLD2::uint8* state_table = static_cast(basePointer) + header->startOf_utf8PropObj_state_table; // FIXME: Unsafe to rely on this since RemapEntry is not a bit-packed structure const CLD2::RemapEntry* remap_base = reinterpret_cast( static_cast(basePointer) + header->startOf_utf8PropObj_remap_base); const CLD2::uint8* remap_string = static_cast(basePointer) + header->startOf_utf8PropObj_remap_string; const CLD2::uint8* fast_state = header->startOf_utf8PropObj_fast_state == 0 ? 0 : static_cast(basePointer) + header->startOf_utf8PropObj_fast_state; // Populate intermediate object. Horrible casting required because the struct // is all read-only integers, and doesn't have a constructor. Yikes. // TODO: It might actually be less horrible to memcpy the data in const CLD2::UTF8PropObj* unigram_obj = reinterpret_cast(malloc(sizeof(CLD2::UTF8PropObj))); *const_cast(&unigram_obj->state0) = header->utf8PropObj_state0; *const_cast(&unigram_obj->state0_size) = header->utf8PropObj_state0_size; *const_cast(&unigram_obj->total_size) = header->utf8PropObj_total_size; *const_cast(&unigram_obj->max_expand) = header->utf8PropObj_max_expand; *const_cast(&unigram_obj->entry_shift) = header->utf8PropObj_entry_shift; *const_cast(&unigram_obj->bytes_per_entry) = header->utf8PropObj_bytes_per_entry; *const_cast(&unigram_obj->losub) = header->utf8PropObj_losub; *const_cast(&unigram_obj->hiadd) = header->utf8PropObj_hiadd; *const_cast(&unigram_obj->state_table) = state_table; *const_cast(&unigram_obj->remap_base) = remap_base; *const_cast(&unigram_obj->remap_string) = remap_string; *const_cast(&unigram_obj->fast_state) = fast_state; // 2. kAvgDeltaOctaScore array const short* read_kAvgDeltaOctaScore = reinterpret_cast( static_cast(basePointer) + header->startOf_kAvgDeltaOctaScore); // 3. Each table CLD2::CLD2TableSummary* tableSummaries = new CLD2::CLD2TableSummary[header->numTablesEncoded]; for (int x=0; xnumTablesEncoded; x++) { CLD2::CLD2TableSummary &summary = tableSummaries[x]; CLD2DynamicData::TableHeader& tHeader = header->tableHeaders[x]; const CLD2::IndirectProbBucket4* kCLDTable = reinterpret_cast( static_cast(basePointer) + tHeader.startOf_kCLDTable); const CLD2::uint32* kCLDTableInd = reinterpret_cast( static_cast(basePointer) + tHeader.startOf_kCLDTableInd); const char* kRecognizedLangScripts = static_cast(basePointer) + tHeader.startOf_kRecognizedLangScripts; summary.kCLDTable = kCLDTable; summary.kCLDTableInd = kCLDTableInd; summary.kCLDTableSizeOne = tHeader.kCLDTableSizeOne; summary.kCLDTableSize = tHeader.kCLDTableSize; summary.kCLDTableKeyMask = tHeader.kCLDTableKeyMask; summary.kCLDTableBuildDate = tHeader.kCLDTableBuildDate; summary.kRecognizedLangScripts = kRecognizedLangScripts; } // Tie everything together CLD2::ScoringTables* result = new CLD2::ScoringTables; result->unigram_obj = unigram_obj; result->unigram_compat_obj = &tableSummaries[0]; result->deltabi_obj = &tableSummaries[1]; result->distinctbi_obj = &tableSummaries[2]; result->quadgram_obj = &tableSummaries[3]; result->quadgram_obj2 = &tableSummaries[4]; result->deltaocta_obj = &tableSummaries[5]; result->distinctocta_obj = &tableSummaries[6]; result->kExpectedScore = read_kAvgDeltaOctaScore; delete header->tableHeaders; delete header; return result; } } // namespace CLD2DynamicDataLoader