/////////////////////////////////////////////////////////////////////// // File: unicharmap.cpp // Description: Unicode character/ligature to integer id class. // Author: Thomas Kielbus // Created: Wed Jun 28 17:05:01 PDT 2006 // // (C) Copyright 2006, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // /////////////////////////////////////////////////////////////////////// #include #include "unichar.h" #include "host.h" #include "unicharmap.h" UNICHARMAP::UNICHARMAP() : nodes(0) { } UNICHARMAP::~UNICHARMAP() { if (nodes != 0) delete[] nodes; } // Search the given unichar representation in the tree. Each character in the // string is interpreted as an index in an array of nodes. UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const { const char* current_char = unichar_repr; UNICHARMAP_NODE* current_nodes = nodes; assert(*unichar_repr != '\0'); do { if (*(current_char + 1) == '\0') return current_nodes[static_cast(*current_char)].id; current_nodes = current_nodes[static_cast(*current_char)].children; ++current_char; } while (true); } // Search the given unichar representation in the tree, using length characters // from it maximum. Each character in the string is interpreted as an index in // an array of nodes. UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr, int length) const { const char* current_char = unichar_repr; UNICHARMAP_NODE* current_nodes = nodes; assert(*unichar_repr != '\0'); assert(length > 0 && length <= UNICHAR_LEN); do { if (length == 1 || *(current_char + 1) == '\0') return current_nodes[static_cast(*current_char)].id; current_nodes = current_nodes[static_cast(*current_char)].children; ++current_char; --length; } while (true); } // Search the given unichar representation in the tree, creating the possibly // missing nodes. Once the right place has been found, insert the given id and // update the inserted flag to keep track of the insert. Each character in the // string is interpreted as an index in an array of nodes. void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) { const char* current_char = unichar_repr; UNICHARMAP_NODE** current_nodes_pointer = &nodes; assert(*unichar_repr != '\0'); assert(id >= 0); do { if (*current_nodes_pointer == 0) *current_nodes_pointer = new UNICHARMAP_NODE[256]; if (*(current_char + 1) == '\0') { (*current_nodes_pointer) [static_cast(*current_char)].id = id; return; } current_nodes_pointer = &((*current_nodes_pointer) [static_cast(*current_char)].children); ++current_char; } while (true); } // Search the given unichar representation in the tree. Each character in the // string is interpreted as an index in an array of nodes. Stop once the tree // does not have anymore nodes or once we found the right unichar_repr. bool UNICHARMAP::contains(const char* const unichar_repr) const { const char* current_char = unichar_repr; UNICHARMAP_NODE* current_nodes = nodes; assert(*unichar_repr != '\0'); while (current_nodes != 0 && *(current_char + 1) != '\0') { current_nodes = current_nodes[static_cast(*current_char)].children; ++current_char; } return current_nodes != 0 && *(current_char + 1) == '\0' && current_nodes[static_cast(*current_char)].id >= 0; } // Search the given unichar representation in the tree, using length characters // from it maximum. Each character in the string is interpreted as an index in // an array of nodes. Stop once the tree does not have anymore nodes or once we // found the right unichar_repr. bool UNICHARMAP::contains(const char* const unichar_repr, int length) const { const char* current_char = unichar_repr; UNICHARMAP_NODE* current_nodes = nodes; assert(*unichar_repr != '\0'); assert(length > 0 && length <= UNICHAR_LEN); while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) { current_nodes = current_nodes[static_cast(*current_char)].children; --length; ++current_char; } return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') && current_nodes[static_cast(*current_char)].id >= 0; } // Return the minimum number of characters that must be used from this string // to obtain a match in the UNICHARMAP. int UNICHARMAP::minmatch(const char* const unichar_repr) const { const char* current_char = unichar_repr; UNICHARMAP_NODE* current_nodes = nodes; while (current_nodes != NULL && *current_char != '\0') { if (current_nodes[static_cast(*current_char)].id >= 0) return current_char + 1 - unichar_repr; current_nodes = current_nodes[static_cast(*current_char)].children; ++current_char; } return 0; } void UNICHARMAP::clear() { if (nodes != 0) { delete[] nodes; nodes = 0; } } UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() : children(0), id(-1) { } // Recursively delete the children UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() { if (children != 0) { delete[] children; } }