// Copyright 2010 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Author: jdtang@google.com (Jonathan Tang) #include "utf8.h" #include #include #include #include // For strncasecmp. #include "error.h" #include "gumbo.h" #include "parser.h" #include "util.h" #include "vector.h" const int kUtf8ReplacementChar = 0xFFFD; // Reference material: // Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description // RFC 3629: http://tools.ietf.org/html/rfc3629 // HTML5 Unicode handling: // http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling // Adds a decoding error to the parser's error list, based on the current state // of the Utf8Iterator. static void add_error(Utf8Iterator* iter, GumboErrorType type) { GumboParser* parser = iter->_parser; GumboError* error = gumbo_add_error(parser); if (!error) { return; } error->type = type; error->position = iter->_pos; error->original_text = iter->_start; // At the point the error is recorded, the code point hasn't been computed // yet (and can't be, because it's invalid), so we need to build up the raw // hex value from the bytes under the cursor. uint64_t code_point = 0; for (int i = 0; i < iter->_width; ++i) { code_point = (code_point << 8) | (unsigned char) iter->_start[i]; } error->v.codepoint = code_point; } // Reads the next UTF-8 character in the iter. // This assumes that iter->_start points to the beginning of the character. // When this method returns, iter->_width and iter->_current will be set // appropriately, as well as any error flags. static void read_char(Utf8Iterator* iter) { unsigned char c; unsigned char mask = '\0'; int is_bad_char = false; c = (unsigned char) *iter->_start; if (c < 0x80) { // Valid one-byte sequence. iter->_width = 1; mask = 0xFF; } else if (c < 0xC0) { // Continuation character not following a multibyte sequence. // The HTML5 spec here says to consume the byte and output a replacement // character. iter->_width = 1; is_bad_char = true; } else if (c < 0xE0) { iter->_width = 2; mask = 0x1F; // 00011111 in binary. if (c < 0xC2) { // Overlong encoding; error according to UTF8/HTML5 spec. is_bad_char = true; } } else if (c < 0xF0) { iter->_width = 3; mask = 0xF; // 00001111 in binary. } else if (c < 0xF5) { iter->_width = 4; mask = 0x7; // 00000111 in binary. } else if (c < 0xF8) { // The following cases are all errors, but we need to handle them separately // so that we consume the proper number of bytes from the input stream // before replacing them with the replacement char. The HTML5 spec // specifies that we should consume the shorter of the length specified by // the first bit or the run leading up to the first non-continuation // character. iter->_width = 5; is_bad_char = true; } else if (c < 0xFC) { iter->_width = 6; is_bad_char = true; } else if (c < 0xFE) { iter->_width = 7; is_bad_char = true; } else { iter->_width = 1; is_bad_char = true; } // Check to make sure we have enough bytes left in the iter to read all that // we want. If not, we set the iter_truncated flag, mark this as a bad // character, and adjust the current width so that it consumes the rest of the // iter. uint64_t code_point = c & mask; if (iter->_start + iter->_width > iter->_end) { iter->_width = iter->_end - iter->_start; add_error(iter, GUMBO_ERR_UTF8_TRUNCATED); is_bad_char = true; } // Now we decode continuation bytes, shift them appropriately, and build up // the appropriate code point. assert(iter->_width < 8); for (int i = 1; i < iter->_width; ++i) { c = (unsigned char) iter->_start[i]; if (c < 0x80 || c > 0xBF) { // Per HTML5 spec, we don't include the invalid continuation char in the // run that we consume here. iter->_width = i; is_bad_char = true; break; } code_point = (code_point << 6) | (c & ~0x80); } if (code_point > 0x10FFFF) is_bad_char = true; // If we had a decode error, set the current code point to the replacement // character and flip the flag indicating that a decode error occurred. // Ditto if we have a code point that is explicitly on the list of characters // prohibited by the HTML5 spec, such as control characters. if (is_bad_char || utf8_is_invalid_code_point(code_point)) { add_error(iter, GUMBO_ERR_UTF8_INVALID); code_point = kUtf8ReplacementChar; } // This is the special handling for carriage returns that is mandated by the // HTML5 spec. Since we're looking for particular 7-bit literal characters, // we operate in terms of chars and only need a check for iter overrun, // instead of having to read in a full next code point. // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream if (code_point == '\r') { const char* next = iter->_start + iter->_width; if (next < iter->_end && *next == '\n') { // Advance the iter, as if the carriage return didn't exist. ++iter->_start; // Preserve the true offset, since other tools that look at it may be // unaware of HTML5's rules for converting \r into \n. ++iter->_pos.offset; } code_point = '\n'; } // At this point, we know we have a valid character as the code point, so we // set it, and we're done. iter->_current = code_point; } static void update_position(Utf8Iterator* iter) { iter->_pos.offset += iter->_width; if (iter->_current == '\n') { ++iter->_pos.line; iter->_pos.column = 1; } else if(iter->_current == '\t') { int tab_stop = iter->_parser->_options->tab_stop; iter->_pos.column = ((iter->_pos.column / tab_stop) + 1) * tab_stop; } else { ++iter->_pos.column; } } // Returns true if this Unicode code point is in the list of characters // forbidden by the HTML5 spec, such as undefined control chars. bool utf8_is_invalid_code_point(int c) { return (c >= 0x1 && c <= 0x8) || c == 0xB || (c >= 0xE && c <= 0x1F) || (c >= 0x7F && c <= 0x9F) || (c >= 0xFDD0 && c <= 0xFDEF) || ((c & 0xFFFF) == 0xFFFE) || ((c & 0xFFFF) == 0xFFFF); } void utf8iterator_init( GumboParser* parser, const char* source, size_t source_length, Utf8Iterator* iter) { iter->_start = source; iter->_end = source + source_length; iter->_width = 0; iter->_pos.line = 1; iter->_pos.column = 1; iter->_pos.offset = 0; iter->_parser = parser; if (source_length) { read_char(iter); } else { iter->_current = -1; } } void utf8iterator_next(Utf8Iterator* iter) { if (iter->_current == -1) { // If we're already at EOF, bail out before advancing anything to avoid // reading past the end of the buffer. It's easier to catch this case here // than litter the code with lots of individual checks for EOF. return; } iter->_start += iter->_width; // We update positions based on the *last* character read, so that the first // character following a newline is at column 1 in the next line. update_position(iter); if (iter->_start < iter->_end) { read_char(iter); } else { // EOF iter->_current = -1; } } int utf8iterator_current(const Utf8Iterator* iter) { return iter->_current; } void utf8iterator_get_position( const Utf8Iterator* iter, GumboSourcePosition* output) { *output = iter->_pos; } const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) { return iter->_start; } bool utf8iterator_maybe_consume_match( Utf8Iterator* iter, const char* prefix, size_t length, bool case_sensitive) { bool matched = (iter->_start + length <= iter->_end) && (case_sensitive ? !strncmp(iter->_start, prefix, length) : !strncasecmp(iter->_start, prefix, length)); if (matched) { for (int i = 0; i < length; ++i) { utf8iterator_next(iter); } return true; } else { return false; } } void utf8iterator_mark(Utf8Iterator* iter) { iter->_mark = iter->_start; iter->_mark_pos = iter->_pos; } // Returns the current input stream position to the mark. void utf8iterator_reset(Utf8Iterator* iter) { iter->_start = iter->_mark; iter->_pos = iter->_mark_pos; read_char(iter); } // Sets the position and original text fields of an error to the value at the // mark. void utf8iterator_fill_error_at_mark( Utf8Iterator* iter, GumboError* error) { error->position = iter->_mark_pos; error->original_text = iter->_mark; }