src/util/pm_strpbrk.c in prism-0.22.0 vs src/util/pm_strpbrk.c in prism-0.23.0

- old
+ new

@@ -1,35 +1,60 @@ #include "prism/util/pm_strpbrk.h" /** - * This is the slow path that does care about the encoding. + * Add an invalid multibyte character error to the parser. */ +static inline void +pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { + pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start); +} + +/** + * This is the default path. + */ static inline const uint8_t * -pm_strpbrk_multi_byte(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) { +pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { size_t index = 0; while (index < maximum) { if (strchr((const char *) charset, source[index]) != NULL) { return source + index; } - size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)); - if (width == 0) { - return NULL; - } + if (source[index] < 0x80) { + index++; + } else { + size_t width = pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)); - index += width; + if (width > 0) { + index += width; + } else if (!validate) { + index++; + } else { + // At this point we know we have an invalid multibyte character. + // We'll walk forward as far as we can until we find the next + // valid character so that we don't spam the user with a ton of + // the same kind of error. + const size_t start = index; + + do { + index++; + } while (index < maximum && pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); + + pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index); + } + } } return NULL; } /** - * This is the fast path that does not care about the encoding. + * This is the path when the encoding is ASCII-8BIT. */ static inline const uint8_t * -pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t maximum) { +pm_strpbrk_ascii_8bit(const uint8_t *source, const uint8_t *charset, size_t maximum) { size_t index = 0; while (index < maximum) { if (strchr((const char *) charset, source[index]) != NULL) { return source + index; @@ -40,10 +65,89 @@ return NULL; } /** + * This is the slow path that does care about the encoding. + */ +static inline const uint8_t * +pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { + size_t index = 0; + + while (index < maximum) { + if (strchr((const char *) charset, source[index]) != NULL) { + return source + index; + } + + if (source[index] < 0x80) { + index++; + } else { + size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)); + + if (width > 0) { + index += width; + } else if (!validate) { + index++; + } else { + // At this point we know we have an invalid multibyte character. + // We'll walk forward as far as we can until we find the next + // valid character so that we don't spam the user with a ton of + // the same kind of error. + const size_t start = index; + + do { + index++; + } while (index < maximum && parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); + + pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index); + } + } + } + + return NULL; +} + +/** + * This is the fast path that does not care about the encoding because we know + * the encoding only supports single-byte characters. + */ +static inline const uint8_t * +pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { + size_t index = 0; + + while (index < maximum) { + if (strchr((const char *) charset, source[index]) != NULL) { + return source + index; + } + + if (source[index] < 0x80 || !validate) { + index++; + } else { + size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)); + + if (width > 0) { + index += width; + } else { + // At this point we know we have an invalid multibyte character. + // We'll walk forward as far as we can until we find the next + // valid character so that we don't spam the user with a ton of + // the same kind of error. + const size_t start = index; + + do { + index++; + } while (index < maximum && parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); + + pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index); + } + } + } + + return NULL; +} + +/** * Here we have rolled our own version of strpbrk. The standard library strpbrk * has undefined behavior when the source string is not null-terminated. We want * to support strings that are not null-terminated because pm_parse does not * have the contract that the string is null-terminated. (This is desirable * because it means the extension can call pm_parse with the result of a call to @@ -55,18 +159,22 @@ * within strings, comments, regular expressions, etc. So we need to be able to * skip past them. * * Finally, we want to support encodings wherein the charset could contain * characters that are trailing bytes of multi-byte characters. For example, in - * Shift-JIS, the backslash character can be a trailing byte. In that case we + * Shift_JIS, the backslash character can be a trailing byte. In that case we * need to take a slower path and iterate one multi-byte character at a time. */ const uint8_t * -pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) { +pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate) { if (length <= 0) { return NULL; - } else if (parser->encoding_changed && parser->encoding->multibyte) { - return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length); + } else if (!parser->encoding_changed) { + return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate); + } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) { + return pm_strpbrk_ascii_8bit(source, charset, (size_t) length); + } else if (parser->encoding->multibyte) { + return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate); } else { - return pm_strpbrk_single_byte(source, charset, (size_t) length); + return pm_strpbrk_single_byte(parser, source, charset, (size_t) length, validate); } }