pm_strpbrk.c in prism-0.30.0

- old
+ new

@@ -7,10 +7,31 @@
 pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
     pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start);
 }
 
 /**
+ * Set the explicit encoding for the parser to the current encoding.
+ *
+ * When parser->explicit_encoding is NULL or already equal to parser->encoding,
+ * no diagnostic is produced. When it is PM_ENCODING_UTF_8_ENTRY and differs
+ * from parser->encoding, a PM_ERR_MIXED_ENCODING diagnostic spanning
+ * [source, source + width) is appended to parser->error_list, formatted with
+ * the name of the current encoding. Any other value trips the assertion. On
+ * return, parser->explicit_encoding equals parser->encoding.
+ *
+ * @param parser The parser whose explicit_encoding field is checked and set.
+ * @param source The first byte of the range reported by the diagnostic.
+ * @param width The length in bytes of the range reported by the diagnostic.
+ */
+static inline void
+pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, const uint8_t *source, size_t width) {
+    if (parser->explicit_encoding != NULL) {
+        if (parser->explicit_encoding == parser->encoding) {
+            // Okay, we already locked to this encoding.
+        } else if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
+            // Not okay, we already found a Unicode escape sequence and this
+            // conflicts.
+            pm_diagnostic_list_append_format(&parser->error_list, source, source + width, PM_ERR_MIXED_ENCODING, parser->encoding->name);
+        } else {
+            // Should not be anything else.
+            assert(false && "unreachable");
+        }
+    }
+
+    parser->explicit_encoding = parser->encoding; // runs on every path that reaches here, including after the diagnostic
+}
+
+/**
  * This is the default path.
  */
 static inline const uint8_t *
 pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
     size_t index = 0;
@@ -50,18 +71,19 @@
 
 /**
  * This is the path when the encoding is ASCII-8BIT.
+ *
+ * Scans source[0 .. maximum) one byte at a time and returns a pointer to the
+ * first byte that strchr finds in charset, or NULL when no such byte exists.
+ * charset is handed to strchr, so it must be NUL-terminated; since strchr also
+ * matches the terminator itself, a 0x00 byte in source counts as a match.
+ *
+ * When validate is true, every non-matching byte >= 0x80 that is stepped over
+ * triggers pm_strpbrk_explicit_encoding_set with a width of 1.
+ *
+ * NOTE(review): that call passes source, not source + index, so the range
+ * handed over always begins at the first byte of the scanned buffer rather
+ * than at the high byte that triggered it -- confirm this is intended.
  */
 static inline const uint8_t *
-pm_strpbrk_ascii_8bit(const uint8_t *source, const uint8_t *charset, size_t maximum) {
+pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
     size_t index = 0;
 
     while (index < maximum) {
         if (strchr((const char *) charset, source[index]) != NULL) {
             return source + index;
         }
 
+        if (validate && source[index] >= 0x80) pm_strpbrk_explicit_encoding_set(parser, source, 1);
         index++;
     }
 
     return NULL;
 }
@@ -70,20 +92,22 @@
  * This is the slow path that does care about the encoding.
  */
 static inline const uint8_t *
 pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
     size_t index = 0;
+    const pm_encoding_t *encoding = parser->encoding;
 
     while (index < maximum) {
         if (strchr((const char *) charset, source[index]) != NULL) {
             return source + index;
         }
 
         if (source[index] < 0x80) {
             index++;
         } else {
-            size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
+            size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
+            if (validate) pm_strpbrk_explicit_encoding_set(parser, source, width);
 
             if (width > 0) {
                 index += width;
             } else if (!validate) {
                 index++;
@@ -94,11 +118,11 @@
                 // the same kind of error.
                 const size_t start = index;
 
                 do {
                     index++;
-                } while (index < maximum && parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
+                } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
 
                 pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
             }
         }
     }
@@ -111,20 +135,22 @@
  * the encoding only supports single-byte characters.
  */
 static inline const uint8_t *
 pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
     size_t index = 0;
+    const pm_encoding_t *encoding = parser->encoding;
 
     while (index < maximum) {
         if (strchr((const char *) charset, source[index]) != NULL) {
             return source + index;
         }
 
         if (source[index] < 0x80 || !validate) {
             index++;
         } else {
-            size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
+            size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
+            pm_strpbrk_explicit_encoding_set(parser, source, width);
 
             if (width > 0) {
                 index += width;
             } else {
                 // At this point we know we have an invalid multibyte character.
@@ -133,11 +159,11 @@
                 // the same kind of error.
                 const size_t start = index;
 
                 do {
                     index++;
-                } while (index < maximum && parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
+                } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
 
                 pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
             }
         }
     }
@@ -169,10 +195,10 @@
     if (length <= 0) {
         return NULL;
     } else if (!parser->encoding_changed) {
         return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate);
     } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
-        return pm_strpbrk_ascii_8bit(source, charset, (size_t) length);
+        return pm_strpbrk_ascii_8bit(parser, source, charset, (size_t) length, validate);
     } else if (parser->encoding->multibyte) {
         return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate);
     } else {
         return pm_strpbrk_single_byte(parser, source, charset, (size_t) length, validate);
     }