regexp.c in prism-1.0.0

- old
+ new

@@ -16,10 +16,16 @@
     const uint8_t *cursor;
 
     /** A pointer to the end of the source that we are parsing. */
     const uint8_t *end;
 
+    /**
+     * Whether or not the regular expression currently being parsed is in
+     * extended mode, wherein whitespace is ignored and comments are allowed.
+     */
+    bool extended_mode;
+
     /** Whether the encoding has changed from the default. */
     bool encoding_changed;
 
     /** The encoding of the source. */
     const pm_encoding_t *encoding;
@@ -417,10 +423,23 @@
 
     return false;
 }
 
 /**
+ * Return the state stored for the given option key, or false (0) if the key has no slot.
+ */
+static uint8_t
+pm_regexp_options_state(pm_regexp_options_t *options, uint8_t key) {
+    if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
+        key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM); // rebase the key to an index into options->values
+        return options->values[key]; // callers compare this against PM_REGEXP_OPTION_STATE_* values
+    }
+
+    return false; // key is outside the slot range; false converts to 0 in the uint8_t return
+}
+
+/**
  * Groups can have quite a few different patterns for syntax. They basically
  * just wrap a set of expressions, but they can potentially have options after a
  * question mark. If there _isn't_ a question mark, then it's just a set of
  * expressions. If there _is_, then here are the options:
  *
@@ -441,20 +460,20 @@
  */
 static bool
 pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
     const uint8_t *group_start = parser->cursor;
 
+    pm_regexp_options_t options;
+    pm_regexp_options_init(&options);
+
     // First, parse any options for the group.
     if (pm_regexp_char_accept(parser, '?')) {
         if (pm_regexp_char_is_eof(parser)) {
             pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
             return false;
         }
 
-        pm_regexp_options_t options;
-        pm_regexp_options_init(&options);
-
         switch (*parser->cursor) {
             case '#': { // inline comments
                 parser->cursor++;
                 if (pm_regexp_char_is_eof(parser)) {
                     pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
@@ -558,10 +577,22 @@
 
                 if (pm_regexp_char_is_eof(parser)) {
                     return false;
                 }
 
+                // If we are at the end of the group of options and there is no
+                // subexpression, then we are going to be setting the options
+                // for the parent group. In this case we are safe to return now.
+                if (*parser->cursor == ')') {
+                    if (pm_regexp_options_state(&options, 'x') == PM_REGEXP_OPTION_STATE_ADDED) {
+                        parser->extended_mode = true;
+                    }
+
+                    parser->cursor++;
+                    return true;
+                }
+
                 // If we hit a -, then we're done parsing options.
                 if (*parser->cursor != '-') break;
 
                 // Otherwise, fallthrough to the - case.
                 /* fallthrough */
@@ -575,27 +606,57 @@
                 }
 
                 if (pm_regexp_char_is_eof(parser)) {
                     return false;
                 }
+
+                // If we are at the end of the group of options and there is no
+                // subexpression, then we are going to be setting the options
+                // for the parent group. In this case we are safe to return now.
+                if (*parser->cursor == ')') {
+                    switch (pm_regexp_options_state(&options, 'x')) {
+                        case PM_REGEXP_OPTION_STATE_ADDED:
+                            parser->extended_mode = true;
+                            break;
+                        case PM_REGEXP_OPTION_STATE_REMOVED:
+                            parser->extended_mode = false;
+                            break;
+                    }
+
+                    parser->cursor++;
+                    return true;
+                }
+
                 break;
             default:
                 parser->cursor++;
                 pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "undefined group option");
                 break;
         }
     }
 
+    bool extended_mode = parser->extended_mode;
+    switch (pm_regexp_options_state(&options, 'x')) {
+        case PM_REGEXP_OPTION_STATE_ADDED:
+            parser->extended_mode = true;
+            break;
+        case PM_REGEXP_OPTION_STATE_REMOVED:
+            parser->extended_mode = false;
+            break;
+    }
+
     // Now, parse the expressions within this group.
     while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
         if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) {
+            parser->extended_mode = extended_mode;
             return false;
         }
         pm_regexp_char_accept(parser, '|');
     }
 
     // Finally, make sure we have a closing parenthesis.
+    parser->extended_mode = extended_mode;
     if (pm_regexp_char_expect(parser, ')')) return true;
 
     pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
     return false;
 }
@@ -639,10 +700,16 @@
             return true;
         case ')':
             parser->cursor++;
             pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "unmatched close parenthesis");
             return true;
+        case '#':
+            if (parser->extended_mode) {
+                if (!pm_regexp_char_find(parser, '\n')) parser->cursor = parser->end;
+                return true;
+            }
+        /* fallthrough */
         default: {
             size_t width;
             if (!parser->encoding_changed) {
                 width = pm_encoding_utf_8_char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
             } else {
@@ -700,15 +767,16 @@
 /**
  * Parse a regular expression and extract the names of all of the named capture
  * groups.
  */
 PRISM_EXPORTED_FUNCTION void
-pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) {
+pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) {
     pm_regexp_parse_pattern(&(pm_regexp_parser_t) {
         .parser = parser,
         .start = source,
         .cursor = source,
         .end = source + size,
+        .extended_mode = extended_mode,
         .encoding_changed = parser->encoding_changed,
         .encoding = parser->encoding,
         .name_callback = name_callback,
         .name_data = name_data,
         .error_callback = error_callback,