src/regexp.c in prism-0.30.0 vs src/regexp.c in prism-1.0.0

- old
+ new

@@ -16,10 +16,16 @@ const uint8_t *cursor; /** A pointer to the end of the source that we are parsing. */ const uint8_t *end; + /** + * Whether or not the regular expression currently being parsed is in + * extended mode, wherein whitespace is ignored and comments are allowed. + */ + bool extended_mode; + /** Whether the encoding has changed from the default. */ bool encoding_changed; /** The encoding of the source. */ const pm_encoding_t *encoding; @@ -417,10 +423,23 @@ return false; } /** + * Look up the state recorded in the options for the given key. The key is + * translated to a slot index by subtracting + * PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM, and the value stored in that slot is + * returned as-is. Keys outside the inclusive range + * PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM..PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM + * have no slot and yield false (0). + * + * The result is a state value rather than a boolean: callers in this file + * compare it against PM_REGEXP_OPTION_STATE_ADDED and + * PM_REGEXP_OPTION_STATE_REMOVED. + * + * NOTE(review): the return type is uint8_t, so the fallback `false` is the + * integer 0 -- confirm that 0 does not collide with a meaningful state value. + */ +static uint8_t +pm_regexp_options_state(pm_regexp_options_t *options, uint8_t key) { + if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) { + key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM); + return options->values[key]; + } + + return false; +} + +/** * Groups can have quite a few different patterns for syntax. They basically * just wrap a set of expressions, but they can potentially have options after a * question mark. If there _isn't_ a question mark, then it's just a set of * expressions. If there _is_, then here are the options: * @@ -441,20 +460,20 @@ */ static bool pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) { const uint8_t *group_start = parser->cursor; + pm_regexp_options_t options; + pm_regexp_options_init(&options); + // First, parse any options for the group. 
if (pm_regexp_char_accept(parser, '?')) { if (pm_regexp_char_is_eof(parser)) { pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group"); return false; } - pm_regexp_options_t options; - pm_regexp_options_init(&options); - switch (*parser->cursor) { case '#': { // inline comments parser->cursor++; if (pm_regexp_char_is_eof(parser)) { pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group"); @@ -558,10 +577,22 @@ if (pm_regexp_char_is_eof(parser)) { return false; } + // If we are at the end of the group of options and there is no + // subexpression, then we are going to be setting the options + // for the parent group. In this case we are safe to return now. + if (*parser->cursor == ')') { + if (pm_regexp_options_state(&options, 'x') == PM_REGEXP_OPTION_STATE_ADDED) { + parser->extended_mode = true; + } + + parser->cursor++; + return true; + } + // If we hit a -, then we're done parsing options. if (*parser->cursor != '-') break; // Otherwise, fallthrough to the - case. /* fallthrough */ @@ -575,27 +606,57 @@ } if (pm_regexp_char_is_eof(parser)) { return false; } + + // If we are at the end of the group of options and there is no + // subexpression, then we are going to be setting the options + // for the parent group. In this case we are safe to return now. 
+ if (*parser->cursor == ')') { + switch (pm_regexp_options_state(&options, 'x')) { + case PM_REGEXP_OPTION_STATE_ADDED: + parser->extended_mode = true; + break; + case PM_REGEXP_OPTION_STATE_REMOVED: + parser->extended_mode = false; + break; + } + + parser->cursor++; + return true; + } + break; default: parser->cursor++; pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "undefined group option"); break; } } + bool extended_mode = parser->extended_mode; + switch (pm_regexp_options_state(&options, 'x')) { + case PM_REGEXP_OPTION_STATE_ADDED: + parser->extended_mode = true; + break; + case PM_REGEXP_OPTION_STATE_REMOVED: + parser->extended_mode = false; + break; + } + // Now, parse the expressions within this group. while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') { if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) { + parser->extended_mode = extended_mode; return false; } pm_regexp_char_accept(parser, '|'); } // Finally, make sure we have a closing parenthesis. + parser->extended_mode = extended_mode; if (pm_regexp_char_expect(parser, ')')) return true; pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis"); return false; } @@ -639,10 +700,16 @@ return true; case ')': parser->cursor++; pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "unmatched close parenthesis"); return true; + case '#': + if (parser->extended_mode) { + if (!pm_regexp_char_find(parser, '\n')) parser->cursor = parser->end; + return true; + } + /* fallthrough */ default: { size_t width; if (!parser->encoding_changed) { width = pm_encoding_utf_8_char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor)); } else { @@ -700,15 +767,16 @@ /** * Parse a regular expression and extract the names of all of the named capture * groups. 
*/ PRISM_EXPORTED_FUNCTION void -pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) { +pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) { pm_regexp_parse_pattern(&(pm_regexp_parser_t) { .parser = parser, .start = source, .cursor = source, .end = source + size, + .extended_mode = extended_mode, .encoding_changed = parser->encoding_changed, .encoding = parser->encoding, .name_callback = name_callback, .name_data = name_data, .error_callback = error_callback,