lib/regexp-examples/parser.rb in regexp-examples-1.1.3 vs lib/regexp-examples/parser.rb in regexp-examples-1.1.4

- old
+ new

@@ -1,9 +1,26 @@ +require_relative 'parser_helpers/parse_group_helper' +require_relative 'parser_helpers/parse_after_backslash_group_helper' +require_relative 'parser_helpers/parse_multi_group_helper' +require_relative 'parser_helpers/parse_repeater_helper' +require_relative 'parser_helpers/charset_negation_helper' + +# :nodoc: module RegexpExamples IllegalSyntaxError = Class.new(StandardError) + # A Regexp parser, used to build a structured collection of objects that represents + # the regular expression. + # This object can then be used to generate strings that match the regular expression. class Parser + include ParseGroupHelper + include ParseAfterBackslashGroupHelper + include ParseMultiGroupHelper + include ParseRepeaterHelper + include CharsetNegationHelper + attr_reader :regexp_string + def initialize(regexp_string, regexp_options) @regexp_string = regexp_string @ignorecase = !(regexp_options & Regexp::IGNORECASE).zero? @multiline = !(regexp_options & Regexp::MULTILINE).zero? @extended = !(regexp_options & Regexp::EXTENDED).zero? @@ -25,313 +42,42 @@ private def parse_group(repeaters) case next_char when '(' - group = parse_multi_group + parse_multi_group when '[' - group = parse_char_group + parse_char_group when '.' - group = parse_dot_group + parse_dot_group when '|' - group = parse_or_group(repeaters) + parse_or_group(repeaters) when '\\' - group = parse_after_backslash_group + parse_after_backslash_group when '^' - group = parse_caret + parse_caret when '$' - group = parse_dollar + parse_dollar when /[#\s]/ - group = parse_extended_whitespace + parse_extended_whitespace else - group = parse_single_char_group(next_char) + parse_single_char_group(next_char) end - group end def parse_repeater(group) case next_char when '*' - repeater = parse_star_repeater(group) + parse_star_repeater(group) when '+' - repeater = parse_plus_repeater(group) + parse_plus_repeater(group) when '?' - repeater = parse_question_mark_repeater(group) + parse_question_mark_repeater(group) when '{' - repeater = parse_range_repeater(group) + parse_range_repeater(group) else - repeater = parse_one_time_repeater(group) + parse_one_time_repeater(group) end - repeater - end - - def parse_caret - if @current_position == 0 - return PlaceHolderGroup.new # Ignore the "illegal" character - else - raise_anchors_exception! - end - end - - def parse_dollar - if @current_position == (regexp_string.length - 1) - return PlaceHolderGroup.new # Ignore the "illegal" character - else - raise_anchors_exception! - end - end - - def parse_extended_whitespace - if @extended - skip_whitespace - group = PlaceHolderGroup.new # Ignore the whitespace/comment - else - group = parse_single_char_group(next_char) - end - group - end - - def skip_whitespace - whitespace_chars = rest_of_string.match(/#.*|\s+/)[0] - @current_position += whitespace_chars.length - 1 - end - - def parse_after_backslash_group - @current_position += 1 - case - when rest_of_string =~ /\A(\d{1,3})/ - @current_position += (Regexp.last_match(1).length - 1) # In case of 10+ backrefs! - group = parse_backreference_group(Regexp.last_match(1)) - when rest_of_string =~ /\Ak['<]([\w-]+)['>]/ # Named capture group - @current_position += (Regexp.last_match(1).length + 2) - group_id = if Regexp.last_match(1).to_i < 0 - # RELATIVE group number, e.g. /(a)(b)(c)(d) \k<-2>/ - @num_groups + Regexp.last_match(1).to_i + 1 - else - Regexp.last_match(1) - end - group = parse_backreference_group(group_id) - when BackslashCharMap.keys.include?(next_char) - group = CharGroup.new( - BackslashCharMap[next_char].dup, - @ignorecase - ) - when rest_of_string =~ /\A(c|C-)(.)/ # Control character - @current_position += Regexp.last_match(1).length - group = parse_single_char_group(parse_control_character(Regexp.last_match(2))) - when rest_of_string =~ /\Ax(\h{1,2})/ # Escape sequence - @current_position += Regexp.last_match(1).length - group = parse_single_char_group(parse_unicode_sequence(Regexp.last_match(1))) - when rest_of_string =~ /\Au(\h{4}|\{\h{1,4}\})/ # Unicode sequence - @current_position += Regexp.last_match(1).length - sequence = Regexp.last_match(1).match(/\h{1,4}/)[0] # Strip off "{" and "}" - group = parse_single_char_group(parse_unicode_sequence(sequence)) - when rest_of_string =~ /\A(p)\{(\^?)([^}]+)\}/i # Named properties - @current_position += (Regexp.last_match(2).length + # 0 or 1, of '^' is present - Regexp.last_match(3).length + # Length of the property name - 2) # Length of opening and closing brackets (always 2) - # Beware of double negatives! E.g. /\P{^Space}/ - is_negative = (Regexp.last_match(1) == 'P') ^ (Regexp.last_match(2) == '^') - group = CharGroup.new( - if is_negative - CharSets::Any.dup - NamedPropertyCharMap[Regexp.last_match(3).downcase] - else - NamedPropertyCharMap[Regexp.last_match(3).downcase] - end, - @ignorecase - ) - when next_char == 'K' # Keep (special lookbehind that CAN be supported safely!) - group = PlaceHolderGroup.new - when next_char == 'R' # Linebreak - group = CharGroup.new( - ["\r\n", "\n", "\v", "\f", "\r"], - @ignorecase - ) # Using "\r\n" as one character is little bit hacky... - when next_char == 'g' # Subexpression call - fail IllegalSyntaxError, - 'Subexpression calls (\\g) cannot be supported, as they are not regular' - when next_char =~ /[bB]/ # Anchors - raise_anchors_exception! - when next_char =~ /[AG]/ # Start of string - if @current_position == 1 - group = PlaceHolderGroup.new - else - raise_anchors_exception! - end - when next_char =~ /[zZ]/ # End of string - if @current_position == (regexp_string.length - 1) - # TODO: /\Z/ should be treated as /\n?/ - group = PlaceHolderGroup.new - else - raise_anchors_exception! - end - else - group = parse_single_char_group(next_char) - end - group - end - - def parse_multi_group - @current_position += 1 - @num_groups += 1 - remember_old_regexp_options do - group_id = nil # init - rest_of_string.match( - / - \A - (\?)? # Is it a "special" group, i.e. starts with a "?"? - ( - : # Non capture group - |! # Neglookahead - |= # Lookahead - |\# # Comment group - |< # Lookbehind or named capture - ( - ! # Neglookbehind - |= # Lookbehind - |[^>]+ # Named capture - ) - |[mix]*-?[mix]* # Option toggle - )? - /x - ) do |match| - case - when match[1].nil? # e.g. /(normal)/ - group_id = @num_groups.to_s - when match[2] == ':' # e.g. /(?:nocapture)/ - @current_position += 2 - when match[2] == '#' # e.g. /(?#comment)/ - comment_group = rest_of_string.match(/.*?[^\\](?:\\{2})*\)/)[0] - @current_position += comment_group.length - when match[2] =~ /\A(?=[mix-]+)([mix]*)-?([mix]*)/ # e.g. /(?i-mx)/ - regexp_options_toggle(Regexp.last_match(1), Regexp.last_match(2)) - @num_groups -= 1 # Toggle "groups" should not increase backref group count - @current_position += $&.length + 1 - if next_char == ':' # e.g. /(?i:subexpr)/ - @current_position += 1 - else - return PlaceHolderGroup.new - end - when %w(! =).include?(match[2]) # e.g. /(?=lookahead)/, /(?!neglookahead)/ - fail IllegalSyntaxError, - 'Lookaheads are not regular; cannot generate examples' - when %w(! =).include?(match[3]) # e.g. /(?<=lookbehind)/, /(?<!neglookbehind)/ - fail IllegalSyntaxError, - 'Lookbehinds are not regular; cannot generate examples' - else # e.g. /(?<name>namedgroup)/ - @current_position += (match[3].length + 3) - group_id = match[3] - end - end - MultiGroup.new(parse, group_id) - end - end - - def remember_old_regexp_options - previous_ignorecase = @ignorecase - previous_multiline = @multiline - previous_extended = @extended - group = yield - @ignorecase = previous_ignorecase - @multiline = previous_multiline - @extended = previous_extended - group - end - - def regexp_options_toggle(on, off) - regexp_option_toggle(on, off, '@ignorecase', 'i') - regexp_option_toggle(on, off, '@multiline', 'm') - regexp_option_toggle(on, off, '@extended', 'x') - end - - def regexp_option_toggle(on, off, var, char) - instance_variable_set(var, true) if on.include? char - instance_variable_set(var, false) if off.include? char - end - - def parse_char_group - @current_position += 1 # Skip past opening "[" - chargroup_parser = ChargroupParser.new(rest_of_string) - parsed_chars = chargroup_parser.result - @current_position += (chargroup_parser.length - 1) # Step back to closing "]" - CharGroup.new(parsed_chars, @ignorecase) - end - - def parse_dot_group - DotGroup.new(@multiline) - end - - def parse_or_group(left_repeaters) - @current_position += 1 - right_repeaters = parse - OrGroup.new(left_repeaters, right_repeaters) - end - - def parse_single_char_group(char) - SingleCharGroup.new(char, @ignorecase) - end - - def parse_backreference_group(group_id) - BackReferenceGroup.new(group_id) - end - - def parse_control_character(char) - (char.ord % 32).chr # Black magic! - # eval "?\\C-#{char.chr}" # Doesn't work for e.g. char = "?" - end - - def parse_unicode_sequence(match) - [match.to_i(16)].pack('U') - end - - def parse_star_repeater(group) - @current_position += 1 - parse_reluctant_or_possessive_repeater - StarRepeater.new(group) - end - - def parse_plus_repeater(group) - @current_position += 1 - parse_reluctant_or_possessive_repeater - PlusRepeater.new(group) - end - - def parse_reluctant_or_possessive_repeater - if next_char =~ /[?+]/ - # Don't treat these repeaters any differently when generating examples - @current_position += 1 - end - end - - def parse_question_mark_repeater(group) - @current_position += 1 - parse_reluctant_or_possessive_repeater - QuestionMarkRepeater.new(group) - end - - def parse_range_repeater(group) - match = rest_of_string.match(/\A\{(\d+)?(,)?(\d+)?\}/) - @current_position += match[0].size - min = match[1].to_i if match[1] - has_comma = !match[2].nil? - max = match[3].to_i if match[3] - repeater = RangeRepeater.new(group, min, has_comma, max) - parse_reluctant_or_possessive_range_repeater(repeater, min, has_comma, max) - end - - def parse_reluctant_or_possessive_range_repeater(repeater, min, has_comma, max) - # .{1}? should be equivalent to (?:.{1})?, i.e. NOT a "non-greedy quantifier" - if min && !has_comma && !max && next_char == '?' - repeater = parse_question_mark_repeater(repeater) - else - parse_reluctant_or_possessive_repeater - end - repeater - end - - def raise_anchors_exception! - fail IllegalSyntaxError, - "Anchors ('#{next_char}') cannot be supported, as they are not regular" end def parse_one_time_repeater(group) OneTimeRepeater.new(group) end